diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 83fe9af768964003130d02b7d913ad1c2102dd1d..59661c9c1da53a2ddac0127ed1827fedde811a1d 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -31,6 +31,3 @@
     -   id: go-fmt
         types:
         - go
-    -   id: gometalinter
-        types:
-        - go
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4921226ec1c90a969fa1cfc383823820500c7757..264420ad830ed39b38f1918951d8d66c84fd5ee9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -86,6 +86,14 @@ if(ANDROID OR IOS)
         "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
     set(WITH_MKLML OFF CACHE STRING
         "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+
+    # Compile PaddlePaddle mobile inference library
+    if (NOT WITH_C_API)
+        set(WITH_C_API ON CACHE STRING
+            "Always compile the C_API when cross-compiling for Android and iOS" FORCE)
+    endif()
+    set(MOBILE_INFERENCE ON)
+    add_definitions(-DPADDLE_MOBILE_INFERENCE)
 endif()
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -97,6 +105,12 @@ if (WITH_C_API AND WITH_PYTHON)
     "different Python interpreter from compiling.")
 endif()
 
+if(MOBILE_INFERENCE)
+    set(THIRD_PARTY_BUILD_TYPE MinSizeRel)
+else()
+    set(THIRD_PARTY_BUILD_TYPE Release)
+endif()
+
 ########################################################################################
 
 include(external/mklml)     # download mklml package
@@ -113,6 +127,7 @@ include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)    # download pybind11
+include(external/nccl)
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@@ -145,7 +160,7 @@ set(EXTERNAL_LIBS
 if(WITH_GPU)
     list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
     if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 
@@ -160,9 +175,11 @@ endif(USE_NNPACK)
 
 add_subdirectory(proto)
 
-# "add_subdirectory(go)" should be placed after the following loine,
-# because it depends on paddle/optimizer.
-add_subdirectory(paddle/optimizer)
+if(NOT MOBILE_INFERENCE)
+    # "add_subdirectory(go)" should be placed after the following line,
+    # because it depends on paddle/optimizer.
+    add_subdirectory(paddle/optimizer)
+endif()
 
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
diff --git a/Dockerfile b/Dockerfile
index 136db772cc6a24b8084120fa6bab666bc1eda78e..150344a8116e2be9b5bab8e5fdcc9c37f4025020 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -22,7 +22,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 
 RUN apt-get update && \
     apt-get install -y \
-    git python-pip python-dev openssh-server bison  \
+    git python-pip python-dev openssh-server bison libnccl-dev \
     wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
     python-matplotlib gcc-4.8 g++-4.8 \
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 51c3b918cc4ef4cf6c8052ccc14028a872309fcf..24ddb24399dabeec9b8e5faf36be3eb21f420111 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,10 @@ if(WITH_DOUBLE)
     add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
 
+if(WITH_TESTING)
+    add_definitions(-DPADDLE_WITH_TESTING)
+endif(WITH_TESTING)
+
 if(NOT WITH_TIMER)
     add_definitions(-DPADDLE_DISABLE_TIMER)
 endif(NOT WITH_TIMER)
@@ -49,19 +53,20 @@ if(NOT WITH_GOLANG)
 endif(NOT WITH_GOLANG)
 
 if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
 
     list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
 else()
+    add_definitions(-DPADDLE_WITH_CUDA)
+
     FIND_PACKAGE(CUDA REQUIRED)
 
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+        message(FATAL_ERROR "Paddle needs CUDA >= 7.0 to compile")
     endif()
 
     if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
+        message(FATAL_ERROR "Paddle needs cudnn to compile")
     endif()
 
     set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index f7483f6be9169eb58f0148cd3a956a8c881e1fe3..96fc886a342cae38d5b804266d3af7bc909a4da2 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -8,7 +8,7 @@ ExternalProject_Add(
     extern_eigen3
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         "master"
+    GIT_TAG         70661066beef694cadf6c304d0d07e0758825c10
     PREFIX          ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index 957f8271e4841836956b0c3f2cf3d8c88a31192a..c819eb4d70898e48eab499c666168d78262d4240 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -36,6 +36,7 @@ ExternalProject_Add(
     # change this back to the official Github repo once my PR is
     # merged.
     GIT_REPOSITORY  "https://github.com/wangkuiyi/gflags.git"
+    GIT_TAG         986964c07427ecb9cdb5bd73f73ebbd40e54dadb
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -45,11 +46,11 @@ ExternalProject_Add(
                     -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
 ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index b3fef738ccc0b5886bb0a32501bb7b7adade0ff1..08bdc1e1623b0d917061c7368e9b2a8f7e9517fd 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -31,6 +31,7 @@ ExternalProject_Add(
     ${EXTERNAL_PROJECT_LOG_ARGS}
     DEPENDS gflags
     GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_TAG         v0.3.5
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -43,12 +44,12 @@ ExternalProject_Add(
                     -DWITH_GFLAGS=ON
                     -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
                     -DBUILD_TESTING=OFF
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
                      -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 6a2a79b7631b32e8a099797de509af64533bbb95..5a4aa7a5b71a4fdfd556a46037e6d1846d668fc4 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -56,11 +56,11 @@ IF(WITH_TESTING)
                         -DBUILD_GMOCK=ON
                         -Dgtest_disable_pthreads=ON
                         -Dgtest_force_shared_crt=ON
-                        -DCMAKE_BUILD_TYPE=Release
+                        -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                         ${EXTERNAL_OPTIONAL_ARGS}
         CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
                          -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                         -DCMAKE_BUILD_TYPE:STRING=Release
+                         -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
     )
 
     ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
diff --git a/cmake/external/nccl.cmake b/cmake/external/nccl.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..57d2c0a352507afd01d1cbf2c7b23c00ff7ad81b
--- /dev/null
+++ b/cmake/external/nccl.cmake
@@ -0,0 +1,65 @@
+# Fetch NVIDIA NCCL as an external project and expose it as the `nccl` target.
+# The NCCL sources are always downloaded and their src/ directory is added to
+# the include path. Only when WITH_DSO is off is NCCL also built with make and
+# its static library imported from NCCL_INSTALL_DIR.
+include(ExternalProject)
+
+set(NCCL_SOURCE_DIR ${THIRD_PARTY_PATH}/nccl)
+
+include_directories(${NCCL_SOURCE_DIR}/src/extern_nccl/src)
+
+if(WITH_DSO)
+  # If we use DSO, we do not build nccl, just download the dependencies
+  set(NCCL_BUILD_COMMAND "")
+  set(NCCL_INSTALL_COMMAND "")
+  set(NCCL_INSTALL_DIR "")
+else()
+  # otherwise, we build nccl and link it.
+  set(NCCL_INSTALL_DIR ${THIRD_PARTY_PATH}/install/nccl)
+  # Note: cuda 8.0 is needed to make nccl
+  # When cuda is not installed on the system directory, need to set CUDA_HOME to your cuda root
+  #
+  # The commands are stored as CMake lists (one element per argument). A single
+  # quoted string such as "make -j 8" would reach ExternalProject as one
+  # argument and be executed as a program literally named "make -j 8".
+  set(NCCL_BUILD_COMMAND make -j 8)
+  set(NCCL_INSTALL_COMMAND make install PREFIX=${NCCL_INSTALL_DIR})
+endif()
+
+ExternalProject_Add(
+    extern_nccl
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/NVIDIA/nccl.git"
+    GIT_TAG         "v1.3.4-1"
+    PREFIX          "${NCCL_SOURCE_DIR}"
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    # There is no configure step, so nothing is generated in the separate build
+    # directory; run make in the source tree instead.
+    BUILD_IN_SOURCE   1
+    # Quoted on purpose: an empty value (WITH_DSO) stays an explicit empty
+    # command, while a list value is split into separate arguments.
+    BUILD_COMMAND     "${NCCL_BUILD_COMMAND}"
+    INSTALL_COMMAND   "${NCCL_INSTALL_COMMAND}"
+    INSTALL_DIR       "${NCCL_INSTALL_DIR}"
+    TEST_COMMAND      ""
+)
+
+if(WITH_DSO)
+  # No NCCL binary is attached to the target in this mode; `nccl` only exists
+  # so that other targets can depend on the download step.
+  if(CMAKE_VERSION VERSION_LESS "3.3.0")
+    # add_dependencies() accepts INTERFACE libraries only since CMake 3.3.
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_nccl_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_nccl = \"${dummyfile}\";")
+    add_library(nccl STATIC ${dummyfile})
+  else()
+    add_library(nccl INTERFACE)
+  endif()
+else()
+  add_library(nccl STATIC IMPORTED GLOBAL)
+  set_property(TARGET nccl PROPERTY IMPORTED_LOCATION
+               ${NCCL_INSTALL_DIR}/lib/libnccl_static.a)
+endif()
+
+add_dependencies(nccl extern_nccl)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 7cf7ba85cca4c248dcc74e078124c0b3815ee380..be7f6a9465970711170bd15dcecaadeaa8a55f86 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -191,12 +191,12 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
             ${OPTIONAL_ARGS}
             -Dprotobuf_BUILD_TESTS=OFF
             -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
             -DCMAKE_INSTALL_LIBDIR=lib
         CMAKE_CACHE_ARGS
             -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
-            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
             -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
             -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
             ${OPTIONAL_CACHE_ARGS}
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index bb258c7b5581fc22b44f4fe15c119f8081f4767e..8bd058222880b4df3b08da09c02f9fe7f1d0ee66 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -35,6 +35,7 @@ ExternalProject_Add(
     extern_warpctc
     ${EXTERNAL_PROJECT_LOG_ARGS}
     GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    GIT_TAG         b63a0644654a3e0ed624c85a1767bc8193aead09
     PREFIX          ${WARPCTC_SOURCES_DIR}
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
@@ -48,9 +49,9 @@ ExternalProject_Add(
                     -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
                     -DBUILD_SHARED=ON
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     ${EXTERNAL_OPTIONAL_ARGS}
-    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                      -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index c496a52b780364f3014f8fa3dfbc944a7aa7430e..e2c9fe56f335ae5b627b4d8d4bb17e4a2a466677 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -42,11 +42,11 @@ ExternalProject_Add(
                     -DBUILD_SHARED_LIBS=OFF
                     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
                     -DCMAKE_MACOSX_RPATH=ON
-                    -DCMAKE_BUILD_TYPE=Release
+                    -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
                     ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
                      -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-                     -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
 )
 
 LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index ff9868fc4e0d970b11e4763d2e0c8581f4f85907..c311783aa3187678c31c27ddbbd074790ca444f3 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -389,13 +389,60 @@ function(go_test TARGET_NAME)
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction(go_test)
 
+# paddle_protobuf_generate_cpp(<proto_srcs> <proto_hdrs> <proto_files>)
+#
+# Modification of standard 'protobuf_generate_cpp()' with protobuf-lite support.
+# For each proto file a custom command is added that runs protoc and writes
+# <name>.pb.cc and <name>.pb.h into CMAKE_CURRENT_BINARY_DIR. The generated
+# paths are returned in the variables named by SRCS and HDRS. When
+# MOBILE_INFERENCE is set, "lite:" is prepended to the --cpp_out value.
+function(paddle_protobuf_generate_cpp SRCS HDRS)
+  if(NOT ARGN)
+    message(SEND_ERROR "Error: paddle_protobuf_generate_cpp() called without any proto files")
+    return()
+  endif()
+
+  # Begin with empty result lists in this function's scope.
+  set(${SRCS})
+  set(${HDRS})
+
+  # Prefix of the --cpp_out value: "lite:" for MOBILE_INFERENCE, else empty.
+  set(cpp_out_prefix "")
+  if(MOBILE_INFERENCE)
+    set(cpp_out_prefix "lite:")
+  endif()
+
+  foreach(proto_file ${ARGN})
+    get_filename_component(proto_abs_path ${proto_file} ABSOLUTE)
+    get_filename_component(proto_stem ${proto_file} NAME_WE)
+    set(generated_src "${CMAKE_CURRENT_BINARY_DIR}/${proto_stem}.pb.cc")
+    set(generated_hdr "${CMAKE_CURRENT_BINARY_DIR}/${proto_stem}.pb.h")
+    list(APPEND ${SRCS} "${generated_src}")
+    list(APPEND ${HDRS} "${generated_hdr}")
+
+    add_custom_command(
+      OUTPUT "${generated_src}" "${generated_hdr}"
+      COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}"
+      COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} -I${CMAKE_CURRENT_SOURCE_DIR}
+              --cpp_out "${cpp_out_prefix}${CMAKE_CURRENT_BINARY_DIR}" ${proto_abs_path}
+      DEPENDS ${proto_abs_path} protoc
+      COMMENT "Running C++ protocol buffer compiler on ${proto_file}"
+      VERBATIM)
+  endforeach()
+
+  set_source_files_properties(${${SRCS}} ${${HDRS}} PROPERTIES GENERATED TRUE)
+  set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+  set(${HDRS} ${${HDRS}} PARENT_SCOPE)
+endfunction()
+
+
 function(proto_library TARGET_NAME)
   set(oneValueArgs "")
   set(multiValueArgs SRCS DEPS)
   cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   set(proto_srcs)
   set(proto_hdrs)
-  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
+  paddle_protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
   cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()
 
diff --git a/cmake/util.cmake b/cmake/util.cmake
index d1aee3e170a2d143ac06b438725e907e96f041c8..117ab7f49cdf4a568cd203b2b17767643d0b2d50 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -73,25 +73,43 @@ function(link_paddle_exe TARGET_NAME)
         generate_rdma_links()
     endif()
 
-    target_circle_link_libraries(${TARGET_NAME}
-        ARCHIVE_START
-        paddle_gserver
-        paddle_function
-        ARCHIVE_END
-        paddle_pserver
-        paddle_trainer_lib
-        paddle_network
-        paddle_math
-        paddle_utils
-        paddle_parameter
-        paddle_proto
-        paddle_cuda
-        paddle_optimizer
-        ${EXTERNAL_LIBS}
-        ${CMAKE_THREAD_LIBS_INIT}
-        ${CMAKE_DL_LIBS}
-        ${RDMA_LD_FLAGS}
-        ${RDMA_LIBS})
+    if(MOBILE_INFERENCE)
+        target_circle_link_libraries(${TARGET_NAME}
+            ARCHIVE_START
+            paddle_gserver
+            paddle_function
+            ARCHIVE_END
+            paddle_math
+            paddle_utils
+            paddle_parameter
+            paddle_proto
+            paddle_cuda
+            ${EXTERNAL_LIBS}
+            ${CMAKE_THREAD_LIBS_INIT}
+            ${CMAKE_DL_LIBS}
+            ${RDMA_LD_FLAGS}
+            ${RDMA_LIBS})
+    else()
+        target_circle_link_libraries(${TARGET_NAME}
+            ARCHIVE_START
+            paddle_gserver
+            paddle_function
+            ARCHIVE_END
+            paddle_pserver
+            paddle_trainer_lib
+            paddle_network
+            paddle_math
+            paddle_utils
+            paddle_parameter
+            paddle_proto
+            paddle_cuda
+            paddle_optimizer
+            ${EXTERNAL_LIBS}
+            ${CMAKE_THREAD_LIBS_INIT}
+            ${CMAKE_DL_LIBS}
+            ${RDMA_LD_FLAGS}
+            ${RDMA_LIBS})
+    endif()
 
     if(ANDROID)
         target_link_libraries(${TARGET_NAME} log)
diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..cf146dc088e3905a751ff55c26fd82ef0ba02c89 100644
--- a/doc/api/v1/index_cn.rst
+++ b/doc/api/v1/index_cn.rst
@@ -21,7 +21,7 @@ Model Config API
     trainer_config_helpers/optimizers.rst
     trainer_config_helpers/data_sources.rst
     trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/activations.rst
     trainer_config_helpers/poolings.rst
     trainer_config_helpers/networks.rst
     trainer_config_helpers/evaluators.rst
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index c94627a72806fa2eca77c79da24f7f3ca18f0259..d4e9d53e5c0955912a594fe8cd9cd41a4080a2d2 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -345,6 +345,11 @@ clip
 ..  autoclass:: paddle.v2.layer.clip
     :noindex:
 
+resize
+------
+..  autoclass:: paddle.v2.layer.resize
+    :noindex:
+
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst
index 6e813ab1a820d068ea3e54cad6178f1cf928eadc..048379cf01f4aec5e73e2fe3ddfa728f3c17a5d1 100644
--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,3 +125,8 @@ simple_attention
     :members: simple_attention
     :noindex:
 
+dot_product_attention
+---------------------
+..  automodule:: paddle.v2.networks
+    :members: dot_product_attention
+    :noindex:
diff --git a/doc/design/block.md b/doc/design/block.md
index be8800122035984df281692fc40009c397565046..4066122c0e8dfa33776796c3d205ba5aec9e0f52 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -5,12 +5,12 @@
 Both deep learning systems and programming languages help users describe computation procedures.  These systems use various representations of computation:
 
 - Caffe, Torch, and Paddle: sequences of layers.
-- TensorFlow, Caffe2, Mxnet: graphs of operators.
+- TensorFlow, Caffe2, Mxnet: graph of operators.
 - PaddlePaddle: nested blocks, like C++ and Java programs.
 
 ## Block in Programming Languages and Deep Learning
 
-In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions, or operators.
+In programming languages, a block is a pair of curly braces that includes local variables definitions and a sequence of instructions or operators.
 
 Blocks work with control flow structures like `if`, `else`, and `for`, which have equivalents in deep learning:
 
@@ -24,14 +24,14 @@ A key difference is that a C++ program describes a one pass computation, whereas
 
 ## Stack Frames and the Scope Hierarchy
 
-The existence of the backward makes the execution of a block of traditional programs and PaddlePaddle different to each other:
+The existence of the backward pass makes the execution of a block in PaddlePaddle different from that in traditional programs:
 
-| programming languages | PaddlePaddle                  |
-|-----------------------|-------------------------------|
-| stack                 | scope hierarchy               |
-| stack frame           | scope                         |
-| push at entering block| push at entering block        |
-| pop at leaving block  | destroy at minibatch completes|
+| programming languages | PaddlePaddle                    |
+|-----------------------|---------------------------------|
+| stack                 | scope hierarchy                 |
+| stack frame           | scope                           |
+| push at entering block| push at entering block          |
+| pop at leaving block  | destroy when minibatch completes|
 
 1. In traditional programs:
 
@@ -42,9 +42,9 @@ The existence of the backward makes the execution of a block of traditional prog
 1. In PaddlePaddle
 
    - When the execution enters a block, PaddlePaddle adds a new scope, where it realizes variables.
-   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are to be used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
+   - PaddlePaddle doesn't pop a scope after the execution of the block because variables therein are used by the backward pass.  So it has a stack forest known as a *scope hierarchy*.
    - The height of the highest tree is the maximum depth of nested blocks.
-   - After the process of a minibatch, PaddlePaddle destroys the scope hierarchy.
+   - After the processing of a minibatch, PaddlePaddle destroys the scope hierarchy.
 
 ## Use Blocks in C++ and PaddlePaddle Programs
 
@@ -55,17 +55,23 @@ Let us consolidate the discussion by presenting some examples.
 The following C++ programs shows how blocks are used with the `if-else` structure:
 
 ```c++
+namespace pd = paddle;
+
 int x = 10;
-int y = 20;
-int out;
+int y = 1;
+int z = 10;
 bool cond = false;
+int o1, o2;
 if (cond) {
   int z = x + y;
-  out = softmax(z);
+  o1 = z;
+  o2 = pd::layer::softmax(z);
 } else {
-  int z = fc(x);
-  out = z;
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
 }
+
 ```
 
 An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator](./if_else_op.md) is as follows:
@@ -73,57 +79,55 @@ An equivalent PaddlePaddle program from the design doc of the [IfElseOp operator
 ```python
 import paddle as pd
 
-x = var(10)
-y = var(20)
-cond = var(false)
-ie = pd.create_ifelseop(inputs=[x], output_num=1)
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
 with ie.true_block():
-    x = ie.inputs(true, 0)
-    z = operator.add(x, y)
-    ie.set_output(true, 0, operator.softmax(z))
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
 with ie.false_block():
-    x = ie.inputs(false, 0)
-    z = layer.fc(x)
-    ie.set_output(true, 0, operator.softmax(z))
-out = b(cond)
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
 ```
 
-In both examples, the left branch computes `softmax(x+y)` and the right branch computes `fc(x)`.
+In both examples, the left branch computes `x+y` and `softmax(x+y)`, and the right branch computes `fc(z)` and `fc(z)+1`.
+
+The difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.
 
-A difference is that variables in the C++ program contain scalar values, whereas those in the PaddlePaddle programs are mini-batches of instances.  The `ie.input(true, 0)` invocation returns instances in the 0-th input, `x`, that corresponds to true values in `cond` as the local variable `x`, where `ie.input(false, 0)` returns instances corresponding to false values.
 
 ### Blocks with `for` and `RNNOp`
 
-The following RNN model from the [RNN design doc](./rnn.md)
+The following RNN model in PaddlePaddle from the [RNN design doc](./rnn.md):
 
 ```python
-x = sequence([10, 20, 30])
-m = var(0)
-W = tensor()
-U = tensor()
-
-rnn = create_rnn(inputs=[input])
-with rnn.stepnet() as net:
-  x = net.set_inputs(0)
-  h = net.add_memory(init=m)
-  fc_out = pd.matmul(W, x)
-  hidden_out = pd.matmul(U, h.pre(n=1))
-  sum = pd.add_two(fc_out, hidden_out)
-  act = pd.sigmoid(sum)
-  h.update(act)                       # update memory with act
-  net.set_outputs(0, act, hidden_out) # two outputs
-
+x = sequence([10, 20, 30]) # shape=[None, 1]
+m = var(0) # shape=[1]
+W = var(0.314, param=true) # shape=[1]
+U = var(0.375, param=true) # shape=[1]
+
+rnn = pd.rnn()
+with rnn.step():
+  h = rnn.memory(init = m)
+  h_prev = rnn.previous_memory(h)
+  a = layer.fc(W, x)
+  b = layer.fc(U, h_prev)  
+  s = pd.add(a, b)
+  act = pd.sigmoid(s)
+  rnn.update_memory(h, act)
+  rnn.output(a, b)
 o1, o2 = rnn()
-print o1, o2
 ```
-
 has its equivalent C++ program as follows
 
 ```c++
 int* x = {10, 20, 30};
-int m = 0;
-int W = some_value();
-int U = some_other_value();
+int* m = {0};
+int* W = {0.314};
+int* U = {0.375};
 
 int mem[sizeof(x) / sizeof(x[0]) + 1];
 int o1[sizeof(x) / sizeof(x[0]) + 1];
@@ -131,25 +135,21 @@ int o2[sizeof(x) / sizeof(x[0]) + 1];
 for (int i = 1; i <= sizeof(x)/sizeof(x[0]); ++i) {
   int x = x[i-1];
   if (i == 1) mem[0] = m;
-  int fc_out = W * x;
-  int hidden_out = Y * mem[i-1];
-  int sum = fc_out + hidden_out;
+  int a = W * x;
+  int b = U * mem[i-1];
+  int s = a + b;
   int act = sigmoid(sum);
   mem[i] = act;
   o1[i] = act;
   o2[i] = hidden_out;
 }
-
-print_array(o1);
-print_array(o2);
 ```
 
-
 ## Compilation and Execution
 
-Like TensorFlow programs, a PaddlePaddle program is written in Python.  The first part describes a neural network as a protobuf message, and the rest part executes the message for training or inference.
+Like TensorFlow, a PaddlePaddle program is written in Python. The first part describes a neural network as a protobuf message, and the rest executes the message for training or inference.
 
-The generation of this protobuf message is like what a compiler generates a binary executable file.  The execution of the message that the OS executes the binary file.
+The generation of this protobuf message is similar to how a compiler generates a binary executable file. The execution of the message is similar to how the OS executes the binary file.
 
 ## The "Binary Executable File Format"
 
@@ -186,10 +186,10 @@ Also, the RNN operator in above example is serialized into a protobuf message of
 
 ```
 OpDesc {
-  inputs = {0} // the index of x
-  outputs = {5, 3} // indices of act and hidden_out
+  inputs = {0} // the index of x in vars of BlockDesc above
+  outputs = {5, 3} // indices of act and hidden_out in vars of BlockDesc above
   attrs {
-    "memories" : {1} // the index of h
+    "states" : {1} // the index of h
     "step_net" : <above step net>
   }
 };
@@ -203,32 +203,32 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
 During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
 
 VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example
+A child block's name scope should inherit the parent's so that an OpDesc in the child block can reference a VarDesc that is stored in the parent block. For example:
 
 ```python
-a = pd.Varaible(shape=[20, 20])
+a = pd.Variable(shape=[20, 20])
 b = pd.fc(a, params=["fc.w", "fc.b"])
 
 rnn = pd.create_rnn()
-with rnn.stepnet() as net:
-    x = net.set_inputs(a)
+with rnn.stepnet():
+    x = a.as_step_input()
     # reuse fc's parameter
     fc_without_b = pd.get_variable("fc.w")
-    net.set_outputs(fc_without_b)
+    rnn.output(fc_without_b)
 
 out = rnn()
 ```
-the method `pd.get_variable` can help retrieve a Variable by a name, a Variable may store in a parent block, but might be retrieved in a child block, so block should have a variable scope that supports inheritance.
+The method `pd.get_variable` retrieves a Variable by name. The Variable may be stored in a parent block but retrieved in a child block, so a block should have a variable scope that supports inheritance.
 
 In compiler design, the symbol table is a data structure created and maintained by compilers to store information about the occurrence of various entities such as variable names, function names, classes, etc.
 
 To store the definition of variables and operators, we define a C++ class `SymbolTable`, like the one used in compilers.
 
-`SymbolTable` can do the following stuff:
+`SymbolTable` can do the following:
 
 - store the definitions (some names and attributes) of variables and operators,
-- to verify if a variable was declared,
-- to make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
+- verify if a variable was declared,
+- make it possible to implement type checking (offer Protobuf message pointers to `InferShape` handlers).
 
 
 ```c++
@@ -240,19 +240,18 @@ class SymbolTable {
 
   OpDesc* NewOp(const string& name="");
 
-  // TODO determine whether name is generated by python or C++
-  // currently assume that a unique name will be generated by C++ if the
-  // argument name left default.
-  VarDesc* NewVar(const string& name="");
+  // TODO determine whether name is generated by python or C++.
+  // Currently assume that a unique name will be generated by C++ if the
+  // argument name is left default.
+  VarDesc* Var(const string& name="");
 
-  // find a VarDesc by name, if recursive true, find parent's SymbolTable
+  // find a VarDesc by name, if recursive is true, find parent's SymbolTable
   // recursively.
   // this interface is introduced to support InferShape, find protobuf messages
   // of variables and operators, pass pointers into InferShape.
-  // operator
   //
   // NOTE maybe some C++ classes such as VarDescBuilder and OpDescBuilder should
-  // be proposed and embedded into pybind to enable python operate on C++ pointers.
+  // be proposed and embedded into pybind to enable python operation on C++ pointers.
   VarDesc* FindVar(const string& name, bool recursive=true);
 
   OpDesc* FindOp(const string& name);
@@ -270,7 +269,7 @@ class SymbolTable {
 After all the description of variables and operators is added into SymbolTable,
 the block has enough information to run.
 
-The `Block` class takes a `BlockDesc` as input, and provide `Run` and `InferShape` functions.
+The `Block` class takes a `BlockDesc` as input, and provides `Run` and `InferShape` functions.
 
 
 ```c++
@@ -302,7 +301,7 @@ public:
   void CreateVariables(const framework::Scope& scope);
   void CreateOperators();
 
-  // some other necessary interfaces of NetOp are list below
+  // some other necessary interfaces of NetOp are listed below
   // ...
 
 private:
@@ -316,15 +315,14 @@ private:
 Block inherits from OperatorBase, which has a Run method.
 Block's Run method will run its operators sequentially.
 
-There is another important interface called `Eval`, which take some arguments called targets, and generate a minimal graph which takes targets as the end points and creates a new Block,
-after `Run`, `Eval` will get the latest value and return the targets.
+There is another important interface called `Eval`, which takes some arguments called targets and generates a minimal graph which treats targets as the end points and creates a new Block. After `Run`, `Eval` will get the latest value and return the targets.
 
 The definition of Eval is as follows:
 
 ```c++
 // clean a block description by targets using the corresponding dependency graph.
 // return a new BlockDesc with minimal number of operators.
-// NOTE not return a Block but the block's description so that this can be distributed
+// NOTE: The return type is not a Block but the block's description so that this can be distributed
 // to a cluster.
 BlockDesc Prune(const BlockDesc& desc, vector<string> targets);
 
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle
index 42384a3f059966e22e22f5fa4295cc9ead5cef83..43415ed8cf61a5acfa34f8e56b9577f338dbf254 100644
Binary files a/doc/design/cluster_train/src/trainer.graffle and b/doc/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/design/dcgan.png b/doc/design/dcgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e8e290a111ff43900934341365cb4360d87d28
Binary files /dev/null and b/doc/design/dcgan.png differ
diff --git a/doc/design/executor.md b/doc/design/executor.md
new file mode 100644
index 0000000000000000000000000000000000000000..b5fb6c5c3c1da3c112ce63878322083dd5c42b70
--- /dev/null
+++ b/doc/design/executor.md
@@ -0,0 +1,23 @@
+# Executor Design Doc
+
+## Motivation
+
+We use executor to do the runtime evaluation of a `ProgramDesc`.
+
+## Overview
+
+An executor takes a `ProgramDesc`, a `block_id` and a `Scope`.  The `ProgramDesc` is a list of blocks and each block contains the protobuf definition of all the parameters and operators. The `block_id` specifies the entrance block. The `Scope` is the container of all the variable instances, and it persists across different runs.
+
+### What does executor do?
+
+It evaluates all the operators in the `block_id`th block of a `ProgramDesc`.
+
+### What does executor NOT do?
+
+It does not do runtime optimization, meaning it does not intelligently parse the dependencies of each op and choose which ops to run and in which order they should be run.
+
+It does not do graph partitioning, meaning dividing the `ProgramDesc` into several small pieces and executing them on different devices.
+
+## Implementation
+
+`Executor` evaluates a `ProgramDesc`. Essentially, it instantiates Variables and Operators, then runs all the operators in sequence. [[code]](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)
diff --git a/doc/design/gan_api.md b/doc/design/gan_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..fb41df8615f73d9fd4c32995eab265833eac1a55
--- /dev/null
+++ b/doc/design/gan_api.md
@@ -0,0 +1,253 @@
+# Design for GAN
+
+GAN (Generative Adversarial Net [https://arxiv.org/abs/1406.2661]) is an important model for unsupervised learning and is widely used in many areas.
+
+It applies several important concepts in machine learning system design, including building and running subgraphs, dependency tracing, different optimizers in one executor and so forth.
+
+In our GAN design, we wrap it as a user-friendly, easily customizable Python API for designing different models. We take the conditional DC-GAN (Unsupervised Representation Learning with Deep Convolutional Generative Adversarial Networks [https://arxiv.org/abs/1511.06434]) as an example due to its good performance on image generation.
+
+<p align="center">
+<img src="./test.dot.png" width = "35%" align="center"/><br/>
+Figure 1. The overall running logic of GAN. The black solid arrows indicate the forward pass; the green dashed arrows indicate the backward pass of generator training; the red dashed arrows indicate the backward pass of the discriminator training. The BP pass of the green (red) arrow should only update the parameters in the green (red) boxes. The diamonds indicate the data providers. d\_loss and g\_loss marked in red and green are the two targets we would like to run.
+</p>
+
+The operators, layers and functions that are required or optional for building a GAN demo are summarized in https://github.com/PaddlePaddle/Paddle/issues/4563.
+
+<p align="center">
+<img src="./dcgan.png" width = "90%" align="center"/><br/>
+Figure 2. Photo borrowed from the original DC-GAN paper.
+</p>
+
+## The Conditional-GAN might be a class. 
+In this design, we adopt the popular open source design in https://github.com/carpedm20/DCGAN-tensorflow and https://github.com/rajathkmp/DCGAN. It contains the following data structure:
+
+- DCGAN(object): which contains everything required to build a GAN model. It provides the following member functions as its API:
+
+- __init__(...): Initialize hyper-parameters (like conv dimension and so forth), and declare model parameters of discriminator and generator as well.
+
+- generator(z, y=None): Generate a fake image from input noise z. If the label y is provided, the conditional GAN model will be chosen.
+Returns a generated image.
+
+- discriminator(image):
+Given an image, decide if it is from a real source or a fake one. 
+Returns a 0/1 binary label.
+
+- build_model(self):
+Build the whole GAN model and define the training losses for both the generator and the discriminator.
+
+## Discussion on Engine Functions required to build GAN
+- Trace the tensor and variable dependency in the engine executor. (Very critical, otherwise GAN can't be trained correctly.)
+- Different optimizers responsible for optimizing different losses.
+
+To be more detailed, we introduce our design of DCGAN as follows:
+
+### Class member Function: Initializer
+- Set up hyper-parameters, including conditional dimension, noise dimension, batch size and so forth.
+- Declare and define all the model variables. All the discriminator parameters are included in the list self.theta_D and all the generator parameters are included in the list self.theta_G.
+```python
+class DCGAN(object):
+  def __init__(self, y_dim=None):
+  
+    # hyper parameters  
+    self.y_dim = y_dim # conditional gan or not
+    self.batch_size = 100
+    self.z_dim = z_dim # input noise dimension
+
+    # define parameters of discriminators
+    self.D_W0 = pd.Variable(shape=[3,3, 1, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.D_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.D_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.D_b2 = pd.Variable(np.zeros(128))
+    self.theta_D = [self.D_W0, self.D_b0, self.D_W1, self.D_b1, self.D_W2, self.D_b2]
+
+    # define parameters of generators
+    self.G_W0 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b0 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W1 = pd.Variable(shape=[784, 128], data=pd.gaussian_normal_randomizer())
+    self.G_b1 = pd.Variable(np.zeros(128)) # variable also support initialization using a  numpy data
+    self.G_W2 = pd.Varialble(np.random.rand(128, 1))
+    self.G_b2 = pd.Variable(np.zeros(128))
+    self.theta_G = [self.G_W0, self.G_b0, self.G_W1, self.G_b1, self.G_W2, self.G_b2]
+```
+
+### Class member Function: Generator
+- Given a noisy input z, returns a fake image.
+- Concatenation, batch-norm, FC operations required;
+- Deconv layer required, which is missing now...
+```python
+class DCGAN(object):
+  def generator(self, z, y = None):
+    # input z: the random noise
+    # input y: input data label (optional)
+    # output G_im: generated fake images
+    
+    if not self.y_dim:
+      z = pd.layer.concat(1, [z, y])
+      
+    G_h0 = pd.layer.fc(z, self.G_w0, self.G_b0)
+    G_h0_bn = pd.layer.batch_norm(G_h0)
+    G_h0_relu = pd.layer.relu(G_h0_bn)
+    
+    G_h1 = pd.layer.deconv(G_h0_relu, self.G_w1, self.G_b1)
+    G_h1_bn = pd.layer.batch_norm(G_h1)
+    G_h1_relu = pd.layer.relu(G_h1_bn)
+    
+    G_h2 = pd.layer.deconv(G_h1_relu, self.G_W2, self.G_b2))
+    G_im = pd.layer.tanh(G_im)
+    return G_im
+```
+
+### Class member function: Discriminator
+- Given an image (either a generated one or a real one), returns the binary logit of its real/fake label.
+- Concatenation, Convolution, batch-norm, FC, Leaky-ReLU operations required;
+```python
+class DCGAN(object):
+  def discriminator(self, image):
+    # input image: either generated images or real ones
+    # output D_h2: binary logit of the label
+
+    D_h0 = pd.layer.conv2d(image, w=self.D_w0, b=self.D_b0)
+    D_h0_bn = pd.layer.batchnorm(h0)
+    D_h0_relu = pd.layer.lrelu(h0_bn)
+    
+    D_h1 = pd.layer.conv2d(D_h0_relu, w=self.D_w1, b=self.D_b1)
+    D_h1_bn = pd.layer.batchnorm(D_h1)
+    D_h1_relu = pd.layer.lrelu(D_h1_bn)
+    
+    D_h2 = pd.layer.fc(D_h1_relu, w=self.D_w2, b=self.D_b2)
+    return D_h2
+```
+
+### Class member function: Build the model
+- Define data readers as placeholders to hold the data;
+- Build generator and discriminators;
+- Define two training losses for discriminator and generator, respectively. 
+If we have an execution dependency engine to back-trace all tensors, the module building our GAN model will look like this:
+```python
+class DCGAN(object):
+  def build_model(self):
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+    
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_t = self.discriminator(self.images)
+        # generated fake images
+        self.sampled = self.sampler(self.z, self.y)
+        self.D_f = self.discriminator(self.G)
+    else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_t = self.discriminator(self.images)
+        # generate fake images
+        self.sampled = self.sampler(self.z)
+        self.D_f = self.discriminator(self.images)
+    
+    # step 2: define the two losses
+    self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+    self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+    self.d_loss = self.d_loss_real + self.d_loss_fake
+    
+    self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_f, np.ones(self.batch_szie))
+```
+
+If we do not have a dependency engine but have blocks, the module building our GAN model will look like this:
+```python
+class DCGAN(object):
+  def build_model(self, default_block):
+    # input data in the default block
+    if self.y_dim:
+        self.y = pd.data(pd.float32, [self.batch_size, self.y_dim])
+    self.images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    # self.faked_images = pd.data(pd.float32, [self.batch_size, self.im_size, self.im_size])
+    self.z = pd.data(tf.float32, [None, self.z_size])
+
+    # step 1: generate images by generator, classify real/fake images with discriminator
+    with pd.default_block().g_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.G = self.generator(self.z, self.y)
+        self.D_g = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.G = self.generator(self.z)
+        self.D_g = self.discriminator(self.G, self.y)
+      self.g_loss = pd.reduce_mean(pd.cross_entropy(self.D_g, np.ones(self.batch_szie))
+    
+    with pd.default_block().d_block():
+      if self.y_dim: # if conditional GAN, includes label
+        self.D_t = self.discriminator(self.images, self.y)
+        self.D_f = self.discriminator(self.G, self.y)
+      else: # original version of GAN
+        self.D_t = self.discriminator(self.images)
+        self.D_f = self.discriminator(self.G)
+
+      # step 2: define the two losses
+      self.d_loss_real = pd.reduce_mean(pd.cross_entropy(self.D_t, np.ones(self.batch_size))
+      self.d_loss_fake = pd.reduce_mean(pd.cross_entropy(self.D_f, np.zeros(self.batch_size))
+      self.d_loss = self.d_loss_real + self.d_loss_fake
+```
+Some small points of confusion and problems with this design:
+- D\_g and D\_f are actually the same thing, but have to be written twice; i.e., if we want to run two sub-graphs conceptually, the same code has to be written twice if it is shared by the graphs.
+- Requires the ability to create a block anytime, rather than only in if-else or rnn.
+
+## Main function for the demo:
+Generally, the user of GAN just needs to do the following things:
+- Define an object as DCGAN class;
+- Build the DCGAN model;
+- Specify two optimizers for two different losses with respect to different parameters.
+```python
+# pd for short, should be more concise.
+from paddle.v2 as pd
+import numpy as np
+import logging
+
+if __name__ == "__main__":
+    # dcgan class in the default graph/block
+    # if we use dependency engine as tensorflow
+    # the codes, will be slightly different like:
+    # dcgan = DCGAN()
+    # dcgan.build_model()
+    with pd.block() as def_block:
+      dcgan = DCGAN()
+      dcgan.build_model(def_block)
+
+    # load mnist data
+    data_X, data_y = self.load_mnist()
+    
+    # Two subgraphs required!!!
+    with pd.block().d_block():
+      d_optim = pd.train.Adam(lr = .001, beta= .1)
+      d_step = d_optim.minimize(dcgan.d_loss, dcgan.theta_D)
+    with pd.block.g_block():
+      g_optim = pd.train.Adam(lr = .001, beta= .1)
+      g_step = pd.minimize(dcgan.g_loss, dcgan.theta_G)
+
+    # executor
+    sess = pd.executor()
+    
+    # training
+    for epoch in xrange(10000):
+      for batch_id in range(N / batch_size):
+        idx = ...
+        # sample a batch
+        batch_im, batch_label = data_X[idx:idx+batch_size], data_y[idx:idx+batch_size]
+        # sample z
+        batch_z = np.random.uniform(-1., 1., [batch_size, z_dim])
+
+        if batch_id % 2 == 0:
+          sess.run(d_step, 
+                   feed_dict = {dcgan.images: batch_im,
+                                dcgan.y: batch_label,
+                                dcgan.z: batch_z})
+        else:
+          sess.run(g_step,
+                   feed_dict = {dcgan.z: batch_z})
+```
+
+# More thinking about dependency engine v.s. block design:
+- What if we just want to run an intermediate result? Do we need to run the whole block/graph?
+- Should we call eval() to get the fake images in the first stage? And then train the discriminator in the second stage?
diff --git a/doc/design/graph_survey.md b/doc/design/graph_survey.md
new file mode 100644
index 0000000000000000000000000000000000000000..6c6db08f463ae0a2b94fc4546f123a1d7c151870
--- /dev/null
+++ b/doc/design/graph_survey.md
@@ -0,0 +1,232 @@
+## Survey on Graph
+
+Neural network frameworks often provide a symbolic API for users to write network topology conveniently. This doc mainly focuses on the symbolic APIs in the most popular neural network frameworks, and tries to find out how to parse a symbolic configuration into a portable file, such as protobuf or json.
+
+### Mxnet
+
+The core concept of the symbolic API is `Symbol`. Mxnet implements the `Symbol` class in C++, and exports it to Python using the C-API. Please refer to the comments in Mxnet:
+
+
+`Symbol` is help class used to represent the operator node in Graph.
+`Symbol` acts as an interface for building graphs from different components like Variable, Functor and Group. `Symbol` is also exported to python front-end (while Graph is not) to enable quick test and deployment. Conceptually, symbol is the final operation of a graph and thus including all the information required (the graph) to evaluate its output value.
+
+
+A simple network topology written with Symbol is as follows:
+
+```python
+def get_symbol(num_classes=10, **kwargs):
+    data = mx.symbol.Variable('data')
+    data = mx.symbol.Flatten(data=data)
+    fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+    act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
+    fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
+    act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
+    fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=num_classes)
+    mlp  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
+    return mlp
+```
+
+
+
+Variable here is actually a Symbol. Every basic Symbol will correspond to one Node, and every Node has its own NodeAttr. There is an op field in the NodeAttr class; when a Symbol represents a Variable (often input data), the op field is null.
+
+Symbol contains a data member, std::vector<NodeEntry> outputs, and NodeEntry contains a pointer to Node. We can follow the Node pointer to get the whole Graph.
+
+And Symbol can be saved to a Json file.
+
+Here is a detailed example:
+
+```
+>>> import mxnet as mx
+>>> data = mx.symbol.Variable('data')
+>>> print data.debug_str()
+Variable:data
+
+>>> data = mx.symbol.Flatten(data=data)
+>>> print data.debug_str()
+Symbol Outputs:
+	output[0]=flatten0(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+
+>>> fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
+>>> print fc1.debug_str()
+Symbol Outputs:
+	output[0]=fc1(0)
+Variable:data
+--------------------
+Op:Flatten, Name=flatten0
+Inputs:
+	arg[0]=data(0) version=0
+Variable:fc1_weight
+Variable:fc1_bias
+--------------------
+Op:FullyConnected, Name=fc1
+Inputs:
+	arg[0]=flatten0(0)
+	arg[1]=fc1_weight(0) version=0
+	arg[2]=fc1_bias(0) version=0
+Attrs:
+	num_hidden=128
+
+```
+
+
+### TensorFlow
+
+
+The core concept of symbolic API is `Tensor`. Tensorflow defines `Tensor` in Python. Please refer to the comments in TensorFlow:
+
+A `Tensor` is a symbolic handle to one of the outputs of an `Operation`. It does not hold the values of that operation's output, but instead provides a means of computing those values in a TensorFlow [Session](https://www.tensorflow.org/api_docs/python/tf/Session).
+
+A simple example is as follows:
+
+```python
+  # Build a dataflow graph.
+  c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+  d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+  e = tf.matmul(c, d)
+
+  # Construct a `Session` to execute the graph.
+  sess = tf.Session()
+
+  # Execute the graph and store the value that `e` represents in `result`.
+  result = sess.run(e)
+```
+
+  
+The main method of `Tensor` is as follows: 
+ 
+ 
+```python
+@property
+def op(self):
+  """The `Operation` that produces this tensor as an output."""
+  return self._op
+
+@property
+def dtype(self):
+   """The `DType` of elements in this tensor."""
+  return self._dtype
+
+@property
+def graph(self):
+  """The `Graph` that contains this tensor."""
+  return self._op.graph
+
+@property
+def name(self):
+  """The string name of this tensor."""
+  if not self._op.name:
+    raise ValueError("Operation was not named: %s" % self._op)
+  return "%s:%d" % (self._op.name, self._value_index)
+
+@property
+def device(self):
+  """The name of the device on which this tensor will be produced, or None."""
+  return self._op.device
+```
+
+
+Tensor can be taken as target to run by session. Tensor contains all the information of Graph, and tracks data dependency.
+
+
+Here is a detailed example:
+
+
+```
+>>> import tensorflow as tf
+>>> c = tf.constant([[1.0, 2.0], [3.0, 4.0]])
+>>> print c.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> d = tf.constant([[1.0, 1.0], [0.0, 1.0]])
+>>> print d.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+>>> e = tf.matmul(c, d)
+>>> print e.graph
+<tensorflow.python.framework.ops.Graph object at 0x10f256d50>
+```
+
+### Dynet
+
+
+The core concept of symbolic API is `Expression`, and Dynet defines `Expression` class in C++.
+
+
+A simple example is as follows:
+
+```cpp
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+
+Expression in = input(cg, xs[i]);
+Expression label = input(cg, ys[i]);
+Expression pred = W * in;
+Expression loss = square(pred - label);
+```
+
+The input data and parameters are also represented by Expression. Every basic Expression corresponds to a Node. And input data is also a Node.
+
+Expression has a data member ComputationGraph, and ComputationGraph will be modified in the users' configuring process. Expression can be a running target, because Expression contains all dependencies.
+
+
+Here is a detailed example:
+
+write topology in C++
+
+```
+ComputationGraph cg;
+Expression W = parameter(cg, pW);
+cg.print_graphviz();
+
+Expression pred = W * xs[i];
+cg.print_graphviz();
+
+Expression loss = square(pred - ys[i]);
+cg.print_graphviz();
+```
+
+compile and print
+
+```
+# first print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+}
+# second print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+}
+# third print
+digraph G {
+  rankdir=LR;
+  nodesep=.05;
+  N0 [label="v0 = parameters({1}) @ 0x7ffe4de00110"];
+  N1 [label="v1 = v0 * -0.98"];
+  N0 -> N1;
+  N2 [label="v2 = -1.88387 - v1"];
+  N1 -> N2;
+  N3 [label="v3 = -v2"];
+  N2 -> N3;
+  N4 [label="v4 = square(v3)"];
+  N3 -> N4;
+}
+```
+
+### Conclusion
+
+
+Actually, Symbol/Tensor/Expression in Mxnet/TensorFlow/Dynet are concepts at the same level. We use a unified name, Expression, here; a concept at this level has the following features:
+
+- Users write topology with the symbolic API, and every return value is an Expression, including input data and parameters.
+- Expression corresponds with a global Graph, and Expression can also be composed.
+- Expression tracks all dependency and can be taken as a run target
diff --git a/doc/design/if_else_op.md b/doc/design/if_else_op.md
index 954a19c0733358c235eae3cffe134c23dac94c95..26d140f06db4ecefa86be015eaa731ffddc6910c 100644
--- a/doc/design/if_else_op.md
+++ b/doc/design/if_else_op.md
@@ -1,41 +1,51 @@
-IfOp should have only one branch. An IfOp operator takes a `cond` variable whose value must be a vector of N boolean elements. Its return value has N instances. If cond[i] == True, input instance input[i] will go through true_block() and generate output[i]; otherwise it will produce output from false_bloack().
+# The `IfElse` Operator
 
-```python
-import paddle as pd
+PaddlePaddle's `IfElse` operator differs from TensorFlow's:
 
-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1)
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
-
-with b.false_block():
-    x = b.inputs(0)
-    z = layer.fc(x)
-    b.set_output(0, operator.softmax(z))
-
-out = b(cond)
-```
+- the TensorFlow version takes a scalar boolean value as the condition so that the whole mini-batch goes to either the true or the false branch, whereas
+- the PaddlePaddle version takes a vector of boolean values as the condition, and instances corresponding to true values go to the true branch, while those corresponding to false values go to the false branch.
+
+## Example
+
+The following PaddlePaddle program shows the usage of the IfElse operator:
 
-If only true_block is set in an IfElseOp, a special case is that we can have a default value for false as:
 ```python
 import paddle as pd
 
-x = var()
-y = var()
-cond = var()
-default_value = var()
-b = pd.create_ifelseop(inputs=[x], output_num=1, default_value)
-
-with b.true_block():
-    x = b.inputs(0)
-    z = operator.add(x, y)
-    b.set_output(0, operator.softmax(z))
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
+
+ie = pd.ifelse()
+with ie.true_block():
+    d = pd.layer.add(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
 
-out = b(cond)
+A challenge to implement the `IfElse` operator is to infer those variables to be split, or, say, to identify the variable of the mini-batch or those derived from the mini-batch.
+
+An equivalent C++ program is as follows:
+
+```c++
+namespace pd = paddle;
+
+int x = 10;
+int y = 1;
+int z = 10;
+bool cond = false;
+int o1, o2;
+if (cond) {
+  int d = x + y;
+  o1 = z;
+  o2 = pd::layer::softmax(z);
+} else {
+  int d = pd::layer::fc(z);
+  o1 = d;
+  o2 = d+1;
+}
 ```
-where default_value is a list of vars for `cond` == False.
diff --git a/doc/design/images/feed_forward.png b/doc/design/images/feed_forward.png
new file mode 100644
index 0000000000000000000000000000000000000000..d312371a04c26aa6cd196e0bd1f51becb425180b
Binary files /dev/null and b/doc/design/images/feed_forward.png differ
diff --git a/doc/design/images/feed_forward_regularized.png b/doc/design/images/feed_forward_regularized.png
new file mode 100644
index 0000000000000000000000000000000000000000..677e99bfd9f8e72ed9fe4b27127af2ced202f447
Binary files /dev/null and b/doc/design/images/feed_forward_regularized.png differ
diff --git a/doc/design/images/graph_construction_example.dot b/doc/design/images/graph_construction_example.dot
index 8d1b673abf6b78c851676fa379dc850c4818f0e5..e115f9844bae6ad24f638c8ed4749cea8aff06a9 100644
--- a/doc/design/images/graph_construction_example.dot
+++ b/doc/design/images/graph_construction_example.dot
@@ -33,7 +33,6 @@ digraph ImageClassificationGraph {
 
         cost -> MSE_Grad [color=red];
         d_cost -> MSE_Grad [color=red];
-        x -> MSE_Grad [color=red];
         l -> MSE_Grad [color=red];
         y -> MSE_Grad -> d_y [color=red];
 
diff --git a/doc/design/images/graph_construction_example_all.png b/doc/design/images/graph_construction_example_all.png
index 181187503472d15779b87284105841168b3945c4..261611a5721f9aa97874f7e6d897fe48cf667db2 100644
Binary files a/doc/design/images/graph_construction_example_all.png and b/doc/design/images/graph_construction_example_all.png differ
diff --git a/doc/design/images/graph_construction_example_forward_backward.png b/doc/design/images/graph_construction_example_forward_backward.png
index 3049a9315fd616464dec54e33064cb75598ca536..4c69687f4a6a181138f3df72ce5e8aa48487b5be 100644
Binary files a/doc/design/images/graph_construction_example_forward_backward.png and b/doc/design/images/graph_construction_example_forward_backward.png differ
diff --git a/doc/design/images/graph_construction_example_forward_only.png b/doc/design/images/graph_construction_example_forward_only.png
index 25d19088cbf0b5f68cf734f2ff21eba8af4a2860..e668c16e0cac73acb4e5dc2b1827557ae77126b4 100644
Binary files a/doc/design/images/graph_construction_example_forward_only.png and b/doc/design/images/graph_construction_example_forward_only.png differ
diff --git a/doc/design/images/l1_regularization.png b/doc/design/images/l1_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..e1b9c7a44f94dc027598a98da93ddb8133190972
Binary files /dev/null and b/doc/design/images/l1_regularization.png differ
diff --git a/doc/design/images/l2_regularization.png b/doc/design/images/l2_regularization.png
new file mode 100644
index 0000000000000000000000000000000000000000..d5c2fcbc2ccae75ad083162e5a2dceb0210be298
Binary files /dev/null and b/doc/design/images/l2_regularization.png differ
diff --git a/doc/design/images/loss_equation.png b/doc/design/images/loss_equation.png
new file mode 100644
index 0000000000000000000000000000000000000000..14212ec8d36c803de96bde8a9a4b5591bd20434e
Binary files /dev/null and b/doc/design/images/loss_equation.png differ
diff --git a/doc/design/infer_var_type.md b/doc/design/infer_var_type.md
new file mode 100644
index 0000000000000000000000000000000000000000..d9d5397becba2ef1806d9341cd49cd9aabbf4a6a
--- /dev/null
+++ b/doc/design/infer_var_type.md
@@ -0,0 +1,78 @@
+# Design Doc: InferVarType
+
+## The Problem Posed
+
+A variable in our design can hold different types, such as `LoDTensor` and `SelectedRows`. An operator should be able to infer the variable types of its outputs.
+
+For example, a `lookup table` operator takes two `LoDTensor`; one is a float tensor as the embedding table, the other is an int tensor as word ID. The gradient operator of `lookup table` will generate a `SelectedRows` as its output. A `sum` operator can take both `LoDTensor` and `SelectedRows` as its inputs and will generate a `LoDTensor` if any of its inputs is `LoDTensor`, otherwise, the `sum` operator will generate `SelectedRows` as its output.
+
+The variable type will be constant at runtime. Every variable's type can either be set by the user (input data and parameter) or be inferred by the operator at compile time.
+
+## Proposed Solution
+
+The `InferVarType` is a compile-time function which is registered to each operator. The interface of that function is:
+
+
+```c++
+using InferVarTypeFN = std::function<
+    void (const OpDescBind& /*op_desc*/, BlockDescBind* /*block*/)>;
+```
+
+It takes an operator description as its input and writes the output variable types into the block description.
+
+The `InferVarTypeFN` will be registered in `OpInfo`, to replace `infer_var_type_` field. The `OpInfo` should be
+
+```cpp
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  ...
+};
+```
+
+The default `InferVarType` will set output type as `LoDTensor`. It can be done by `GetInferVarType()`.
+
+```cpp
+void DefaultInferVarType(const OpDescBind& op_desc, BlockDescBind* block) {
+  // set the output type of variable as `LoDTensor`.
+  // ...
+}
+
+struct OpInfo {
+  InferVarTypeFN infer_var_type_;
+  InferVarTypeFN GetInferVarType() const {
+    if (infer_var_type_) {
+      return infer_var_type_;
+    } else {
+      return DefaultInferVarType;
+    }
+  }
+};
+```
+
+## Register InferVarType
+
+We provide a thin base class for registering an `InferVarTypeFN`. Using a base class eases the implementation of the registry, since we can detect whether a registry entry is an `InferVarTypeFN` or not.
+
+```cpp
+class VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const = 0;
+}
+```
+
+Operator developers can write a specialized `VarTypeInferer` as follows.
+
+```cpp
+class SpecialVarTypeInferer : public VarTypeInferer {
+public:
+  virtual void operator()(const OpDescBind& op_desc, BlockDescBind* block) const {
+    // .. own logic
+  }
+}
+```
+
+Then users can register the `InferVarType` just like `GradOpDescMaker` and `OpInfoMaker`.
+
+```
+REGISTER_OPERATOR(some_op, OpType, SpecialVarTypeInferer, ...);
+```
diff --git a/doc/design/model_format.md b/doc/design/model_format.md
new file mode 100644
index 0000000000000000000000000000000000000000..db8c36e5f5dca94b516aad2134c1bdc8ccc6c744
--- /dev/null
+++ b/doc/design/model_format.md
@@ -0,0 +1,36 @@
+# Design Doc: Model Format
+
+## Motivation
+
+The model is the output of the training process. A complete model consists of two parts, namely, the **topology** and the **parameters**. To support industrial deployment, the model format must be self-contained and must not expose any training source code.
+
+As a result, in PaddlePaddle, the **topology** is represented as a [ProgramDesc](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/doc/design/program.md), which describes the model structure. The **parameters** contain all the trainable weights in the model; we must support large parameters and efficient serialization/deserialization.
+
+## Implementation
+
+The topology is saved as plain text; in detail, a self-contained protobuf file.
+
+The parameters are saved as a binary file. As we all know, a protobuf message has a [64M size limit](https://developers.google.com/protocol-buffers/docs/reference/cpp/google.protobuf.io.coded_stream#CodedInputStream.SetTotalBytesLimit.details). We did a [benchmark experiment](https://github.com/PaddlePaddle/Paddle/pull/4610), and its result shows that protobuf is not a good fit for this scenario.
+
+As a result, we design a particular format for tensor serialization. By default, an arbitrary tensor in Paddle is a [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md), and has a description proto, [LoDTensorDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L99). We save the DescProto as the byte string header; it contains the necessary information, such as the `dims`, the `name` of the tensor, and the `LoD` information in [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/1c0a4c901c9fc881d120249c703b15d1c50dae7d/paddle/framework/lod_tensor.md). A tensor stores its values in a continuous memory buffer; for speed, we dump the raw memory to disk and save it as the byte string content. So, the binary format of one tensor is:
+
+|HeaderLength|ContentLength|**LoDTensorDesc**|**TensorValue**|
+
+In detail, the tensor's byte view is as the following table shows. Note that all the signed values are written in little-endian.
+
+```text
+[offset] [type]              [description] 
+0004     4 bytes integer      HeaderLength, the length of LoDTensorDesc
+0008     4 bytes integer      ContentLength, the length of LodTensor Buffer
+0009     1 bytes char         TensorDesc
+00010    1 bytes char         TensorDesc
+...
+00100    1 bytes char         TensorValue
+00101    1 bytes char         TensorValue
+00102    1 bytes char         TensorValue              ..
+...
+```
+
+## Summary
+
+We introduce the model format: the `ProgramDesc` describes the **topology**, and a set of binary tensors in a particular format describes the **parameters**.
diff --git a/doc/design/optimizer.md b/doc/design/optimizer.md
new file mode 100644
index 0000000000000000000000000000000000000000..202b4b65103c0b7c536a9cb466c4120ce134d8c3
--- /dev/null
+++ b/doc/design/optimizer.md
@@ -0,0 +1,91 @@
+## Optimizer Design
+
+### The Problem
+
+A PaddlePaddle program, or a block, is a sequence of operators operating on variables.  A training program needs to do three kinds of work:
+
+1. the forward pass, which computes intermediate results and the cost(s),
+1. the backward pass, which derives gradients from intermediate results and costs, and
+1. the optimization pass, which updates model parameters to optimize the cost(s).
+
+This work relies on three kinds of operators:
+
+1. forward operators,
+1. gradient operators, and
+1. optimization operators.
+
+It's true that users should be able to create all these operators manually by calling some low-level API, but it would be much more convenient if they could only describe the forward pass and let PaddlePaddle create the backward and optimization operators automatically.
+
+In this design, we propose a high-level API that automatically derives the optimization pass and operators from the forward pass.
+
+
+### High-level Python API to describe the training process
+
+1. Users write code to describe the network:
+
+	```python
+	images = layer.data("images")
+	labels = layer.data("labels")
+	w1 = pd.var("w1")
+	b1 = pd.var("b1")
+	hidden = layer.fc(images, w=w1, b=b1)
+	cost = layer.mse(hidden, labels)
+	```
+
+	The above code snippet will create forward operators in [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md).
+
+
+2. Users create a certain kind of Optimizer with some argument.
+
+	```python
+	optimizer = AdagradOptimizer(learning_rate=0.001)
+	```
+
+3. Users use the optimizer to `minimize` a certain `cost` through updating parameters in parameter_list.
+
+	```python
+	opt_op_list = optimizer.minimize(cost, parameter_list=[w1, b1])
+	```
+	The above code snippet will create gradient and optimization operators in Block. The return value of `minimize()` is a list of optimization operators that will be run by the session.
+
+4. Users use Session/Executor to run this opt_op_list as target to do training.
+
+	```python
+	sess.run(target= opt_op_list, ...)
+	```
+
+#### Optimizer Python interface:
+
+```python
+class Optimizer(object):
+    """Optimizer Base class.
+
+    """
+
+    def __init__(self):
+        pass
+
+    def create_optimization_pass(self, parameters_and_grads):
+        """Add optimization operators to update gradients to variables.
+
+        Args:
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+
+        Returns:
+          optimization_op_list: a list of optimization operators that will update parameters using gradients.
+        """
+        return None
+
+    def minimize(self, loss, parameter_list):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines interface `append_backward_ops()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = self.create_backward_pass(loss, parameter_list)
+        update_ops = self.create_optimization_pass(params_grads)
+        return update_ops
+
+```
+
+Users can inherit the Optimizer above to create their own Optimizer with some special logic, such as AdagradOptimizer.
diff --git a/doc/design/program.md b/doc/design/program.md
index fb8f86ac07af403c9fee015f2a3adbfaa3c6d631..bd2456787c4e336d357a65255a8274a7c9e465cc 100644
--- a/doc/design/program.md
+++ b/doc/design/program.md
@@ -1,8 +1,10 @@
-# Design Doc: ProgramDesc
+# Design Doc: PaddlePaddle Programs
 
-The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+## Compile and Execution
+
+A PaddlePaddle program consists of two parts -- the first generates a `ProgramDesc` protobuf message that describes the program, and the second runs this message using a C++ class `Executor`.
 
-As described in [graph.md](./graph.md), the first five lines of the following PaddlePaddle program
+A simple example PaddlePaddle program can be found in [graph.md](./graph.md):
 
 ```python
 x = layer.data("images")
@@ -13,36 +15,112 @@ optimize(cost)
 train(cost, reader=mnist.train())
 ```
 
-generates, or compiles, a PaddelPaddle program, which is represented by the following protobuf message:
+The first five lines of the above PaddlePaddle program generate, or compile, the `ProgramDesc` message.  The last line runs it.
 
-```protobuf
-message ProgramDesc {
-  repeated BlockDesc blocks = 1;
+## Programs and Blocks
+
+The basic structure of a PaddlePaddle program is some nested blocks, as a C++ or Java program.
+
+- program: some nested blocks
+- [block](./block.md):
+  - some local variable definitions, and
+  - a sequence of operators
+
+The concept of block comes from usual programs.  For example, the following C++ program has three blocks:
+
+```c++
+int main() { // block 0
+  int i = 0;
+  if (i < 10) { // block 1
+    for (int j = 0; j < 10; j++) { // block 2
+    }
+  }
+  return 0;
 }
+```
+
+The following PaddlePaddle program has three blocks:
+
+```python
+import paddle as pd  # block 0
+
+x = minibatch([10, 20, 30]) # shape=[None, 1]
+y = var(1) # shape=[1], value=1
+z = minibatch([10, 20, 30]) # shape=[None, 1]
+cond = larger_than(x, 15) # [false, true, true]
 
+ie = pd.ifelse()
+with ie.true_block():  # block 1
+    d = pd.layer.add_scalar(x, y)
+    ie.output(d, pd.layer.softmax(d))
+with ie.false_block():  # block 2
+    d = pd.layer.fc(z)
+    ie.output(d, d+1)
+o1, o2 = ie(cond)
+```
+
+## `BlockDesc` and `ProgramDesc`
+
+All protobuf messages are defined in `framework.proto`.
+
+`BlockDesc` is straight-forward -- it includes local variable definitions, `vars`, and a sequence of operators, `ops`.
+
+```protobuf
 message BlockDesc {
   required int32 parent = 1;
   repeated VarDesc vars = 2;
   repeated OpDesc ops = 3;
 }
+```
+
+The parent ID indicates the parent block so that operators in a block can refer to variables defined locally and also those defined in their ancestor blocks.
+
+All hierarchical blocks in a program are flattened and stored in an array. The block ID is the index of the block in this array.
+
+```protobuf
+message ProgramDesc {
+  repeated BlockDesc blocks = 1;
+}
+```
+
+
+### Global Block
 
+The global block is the first one in the above array.
+
+## Operators that Use Blocks
+
+In the above example, the operator `IfElseOp` has two blocks -- the true branch and the false branch.
+
+The definition of `OpDesc` shows that an operator could have some attributes:
+
+```protobuf
 message OpDesc {
   AttrDesc attrs = 1;
   ...
 }
+```
+
+and an attribute could be of type block, which is, in fact, a block ID as described above:
 
+```
 message AttrDesc {
-  required AttrType type = 1;
+  required string name = 1;
 
-  // index into ProgramDesc::blocks when type==BLOCK
-  optional int32 block = 2;
+  enum AttrType {
+    INT = 1,
+    STRING = 2,
+    ...
+    BLOCK = ...
+  }
+  required AttrType type = 2;
+
+  optional int32 block = 10; // when type == BLOCK
   ...
 }
 ```
 
-When each of the first five lines runs, related Python function, e.g., `layer.fc`, calls C++ InferShape functions.  This InferShape function needs to access the properties of VarDesc's accessed by the current OpDesc. These VarDesc's might not be defined in the current block, but in some ancestor blocks.  This requires that we can trace the parent of a block.
-
-A nested block is often an attribute of an operator, most likely, an IfElseOp or a WhileOp.  In above solution, all blocks are in `ProgramDesc::blocks`, this implicitly assigns a zero-based ID to each block -- the index of the block in `ProgramDesc::blocks`.  So that `AttrDesc::block` could be an integer block ID.
+## InferShape
 
 With this design, the InferShape function should take the following parameters:
 
diff --git a/doc/design/prune.md b/doc/design/prune.md
new file mode 100644
index 0000000000000000000000000000000000000000..4a5cf10c79a554779137f0cce5494fdd96ef6b7a
--- /dev/null
+++ b/doc/design/prune.md
@@ -0,0 +1,63 @@
+# Prune
+
+## Motivation
+
+We want to support running inference, training and checkpointing in one `ProgramDesc`. We implement 
+`void Prune(const ProgramDesc* input, ProgramDesc* output)` function, which takes a `ProgramDesc`
+and generates a pruned `ProgramDesc`.
+
+## Challenge
+
+Pruning needs to support both variables and operators being evaluation targets. Consider the following
+different situations.
+
+```python
+# Case 1: run forward pass.
+cost_np = session.run(target=cost)
+# Case 2: run backward pass.
+opts_np, _ = session.run(target=[cost, opt])
+# Case 3: run checkpointing
+_ = session.run(target=checkpoint)
+```
+
+## Solution
+
+To support evaluation of operators, we add `is_target` field in the `OpDesc`.
+
+```c++
+message OpDesc {
+  required string type = 3;
+  repeated Var inputs = 1;
+  repeated Var outputs = 2;
+  repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
+};
+```
+
+To support evaluation of variables, we add [fetch_op](https://github.com/PaddlePaddle/Paddle/pull/4599).
+For each variable in the `target`, we insert a `fetch_op` into the `ProgramDesc` with `variable` being
+`fetch_op`'s input. Then we also mark the `fetch_op` as a target.
+
+### Algorithm
+
+If an operator needs to be run, it must fall into one of the following cases:
+
+1. It is the target.
+2. Some other op depends on it, meaning its output is some other op's input.
+
+The first case can be checked by `op_desc.is_target()`. The second case can be implemented as
+
+```c++
+bool HasDependentVar(const OpDesc& op_desc, const std::set<string>& dependent_vars) {
+  for (auto& var : op_desc.outputs()) {
+    for (auto& argu : var.arguments()) {
+      if (dependent_vars.count(argu) != 0) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+```
+
+Then the whole algorithm can be implemented as the following [code](https://github.com/tonyyang-svail/Paddle/blob/prune_impl/paddle/framework/prune.cc).
diff --git a/doc/design/python_api.md b/doc/design/python_api.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c
--- /dev/null
+++ b/doc/design/python_api.md
@@ -0,0 +1,284 @@
+# Design Doc: Python API
+
+Due to the refactorization of the PaddlePaddle core, we need Python classes to construct corresponding protobuf messages that describe a DL program.
+
+| Python classes | Protobuf messages |
+| --- | --- |
+| Program | ProgramDesc |
+| Block | BlockDesc |
+| Operator | OpDesc |
+| Variable | VarDesc |
+
+Please be aware that these Python classes need to maintain some construction-time information, which is not part of the protobuf messages.
+
+## Core Concepts
+
+### Program
+
+A `ProgramDesc` describes a [DL program](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md), which is composed of an array of `BlockDesc`s.  The `BlockDesc`s in a `ProgramDesc` can have a tree-like hierarchical structure. However, the `ProgramDesc` only stores a flattened array of `BlockDesc`s. A `BlockDesc` refers to its parent block by its index in the array.  For example, operators in the step block of an RNN operator need to be able to access variables in its ancestor blocks.
+
+Whenever we create a block, we need to set its parent block to the current block, hence the Python class `Program` needs to maintain a data member `current_block`.
+
+```python
+class Program(objects):
+    def __init__(self):
+        self.desc = core.NewProgram() # a C++ ProgramDesc pointer.
+        self.blocks = vector<Block>()
+        self.blocks.append(Block(self, -1)) # the global block
+        self.current_block = 0          # initialized to the global block
+
+    def global_block():
+        return self.blocks[0]
+
+    def current_block():
+        return self.get_block(self.current_block)
+
+    def rollback():
+        self.current_block = self.current_block().parent_idx
+
+    def create_block():
+        new_block_idx = len(self.block)
+        self.blocks.append(Block(self, self.current_block))
+        self.current_block = new_block_idx
+        return current_block()
+```
+
+`Program` is an accessor to the protobuf message `ProgramDesc`, which is created in C++ space, because the InferShape function is in C++, which manipulates `VarDesc` messages, which are in turn members of `BlockDesc`, which is a member of `ProgramDesc`.
+
+`Program` creates the first block as the global block in its constructor.  All parameters and their initializer operators are in the global block.
+
+### Block
+
+A [Block](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/block.md) includes
+
+1. a map from variable names to an instance of the Python `Variable` class, and
+1. a list of `Operator` instances.
+
+```python
+class Block(objects):
+    def __init__(self, program, parent_idx):
+        self.desc = core.NewBlock(program.desc)
+        self.program = program
+        self.vars = map<string, Variable>()
+        self.ops = vector<Operator>()
+        self.parent_idx = parent_idx
+
+    def create_var(self, ...):
+        return Variable(self, ...)
+
+    def _create_global_var(self, ...):
+        program.global_block().create_var(...)
+
+    def create_parameter(self, name, ...):
+        # Parameter is a subclass of variable. See Parameter section for details.
+        self.vars[name] = Parameter(self._create_global_var(...), ...)
+        return self.vars[name]
+
+    def append_operator(self, ...):
+        self.ops.append(Operator(self, ...))
+
+    def prepend_operator(self, ...): # Parameter's ctor prepends initialize operators.
+       self.ops.prepend(Operator(self, ...))
+```
+
+`create_parameter` is necessary because parameters are global variables, defined in the global block, but can be created in some sub-blocks. For example, an FC layer in the step block of an RNN operator.
+
+`prepend_operator` is necessary because the constructor of `Parameter` needs to create the initialize (or load) operator of the parameter, and would like to put it in the *preamble* of the global block.
+
+### Operator
+
+The `Operator` class fills in the `OpDesc` message and calls the C++ function `InferShape` to infer the output shapes from the input shapes.
+
+```python
+class Operator(object):
+    def __init__(self,
+                 block,  # Block
+                 type,   # string
+                 inputs, # dict<string, Variable>
+                 outputs,# dict<string, Variable>
+                 attrs   # dict<string, Any>
+                 ):
+        self.desc = core.NewOpDesc(block.desc, type, inputs, outputs, attrs)
+        core.infer_shape(self.desc, inputs, outputs)
+
+    def type(self):
+        return self.desc.type()
+```
+
+`Operator` creates the `OpDesc` message in C++ space, so that it can call the `InferShape` function, which is in C++.
+
+### Variable
+
+Operators take Variables as their inputs and outputs.
+
+```python
+class Variable(object):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 ):
+        if name is None:
+            name = unique_name_generator()
+        self.name = name
+        self.block = block
+        self.desc = core.NewVarDesc(block.desc, name, shape, lod_level)
+        self.writer = None
+```
+
+Please be aware of `self.writer`, which tracks the operator that creates the variable.  It is possible that more than one operator writes a variable, but in Python space, each write to a variable is represented by a Variable class.  This is guaranteed by the fact that **`core.NewVarDesc` must NOT create a new `VarDesc` message if its name already exists in the specified block**.
+
+### Parameter
+
+A parameter is a global variable with an initializer (or load) operator.
+
+```python
+class Parameter(Variable):
+    def __init__(self,
+                 block=None,      # Block
+                 name=None,       # string
+                 shape,           # tuple
+                 dtype="float32", # string
+                 lod_level=None   # int
+                 trainable,       # bool
+                 initialize_op_attrs,
+                 optimize_op_attrs):
+        super(Parameter, self).__init__(block, name, shape, dtype, lod_level)
+        self.trainable = trainable
+        self.optimize_op_attrs = optimize_op_attrs
+        block.prepend(Operator(block,  # Block
+                               initialize_op_attrs['type'],   # string
+                               None,   # no inputs
+                               self,   # output is the parameter
+                               initialize_op_attrs)
+```
+
+When users create a parameter, they can call
+
+```python
+program.create_parameter(
+  ...,
+  init_attr={
+    type: "uniform_random",
+    min: -1.0,
+    max: 1.0,
+  })
+)
+```
+
+In the above example, `init_attr.type` names an initialize operator.  It can also name the load operator:
+
+```python
+init_attr={
+ type: "load",
+ filename: "something.numpy",
+}
+```
+
+`optimize_op_attrs` is not in the `VarDesc` message, but kept in the Python instance, as it will be used in the Python space when creating the optimize operator's `OpDesc`, and will be in the `OpDesc` message.
+
+## Layer Function
+
+A layer is a Python function that creates some operators and variables. Layers simplify the work of application programmers.
+
+Layer functions take `Variable` and configuration parameters as their input and return the output variable(s).
+
+For example, `FullyConnected` takes one or more variables as its input. The input could be input data or another layer's output. There are many configuration options for a `FullyConnected` layer, such as layer size, activation, parameter names, initialization strategies of parameters, and so on. The `FullyConnected` layer will return an output variable.
+
+
+### Necessity for reusing code between layer functions
+
+There is a lot of code that can be reused, such as:
+
+* Give the default value of configuration, e.g., the default initialize strategy for parameters is uniform random with `min = -1.0`, `max = 1.0`, and the default initialize strategy for bias is to fill zero.
+* Append the activation operator.
+* Create a temporary variable.
+* Create parameter.
+* Generate a unique name.
+* Add a bias.
+* ...
+
+A mechanism to reuse code between layer functions is necessary. It will be around [150 lines of code](https://github.com/PaddlePaddle/Paddle/pull/4724/files#diff-823b27e07e93914ada859232ae23f846R12) if we write a `FullyConnected` layer without any helper functions.
+
+
+
+### Comparison between global functions and helper class
+
+The `FullyConnected` layer will be as follows when we provide global functions:
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  if name is None:
+    name = unique_name("fc")
+  input = multiple_input(input)
+  param_attr = default_param_attr(param_attr)
+  param_attr = multiple_param_attr(param_attr, len(input))
+
+  # mul
+  mul_results = []
+  for ipt, attr in zip(input, param_attr):
+    shape = ipt.shape[1:] + [size]
+    w = g_program.global_block().create_parameter(shape, ipt.dtype, name, attr)
+    tmp = create_tmp_var(name)
+    g_program.current_block().append_op("mul", {ipt, w}, {tmp})
+  mul_results.append(tmp)
+
+  # add sum
+  ...
+  # add bias
+  ...
+  # add activation
+  ...
+  return out
+```
+
+We can provide many helper functions for layer developers. However, there are several disadvantages for global helper functions:
+
+1. We need a namespace for these methods, then layer developers can quickly figure out what method they can use.
+2. Global functions will force layer developers to pass their parameters again and again.
+
+So we provide a helper class, `LayerHelper`, to share code between layer functions. The `FullyConnected` Layer will be as follows.
+
+```python
+def fc_layer(input, size, param_attr=None, bias_attr=None, act=None, name=None):
+  helper = LayerHelper(locals())  # pass all parameter to LayerHelper
+
+  mul_results = []
+  for ipt, param in helper.iter_multiple_input_and_param():
+    w = helper.create_parameter(shape=ipt.shape[1:] + [size], dtype = ipt.dtype)
+    tmp = helper.create_tmp_variable()
+    helper.append_op('mul', {ipt, w}, {tmp})
+    mul_results.append(tmp)
+
+  pre_bias = helper.add_sum(mul_results)
+  pre_activation = helper.add_bias(pre_bias)
+  return helper.add_activation(pre_activation)
+```
+
+We not only use fewer lines of code to write `fc_layer` but also make the code easier to understand. At the same time, layer developers can figure out what function they can invoke by typing `helper.` in a python editor.
+
+
+### Implementation of layer helper
+
+We just keep all parameters of a layer function as a dictionary in the layer helper as a private data member. Every method of the layer helper will look up the dictionary after it is invoked. In that way, we can implement a layer helper for all layer functions even if some layers do not contain some operators. For example, the `activation` is used by the FullyConnected layer or convolution layers, but a cross-entropy layer does not use it. The example code of `add_activation` is:
+
+```python
+class LayerHelper(object):
+  def __init__(self, **kwargs):  # kwargs is short for `keyword arguments`
+    self.kwargs = kwargs
+
+  def add_activation(self, input_var):
+    act = self.kwargs.get("act", None)  # default value is None
+    if act is None:  # do nothing if no act
+      return input_var
+
+    tmp = self.create_tmp_var(self)
+    self.append_op(type=act, input=input_var, output=tmp)
+    return tmp
+```
+
+## Optimizer
+
+[Optimizer Design Doc](./optimizer.md)
diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md
new file mode 100644
index 0000000000000000000000000000000000000000..1d9a26683c14f54e3b5fe41675cd03b5620646b8
--- /dev/null
+++ b/doc/design/refactor/session.md
@@ -0,0 +1,180 @@
+# Design Doc: Session
+
+## Abstract
+
+The *session* object encapsulates the environment in which the
+computation graph is executed.
+
+We will have the *local* session and the *remote* session; they offer the
+same [interface](#interface). The local session encapsulates the local
+runtime environment and the remote session encapsulates the cluster
+runtime environment.
+
+The local runtime environment contains:
+
+1. computation devices (i.e., CPU, GPU) handles, and
+1. the [scope](../scope.md) which holds all variables.
+
+The remote runtime environment contains:
+
+1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
+   and
+1. the distributed [scope](../scope.md) in a cluster which holds all
+   variables.
+
+The user can create a remote session on Paddle Cloud and evaluate the
+computation graph with it. In this way, the user can control the
+remote computation resource in a cluster from his local computer.
+
+
+## Background
+
+The current design has an implicit global session in which
+`paddle.eval()` is executed. The pain point is:
+
+Since the user is not able to explicitly switch between runtime
+environments, the user cannot run a topology in two independent
+environments.
+
+For example, in reinforcement learning, the user may want to have a
+stale model for inference and a fresh model for training, and only
+replace the stale model with the fresh model periodically.
+
+Furthermore, we have no concept that encapsulates a remote environment
+that executes a computation graph.
+
+We need the session object to address above issues.
+
+
+## Session
+
+A session is an object that owns the runtime environment. All
+computations are executed through `session.eval()`.
+
+
+### Interface
+
+```python
+eval(
+    targets,
+    feed_dict=None,
+)
+```
+
+Evaluates the target Operations or Variables in `targets`.
+
+- *targets*: the evaluation targets. Can be a single Operation or
+  Variable, or a list with the Operations or Variables as
+  elements. The value returned by `eval()` has the same shape as the
+  `targets` argument.
+
+  The PaddlePaddle program is represented by
+  the [ProgramDesc](../program.md), `eval()` will infer the
+  ProgramDesc from the given targets and run the PaddlePaddle
+  program. Please
+  see
+  [this graph](./distributed_architecture.md#local-training-architecture) for
+  the detailed illustration for the local session
+  and
+  [this graph](./distributed_architecture.md#distributed-training-architecture) for
+  the detailed illustration for the remote session.
+
+- *feed_dict*: a dictionary that contains the tensors which override
+  the edges of the computation graph.
+
+  feed_dict not only can provide the input data, it can override any
+  OP's input as well:
+
+  ```python
+  a = pd.constant(2.0, name="a")
+  b = pd.variable(name="b")
+  c = pd.mul(a,b)
+  sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
+  ```
+
+```python
+close()
+```
+
+Closes the session and releases the scope that the session owns.
+
+
+### Create a Local Session
+
+```python
+session(
+    devices=None
+)
+```
+
+Creates a new session. One session owns one global scope, so creating
+multiple sessions will create different scopes.
+
+- *devices*: a single `string` or a list of `string` of device names,
+  the corresponding devices will be the computation devices for
+  `eval()`. If not specified, all available devices (e.g., all GPUs)
+  will be used. The user doesn't need to specify the CPU device since
+  it will be always used. Multiple sessions can use the same device.
+
+
+#### Example
+
+```Python
+a = paddle.constant(1.0)
+b = paddle.constant(2.0)
+c = a + b
+sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
+sess.eval(c)
+sess.close()
+```
+
+### Create a Remote Session
+
+```python
+create_cloud_job(
+    name,
+    num_trainer,
+    mem_per_trainer,
+    gpu_per_trainer,
+    cpu_per_trainer,
+    num_ps,
+    mem_per_ps,
+    cpu_per_ps,
+)
+```
+
+Creates a Paddle Cloud job. Fails if the job name exists.
+
+```python
+get_cloud_job(
+    name
+)
+```
+
+Gets a Paddle Cloud job.
+
+```python
+remote_session(
+    job
+)
+```
+
+- *job*: the Paddle Cloud job.
+
+#### Example
+
+```Python
+reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
+image = reader.column(0)
+label = reader.column(1)
+fc1 = paddle.op.fc(image, size=256, act="sigmoid")
+fc2 = paddle.op.fc(fc1, size=10, act="softmax")
+cost = paddle.op.cross_entropy(fc2, label)
+opt = paddle.optimizer.sgd(cost)
+
+job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
+sess = paddle.remote_session(job)
+for i in range(1000):
+    sess.eval(opt)
+sess.close()
+```
diff --git a/doc/design/refactorization.md b/doc/design/refactorization.md
index ad801ca421ca31c84b0a6b0a18d1d625c87e0de5..f93d6155e1764386b01d2f0df3f141ab75cd55d4 100644
--- a/doc/design/refactorization.md
+++ b/doc/design/refactorization.md
@@ -1,40 +1,40 @@
 # Design Doc: Refactorization Overview
 
-The goal of refactorizaiton include:
+The goals of refactoring include:
 
-1. Make it easy for external contributors to write new elementory computaiton operations.
-1. Make the codebase clean and readable.
-1. Introduce a new design of computation representation -- a computation graph of operators and variables.
-1. The graph representation helps implementing auto-scalable and auto fault recoverable distributed computing.
+1. Making it easy for external contributors to write new elementary computation operations.
+1. Making the codebase clean and readable.
+1. Designing a new computation representation -- a computation graph of operators and variables.
+1. Implementing auto-scalability and auto fault recoverable distributed computing with the help of computation graphs.
 
 ## Computation Graphs
 
-1. PaddlePaddle represent the computation, training and inference of DL models, by computation graphs.
+1. PaddlePaddle represents the computation, training and inference of Deep Learning models, by computation graphs.
 
-  1. Please dig into [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a solid example.
+  1. Please refer to [computation graphs](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/graph.md) for a concrete example.
 
-1. Users write Python programs to describe the graphs and run it (locally or remotely).
+1. Users write Python programs to describe the graphs and run them (locally or remotely).
 
 1. A graph is composed of *variables* and *operators*.
 
-1. The description of graphs must be able to be serialized/deserialized, so it
+1. The description of graphs must be serializable/deserializable, so that:
 
-   1. could to be sent to the cloud for distributed execution, and
-   1. be sent to clients for mobile or enterprise deployment.
+   1. It can be sent to the cloud for distributed execution, and
+   1. It can be sent to clients for mobile or enterprise deployment.
 
-1. The Python program do
+1. The Python program does two things:
 
-   1. *compilation*: runs a Python program to generate a protobuf message representation of the graph and send it to
+   1. *Compilation* runs a Python program to generate a protobuf message representation of the graph and send it to
       1. the C++ library `libpaddle.so` for local execution,
       1. the master process of a distributed training job for training, or
       1. the server process of a Kubernetes serving job for distributed serving.
-   1. *execution*: according to the protobuf message, constructs instances of class `Variable` and `OperatorBase`, and run them.
+   1. *Execution* executes the graph by constructing instances of class [`Variable`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h#L24) and [`OperatorBase`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L70), according to the protobuf message.
 
-## Description and Realization
+## Description and Realization of Computation Graph
 
-At compile time, the Python program generates protobuf message representation of the graph, or the description of the graph.
+At compile time, the Python program generates a protobuf message representation of the graph, or a description of the graph.
 
-At runtime, the C++ program realizes the graph and run it.
+At runtime, the C++ program realizes the graph and runs it.
 
 | | Representation (protobuf messages) | Realization (C++ class objects) |
 |---|---|---|
@@ -42,30 +42,31 @@ At runtime, the C++ program realizes the graph and run it.
 |Operation|[OpDesc](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L35)|[Operator](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L64)|
 |Block|BlockDesc|Block|
 
-The word *graph* is exchangable with *block* in this document.  A graph represent computation steps and local variables as a C++/Java program block, or a pair of { and }.
+The word *graph* is interchangeable with *block* in this document.  A graph consists of computation steps and local variables, similar to a C++/Java program block, or a pair of curly braces (`{` and `}`).
 
 ## Compilation and Execution
 
-1. Run an applicaton Python program to describe the graph.  In particular,
+1. Run a Python program to describe the graph.  In particular, the Python application program does the following:
 
-   1. create VarDesc to represent local/intermediate variables,
-   1. create operators and set attributes,
-   1. validate attribute values,
-   1. inference the type and the shape of variables,
-   1. plan for memory-reuse for variables,
-   1. generate backward and optimization part of the Graph.
-   1. possiblly split the graph for distributed training.
+   1. Create `VarDesc` to represent local/intermediate variables,
+   1. Create operators and set attributes,
+   1. Validate attribute values,
+   1. Infer the type and the shape of variables,
+   1. Plan memory-reuse for variables,
+   1. Generate the backward graph
+   1. Add optimization operators to the computation graph.
+   1. Optionally, split the graph for distributed training.
 
-1. The invocation of `train` or `infer` in the application Python program:
+1. The invocation of `train` or [`infer`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/inference.py#L108) methods in the Python program does the following:
 
-   1. create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
+   1. Create a new Scope instance in the [scope hierarchy](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/scope.md) for each run of a block,
       1. realize local variables defined in the BlockDesc message in the new scope,
       1. a scope is similar to the stack frame in programming languages,
 
-   1. create an instance of class `Block`, in which,
+   1. Create an instance of class `Block`, in which,
       1. realize operators in the BlockDesc message,
 
-   1. run the Block by calling
+   1. Run the Block by calling
       1. `Block::Eval(vector<Variable>* targets)` for forward and backward computations, or
       1. `Block::Eval(vector<Operator>* targets)` for optimization.
 
@@ -76,14 +77,14 @@ The word *graph* is exchangable with *block* in this document.  A graph represen
 Compile Time -> IR -> Runtime
 ```
 
-### Benefit
+### Benefits of IR
 
 - Optimization
   ```text
   Compile Time -> IR -> Optimized IR -> Runtime
   ```
-- Send automatically partitioned IR to different nodes.
-  - Automatic data parallel
+- Automatically send partitioned IR to different nodes.
+  - Automatic Data Parallelism
     ```text
     Compile Time
     |-> Single GPU IR
@@ -92,7 +93,7 @@ Compile Time -> IR -> Runtime
             |-> Node-1 (runs trainer-IR-1)
             |-> Node-2 (runs pserver-IR)
     ```
-  - Automatic model parallel (planned for future)
+  - Automatic Model Parallelism (planned for future)
 
 ---
 
@@ -105,10 +106,10 @@ Compile Time -> IR -> Runtime
 # Operator
 ![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
 
-* `Operator` is the fundamental building block as the user interface.
-    * Operator stores input/output variable name, and attributes.
-    * The `InferShape` interface is used to infer output variable shapes by its input shapes.
-    * Use `Run` to compute `input variables` to `output variables`.
+* `Operator` is the fundamental building block of the user interface.
+    * Operator stores input/output variable names and attributes.
+    * The `InferShape` interface is used to infer the shape of the output variables based on the shapes of the input variables.
+    * Use `Run` to compute the `output` variables from the `input` variables.
 
 ---
 
@@ -126,30 +127,29 @@ Compile Time -> IR -> Runtime
 # Why separate Kernel and Operator
 
 * Separate GPU and CPU code.
-    * Make Paddle can run without GPU.
-* Make one operator (which is user interface) can contain many implementations.
-    * Same mul op, different FP16, FP32 Kernel. different MKL, eigen kernel.
+    * Make Paddle capable of running without GPU.
+* Allow one operator (which is a user interface) to have many implementations.
+    * For example, the same multiplication op can have different kernel implementations, such as an FP16 kernel, an FP32 kernel, an MKL kernel, or an Eigen kernel.
 ---
 
 # Libraries for Kernel development
 
 * `Eigen::Tensor` contains basic math and element-wise functions.
     * Note that `Eigen::Tensor` has broadcast implementation.
-    * Limit number of `tensor.device(dev) = ` in your code.
-* `thrust::tranform` and `std::transform`.
-    * `thrust` has the same API as C++ standard library. Using `transform` can quickly implement a customized elementwise kernel.
-    * `thrust` has more complex API, like `scan`, `reduce`, `reduce_by_key`.
+    * Limit the number of `tensor.device(dev) = ` in your code.
+* `thrust::transform` and `std::transform`.
+    * `thrust` has the same API as C++ standard library. Using `transform`, one can quickly implement customized element-wise kernels.
+    * `thrust`, in addition, supports more complex APIs, like `scan`, `reduce`, `reduce_by_key`.
 * Hand-writing `GPUKernel` and `CPU` code
-    * Do not write `.h`. CPU Kernel should be in `.cc`. GPU kernel should be in `.cu`. (`GCC` cannot compile GPU code.)
+    * Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
 ---
-# Operator Register
+# Operator Registration
 
-## Why register is necessary?
+## Why is registration necessary?
 We need a method to build mappings between Op type names and Op classes.
 
-## How to do the register?
-
-Maintain a map, whose key is the type name and value is corresponding Op constructor.
+## How is registration implemented?
+By maintaining a map whose key is the type name and whose value is the corresponding Op constructor.
 
 ---
 # The Registry Map
@@ -169,7 +169,7 @@ Maintain a map, whose key is the type name and value is corresponding Op constru
 # Related Concepts
 
 ### Op_Maker
-It's constructor takes `proto` and `checker`. They are compeleted during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
+Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
 
 ### Register Macros
 ```cpp
@@ -177,34 +177,30 @@ REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type, grad_op_class)
 REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
 ```
 
-### `USE` Macros
-make sure the registration process is executed and linked.
-
 ---
-# Register Process
-1. Write Op class, as well as its gradient Op class if there is.
-2. Write Op maker class. In the constructor, describe its inputs, outputs, and attributes.
-3. Invoke macro `REGISTER_OP`. The macro will
-	1. call maker class to complete `proto` and `checker`
-	2. with the completed `proto` and `checker`, build a new key-value pair in the `OpInfoMap`
-
-4. Invoke `USE` macro in where the Op is used to make sure it is linked.
+# Registration Process
+1. Write an Op class and its gradient Op class, if required.
+2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
+3. Invoke the macro `REGISTER_OP`. This macro will
+	1. Call maker class to complete `proto` and `checker`
+	2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
 
 ---
 # Backward Module (1/2)
 ### Create Backward Operator
-- Mapping from forwarding Op to backward Op
+- Mapping from forward Op to backward Op
 ![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
 
 ---
 # Backward Module (2/2)
 ### Build Backward Network
-- **Input** graph of forwarding operators
-- **Output** graph of backward operators
-- **corner case in construction**
-	- shared variable => insert `Add` operator
-	- no gradient => insert `fill_zero_grad` operator
-	- recursive netOp => call `Backward` recursively
+- **Input**: a graph of forward operators
+- **Output**: a graph of backward operators
+- **Corner cases in construction**
+	- Shared Variables => insert an `Add` operator to combine gradients
+	- No Gradient => insert a `fill_zero_grad` operator
+	- Recursive NetOp => call `Backward` recursively
+	- RNN Op => recursively call `Backward` on stepnet
 	- RNN Op => recursively call `Backward` on stepnet
 
 
@@ -213,41 +209,41 @@ make sure the registration process is executed and linked.
 
 * `Tensor` is an n-dimension array with type.
 	* Only dims and data pointers are stored in `Tensor`.
-	* All operators on `Tensor` is written in `Operator` or global functions.
-	* variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
-* `Variable` is the inputs and outputs of an operator. Not just `Tensor`.
-	* step_scopes in RNN is a variable and not a tensor.
-* `Scope` is where variables store at.
-	* map<string/*var name */, Variable>
-	* `Scope` has a hierarchical structure. The local scope can get variable from its parent scope.
+	* All operations on `Tensor` are written in `Operator` or global functions.
+	* Variable length Tensor design [LoDTensor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor.md)
+* `Variable` instances are the inputs and the outputs of an operator, not just `Tensor`.
+	* `step_scopes` in RNN is a variable and not a tensor.
+* `Scope` is where variables are stored.
+	* map<string `var name`, Variable>
+	* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
 
 ---
 # Block (in design)
-## the difference with original RNNOp
-- as an operator is more intuitive than `RNNOp`,
-- offers new interface `Eval(targets)` to deduce the minimal block to `Run`,
-- fits the compile-time/ runtime separation design.
-  - during the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serialize to a `BlockDesc`
-  - when graph executes, a Block with `BlockDesc` passed in creates `Op` and `Var` then `Run`
+## the difference between original RNNOp and Block
+- As an operator is more intuitive than `RNNOp`,
+- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
+- Fits the compile-time/ runtime separation design paradigm.
+  - During the compilation, `SymbolTable` stores `VarDesc`s and `OpDesc`s and serializes them to a `BlockDesc`
+  - When graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
 
 ---
 # Milestone
-- take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
-- model migration
-  - framework development gives **priority support** to model migration, for example,
+- Take Paddle/books as the main line, the requirement of the models motivates framework refactoring,
+- Model migration
+  - Framework development gives **priority support** to model migration, for example,
     - the MNIST demo needs a Python interface,
     - the RNN models require the framework to support `LoDTensor`.
-  - determine some timelines,
-  - heavily-relied Ops need to be migrated first,
-  - different models can be migrated parallelly.
-- improve the framework at the same time
-- accept imperfection, concentrated on solving the specific problem at the right price.
+  - Determine some timelines,
+  - Frequently used Ops need to be migrated first,
+  - Different models can be migrated in parallel.
+- Improve the framework at the same time
+- Accept imperfection, concentrate on solving the specific problem at the right price.
 
 ---
 # Control the migration quality
-- compare the performance of migrated models with old ones.
-- follow google C style
-- build the automatic workflow of generating Python/C++ documentations
-  - the documentation of layers and ops should be written inside the code
-  - take the documentation quality into account when doing PR
-  - preview the documentations, read and improve them from users' perspective
+- Compare the performance of migrated models with old ones.
+- Follow the Google C++ style guide.
+- Build the automatic workflow of generating Python/C++ documentations.
+  - The documentation of layers and ops should be written inside the code.
+  - Take the documentation quality into account when submitting pull requests.
+  - Preview the documentations, read and improve them from a user's perspective.
diff --git a/doc/design/register_grad_op.md b/doc/design/register_grad_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..8d973eb53178c3e889c845144553a453e11f067c
--- /dev/null
+++ b/doc/design/register_grad_op.md
@@ -0,0 +1,92 @@
+# Design Doc: Gradient Operators Registration
+
+
+## The Problem Posed
+
+Currently, for each C++ operator class definition, a *gradient operator creator* function is registered, which takes as input a C++ operator instance and returns the corresponding gradient operator instance.
+
+However, we noticed two problems with the current design:
+
+1. As we decided to separate the *compilation* and the *execution* phases, we need to change the creator to take an `OpDesc` protobuf message in a `ProgramDesc` and insert the corresponding `OpDesc` messages into the `ProgramDesc` message.
+
+1. For some operators, the gradient computation can be written in terms of existing operators.  For example, the gradient of *minus* operator consists of two operators -- an *identity* operator followed by a *scale* operator.  Hence the registration mechanism needs to support mapping from an operator to a set of operators for the gradient computation.
+
+## The Current Implementation
+
+Instances of the C++ class `OpInfo` are stored in an associative map whose key is the operator type. The `grad_op_type_` field indicates the associated gradient operator type. An operator can create the gradient operator by invoking `OpInfo::creator_` of the gradient operator. The pseudo code is as follows:
+
+```cpp
+struct OpInfo {
+  std::function<OperatorBase*(...)> creator_;
+  std::string grad_op_type_;
+  ...
+};
+
+map<string, OpInfo> OpInfoMap;
+
+OperatorBase* CreateGradientOperator(const OperatorBase& op) {
+  return OpInfoMap.at(op.Type()).creator_(...);
+}
+```
+
+## Proposed Solution
+
+The mapping relationship between an operator and its gradient operators is a function. The interface of this function is:
+
+```cpp
+// (OpDesc) --> vector<OpDesc>
+std::function<std::vector<OpDescBind>(const OpDescBind&)>;
+```
+
+The function takes an `OpDescBind` of the forward operator and returns one or many gradient operator descriptions. `OpDescBind` is a C++ wrapper for  the protobuf message `OpDesc` for rapid manipulation of `OpDesc`.
+
+The `GradOpDescMaker` will be registered in `OpInfo` and will replace the `grad_op_type_` field. The `OpInfo` should look like 
+
+```cpp
+struct OpInfo {
+  std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>  grad_op_maker_;
+  ...
+};
+```
+
+The `grad_op_maker_` is a `nullptr` if the operator does not have any associated gradient operators.
+
+We propose a base class called `GradOpDescMakerBase` to let operator developers generate `Gradient Operators` easily. The public interface of that class is
+
+```cpp
+class GradOpDescMakerBase {
+public:
+  GradOpDescMakerBase(const OpDescBind& );
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()()const = 0;
+};
+```
+
+We can convert `GradOpDescMakerBase` to `std::function<std::vector<std::unique_ptr<OpDescBind>>(const OpDescBind&)>` by
+
+```cpp
+using GradOpMaker = ...;
+std::function<std::vector<OpDescBind>(const OpDescBind&)> func;
+func = [] (const OpDescBind& fwd_op) {
+  GradOpMaker maker(fwd_op);
+  return maker();
+};
+```
+
+We can write many helper functions since the `GradOpDescMakerBase` is a class now. The basic helper functions get the variables of `Input`, `Output`, `InputGradient` and `OutputGradient` in the forward operator.
+
+We should change the register macros at the same time. In the current solution, there is no difference between forward operators and backward operators, so `REGISTER_OP` just registers one operator. If `REGISTER_OPERATOR` is given an `OpProtoAndCheckerMaker` and a `GradOpDescMaker`, we just list them in the same macro. This can be done with a macro that uses `__VA_ARGS__`.
+
+The user interface should be
+
+```cpp
+vector<OpDesc> MinusOpGradMaker(OpDesc) {...}
+REGISTER_OPERATOR(minus, MinusOp, MinusOpProtoAndCheckerMaker, MinusOpGradMaker);
+// Developers can still manually implement gradient operator.
+REGISTER_OPERATOR(minus_grad, MinusGradOp);
+```
+
+The interface of the current `REGISTER_OP` macro does not need to change. Internally, `REGISTER_OP` will invoke `REGISTER_OPERATOR` twice and generate the `GradOpDescMaker`.
+
+```cpp
+REGISTER_OP(minus, MinusOp, MinusOpProtoAndCheckerMaker, minus_grad, MinusGradOp);
+```
diff --git a/doc/design/regularization.md b/doc/design/regularization.md
new file mode 100644
index 0000000000000000000000000000000000000000..703a9fbdd4392aa7f44733cce2da19caa1b51e4a
--- /dev/null
+++ b/doc/design/regularization.md
@@ -0,0 +1,103 @@
+# Regularization in PaddlePaddle
+
+## Introduction to Regularization
+A central problem in machine learning is how to design an algorithm that will perform well not just on the training data, but also on new data. Many strategies are used by machine learning practitioners to reduce the test error, possibly at the expense of increased training error. These strategies are collectively known as **regularization**. 
+
+### Parameter Norm Penalties
+Most common regularization approaches in deep learning are based on limiting the capacity of the models by adding a parameter norm penalty to the objective function `J`. This is given as follows:
+
+<img src="./images/loss_equation.png" align="center"/><br/>
+
+The parameter `alpha` is a hyperparameter that weights the relative contribution of the norm penalty term, `omega`, relative to the standard objective function `J`.
+
+The most commonly used norm penalties are the L2 norm penalty and the L1 norm penalty. These are given as follows:
+
+##### L2 Regularization:
+<img src="./images/l2_regularization.png" align="center"/><br/>
+
+##### L1 Regularization
+<img src="./images/l1_regularization.png" align="center"/><br/>
+
+A much more detailed mathematical background of regularization can be found [here](http://www.deeplearningbook.org/contents/regularization.html).
+
+
+## How to do Regularization in PaddlePaddle
+
+On surveying existing frameworks like Tensorflow, PyTorch, Caffe, etc, it can be seen that there are 2 common approaches of doing regularization:
+
+1. Making regularization a part of the optimizer using an attribute like `weight_decay` that is used to control the scale of the L2 Penalty. This approach is used in PyTorch as follows:
+	```python
+	opt =  torch.optim.SGD(params, lr=0.2, weight_decay=0.2)
+	```
+    At every optimization step, this code will add the gradient of the L2 Norm of the params to the gradient of the params with respect to the loss function. This can be seen in the following code snippet:
+    ```python
+    if weight_decay != 0:
+    	d_p.add_(weight_decay, p.data)
+    ```
+    This is a very restrictive way of doing regularization and does not give the users enough flexibility.
+    
+    **Advantages**:
+    -  It is easy to implement for us.
+    -  Faster execution of backward. However, it can be done manually by advanced users too.
+
+	**Disadvantages**:
+    - Not flexible for other regularizations such as L1/L0 regularization.
+    - Does not allow for different regularization coefficients for different parameters. For example, in most models, only the weight matrices are regularized and the bias vectors are unregularized.
+    - Tightly coupled optimizer and regularization implementation. 
+
+
+2. Adding regularization ops to the graph through Python API. This approach is used by Tensorflow and Caffe. Using this approach, we manually add regularization ops to the graph and then add the regularization loss to the final loss function before sending them to the optimizer.
+
+	**Advantages**:
+    - Allows for greater flexibility to the users of Paddle. Using this approach, the users can put different regularization to different parameters and also choose parameters that are not a part of regularization.
+    - Makes it easy for the users to customize and extend the framework. 
+
+	**Disadvantages**:
+    - Implementation requires comprehensive design and time. 
+
+## Proposal for Regularization in PaddlePaddle
+
+### Low-Level implementation
+
+In the new design, we propose to create new operations for regularization. For now, we can add 2 ops that correspond to the most frequently used regularizations:
+- L2_regularization_op
+- L1_regularization_op
+
+These ops can be like any other ops with their own CPU/GPU implementations either using Eigen or separate CPU and GPU kernels. As the initial implementation, we can implement their kernels using Eigen following the abstraction pattern implemented for [Activation Ops](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/accuracy_op.h). This abstraction pattern can make it very easy to implement new regularization schemes other than L1 and L2 norm penalties.
+
+The idea of building ops for regularization is in sync with the refactored Paddle philosophy of using operators to represent any computation unit. The way these ops will be added to the computation graph, will be decided by the [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) in Python API. 
+
+### Computation Graph
+
+Below is an example of a really simple feed forward neural network.
+
+<img src="./images/feed_forward.png" align="center"/><br/>
+
+The Python API will modify this computation graph to add regularization operators. The modified computation graph will look as follows:
+
+<img src="./images/feed_forward_regularized.png" align="center"/><br/>
+   
+### Python API implementation for Regularization
+
+Using the low level ops, `L2_regularization_op` and `L1_regularization_op`, any user can add regularization to their computation graphs. However, this will require a lot of lines of code and we should design Python APIs that support regularization. An example of such an API can be seen in [Keras](https://keras.io/regularizers/). As per the PaddlePaddle [Python API design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md), the layer functions are responsible for creating operators, operator parameters and variables. Since regularization is a property of parameters, it makes sense to create these in the layer functions. 
+
+#### Creation of Regularization ops
+There are two possibilities for creating the regularization ops:
+1. We create these ops immediately while building the computation graph. 
+2. We add these ops in a lazy manner, just before the backward, similar to the way the optimization ops are added. 
+
+The proposal is to add these ops in a lazy manner just before the backward pass. 
+
+#### Storage of Regularization attributes
+
+Since we want to create the regularization ops in a lazy manner, the regularization attributes (type of regularization and weight of regularization penalty) can be stored as attributes of the [`Parameter`](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/framework/framework.py#L421) class. This is because regularization is a property of the parameters and storing regularization properties with Parameters also allows for shared parameters. 
+
+#### High-level API
+
+In PaddlePaddle Python API, users will primarily rely on [layer functions](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/python_api.md#layer-function) to create neural network layers. Hence, we also need to provide regularization functionality in layer functions. The design of these APIs can be postponed for now. A good reference for these APIs can be found in [Keras](https://keras.io/regularizers/) and also by looking at Tensorflow in [`tf.contrib.layers`](https://www.tensorflow.org/api_guides/python/contrib.layers).
+
+
+
+
+
+    
diff --git a/doc/design/scope.md b/doc/design/scope.md
index b1f9bb4378eb5ec6926f1e53f7c1f4fd5674064c..4da76eebb74abcd26ec2b8671399e6bc4fb58574 100644
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -37,7 +37,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 ```cpp
 class Scope {
  public:
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
   const Variable* FindVar(const std::string& name) const;
 
  private:
@@ -98,7 +98,7 @@ class Scope {
   Variable* FindVar(const std::string& name) const;
 
   // return if already contains same name variable.
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
 
  private:
   std::shared_ptr<Scope> parent_;
@@ -107,7 +107,7 @@ class Scope {
 ```
 ## Only scope can create a variable
 
-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `Var` can construct `Variable`.
 
 ## When scope destroyed, all variables inside this scope should be destroyed together
 
@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
 
 ## Orthogonal interface
 
-`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `Var` will return an `Error` when there is a name conflict locally. Combine `FindVar` and `Var`, we can implement `Var` easily.
diff --git a/doc/design/selected_rows.md b/doc/design/selected_rows.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a98839a957612b91b2276b58818623ecc62d1d5
--- /dev/null
+++ b/doc/design/selected_rows.md
@@ -0,0 +1,74 @@
+# Design Doc: Selected Rows
+
+`SelectedRows` is a sparse tensor data type, which is designed to support `embedding` operators. The gradient of an embedding table is a sparse tensor: only a few rows of this tensor have non-zero values. It is straightforward to represent such a sparse tensor by the following data structure:
+
+```cpp
+class SelectedRows {
+ private:
+  vector<int> rows_;
+  Tensor value_;
+  int height_;
+};
+```
+
+The field `height_` is the first dimension of `SelectedRows`. The `rows_` field holds the indices of the non-zero rows of `SelectedRows`. The `value_` field is an N-dim tensor of shape `[rows_.size() /* NUM_ROWS */, ...]`, which supplies values for each row. The dimension of `SelectedRows` satisfies `[height_] + value_.shape[1:]`.
+
+Suppose that a SelectedRows-typed variable `x` has many rows, but only two of them have values -- row 73 is `[1, 2]` and row 84 is `[3, 4]`, the `SelectedRows` representation would be:
+
+```
+x = SelectedRow {
+  rows = [73, 84],
+  value = [[1, 2], [3,4]]
+}
+```
+
+
+## SelectedRows in Protobuf
+
+`SelectedRows` is a type of `Variable`. `VarDesc` in protobuf should describe the `SelectedRows` information. Only the tensor dimension of a `SelectedRows` will be described in compile-time because the `rows_` and `value_` are dependent on the training data. 
+So we use `TensorDesc` to unify `data_type` and `dims`. A LodTensorDesc contains a `TensorDesc` and `lod_level`. The description of `SelectedRows` is a Tensor description.
+
+```proto
+message TensorDesc {
+  required DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+}
+
+message LodTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int lod_level = 2;
+}
+
+message VarDesc {
+  required string name = 1;
+  enum VarType { 
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LodTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
+}
+```
+
+## InferShape for Selected Rows
+
+Just like `LoD` information, `InferShape` method will infer the output tensor type as well. The operator should decide whether its output is a `SelectedRows` or `Dense` tensor.
+
+For example, the gradient operator of `TableLookup` will always generate `SelectedRows`. Its `InferShape` method should look like the following:
+
+```cpp
+void TableLookupGrad::InferShape(context) {
+  ...
+  context.SetDataType("Embedding.Grad", kSelectedRows);
+}
+```
+
+
+## Sparse Operators
+
+There are several operators that need to be written to support `SelectedRows`. These are:
+
+1. Operators which generate `SelectedRows` gradient. e.g. Gradient of `TableLookupOp`.
+2. Optimize operators which support `SelectedRows` gradient. e.g. `SGD` or `AdaGrad` for `SelectedRows`. However, there should be only one `SGD` operator. `OpWithKernel::Run` should select a suitable kernel for either a `dense` tensor or `SelectedRows`.
diff --git a/doc/design/tensor_array.md b/doc/design/tensor_array.md
index a0419ec002159893b035fae1300fce489e68936a..37e4f7b90f94fa3eb015e733999cd84c96b2239c 100644
--- a/doc/design/tensor_array.md
+++ b/doc/design/tensor_array.md
@@ -1,39 +1,250 @@
 # Design for TensorArray
+This design doc presents the necessity of a new C++ class `TensorArray`.
+In addition to the very simple C++ implementation
+
+```c++
+class TensorArray {
+ public:
+  explicit TensorArray(const LoDTensor&);
+  explicit TensorArray(size_t size);
+
+ private:
+  vector<LoDTensor> values_;
+};
+```
+
+We also need to expose it to PaddlePaddle's Python API,
+because users would want to use it with our very flexible operators `WhileLoop`.
+An example for a RNN based on dynamic operators is 
+
+```python
+input = pd.data(...)
+num_steps = Var(12)
+
+TensorArray states(size=num_steps)
+TensorArray step_inputs(unstack_from=input)
+TensorArray step_outputs(size=num_steps)
+
+W = Tensor(...)
+U = Tensor(...)
+default_state = some_op()
+
+step = Var(1)
+
+wloop = paddle.create_whileloop(loop_vars=[step])
+with wloop.frame():
+    wloop.break_if(pd.equal(step, num_steps))
+    pre_state = states.read(step-1, default_state)
+    step_input = step_inputs.read(step)
+    state = pd.sigmoid(pd.matmul(U, pre_state) + pd.matmul(W, step_input))
+    states.write(step, state)
+    step_outputs.write(step, state) # output state
+    step.update(step+1)
+
+output = step_outputs.stack()
+```
+
+## Background
+Steps are one of the core concepts of RNN. In each time step of RNN, there should be several input segments, states, and output segments; all these components act like arrays, for example, calling `states[step_id]` will get the state in the `step_id`th time step.
+
+An RNN can be implemented with the following pseudocode
+
+```c++
+Array states;
+Array input_segments;
+Array output_segments;
+Parameter W, U;
+
+step = 1
+seq_len = 12
+while_loop {
+   if (step == seq_len) break;
+    states[step] = sigmoid(W * states[step-1] + U * input_segments[step]);
+    output_segments[step] = states[step] // take state as output
+   step++;
+}
+```
+According to the [RNN roadmap](https://github.com/PaddlePaddle/Paddle/issues/4561), there are several different RNNs that PaddlePaddle will eventually support.
+
+Currently, the basic RNN implementation supported by PaddlePaddle is the `recurrent_op` which takes tensors as input and splits them into `input_segments`.
+
+
+Since a tensor cannot store variable-length sequences directly, PaddlePaddle implements the tensor with level of details (`LoDTensor` for short).
+Segmenting the `LoDTensor` is much more complicated than splitting a tensor, which makes it necessary to refactor the `recurrent_op` with `LoDTensor` segmenting support.
+
+As the next step in RNN support, `dynamic_recurrent_op` should be introduced to handle inputs with variable-length sequences.
+
+The implementation is similar to `recurrent_op`. 
+The key difference is the way **the original input `LoDTensors` and outputs are split to get the `input_segments` and the `output_segments`.**
+
+
+Though it can't be built over `recurrent_op` or `dynamic_recurrent_op` directly,
+the logic behind splitting a tensor or a LoD tensor into `input_segments` remains the same.
+
+## Why `TensorArray`
+The logic behind splitting the inputs into segments, states and outputs is similar and can be shared in a separate module.
+
+The array of `states`, `input_segments` and `output_segments` would be exposed to users when writing a dynamic RNN model similar to the above pseudo codes. 
+
+So there should be an array-like container, which can store the segments of a tensor or LoD tensor.
+
+**This container can store an array of tensors and provides several methods to split a tensor or a LoD tensor** .
+This is where the notion of `TensorArray` comes from.
+
+## Introduce TensorArray to unify all the three RNNs
 TensorArray as a new concept is borrowed from TensorFlow, 
 it is meant to be used with dynamic iteration primitives such as `while_loop` and `map_fn`.
 
 This concept can be used to support our new design of dynamic operations, and help to refactor some existing variant-sentence-related layers, 
-such as `RecurrentGradientMachine`.
+such as `recurrent_op`, `RecurrentGradientMachine`.
 
 In [our design for dynamic RNN](https://github.com/PaddlePaddle/Paddle/pull/4401), 
 `TensorArray` is used to segment inputs and store states in all time steps.
 By providing some methods similar to a C++ array,
-the definition of some state-based dynamic models such as RNN could be more natural and highly flexible.
-
-## Dynamic-Related Methods
-Some basic methods should be proposed as follows:
-
-### stack()
-Pack the values in a `TensorArray` into a tensor with rank one higher than each tensor in `values`.
-### unstack(axis=0)
-Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
-### concat()
-Return the values in the `TensorArray` as a concatenated Tensor.
-### write(index, value, data_shared=true)
-Write value into index of the TensorArray.
-### read(index)
-Read the value at location `index` in the `TensorArray`.
-### size()
-Return the number of values.
+the definition of some state-based dynamic models such as RNN can be more natural and highly flexible.
+
+## Dynamic-operations on TensorArray
+
+`TensorArray` will be used directly when defining dynamic models, so some operators listed below should be implemented
+
+```python
+# several helper operators for TensorArray
+def tensor_array_stack(ta, tensor):
+    '''
+    get a tensor array `ta`, return a packed `tensor`.
+    '''
+    pass
+
+def tensor_array_unstack(tensor, ta):
+    '''
+    get a `tensor`, unstack it and get a tensor array `ta`.
+    '''
+    pass
+
+def tensor_array_write(ta, index, tensor, data_shared):
+    '''
+    get a `tensor` and a scalar tensor `index`, write `tensor` into index-th
+    value of the tensor array `ta`.
+    `data_shared` is an attribute that specifies whether to copy or reference the tensors.
+    '''
+    pass
+
+def tensor_array_read(ta, index, tensor):
+    '''
+    get a tensor array `ta`, a scalar tensor `index`, read the index-th value of
+    `ta` and return as the `tensor`.
+    '''
+    pass
+
+def tensor_array_size(ta, tensor):
+    '''
+    get a tensor array `ta`, return the size of `ta` and return as the scalar `tensor`.
+    '''
+    pass
+```
+
+It is tedious for users to use so many low-level operators, so some helper methods should be proposed in the Python wrapper to make `TensorArray` easier to use,
+for example
+
+```python
+class TensorArray:
+    def __init__(self, name):
+        self.name = name
+        self.desc = TensorArrayDesc()
+
+    def stack(self, name=None):
+        '''
+        Pack the values in a `TensorArray` into a tensor with rank one higher
+        than each tensor in `values`.
+        `stack` can be used to concatenate all the time steps for RNN or whileloop.
+
+        @name: str
+            the name of the variable to output.
+        '''
+        tensor = Var(name)
+        tensor_array_stack(self.name, tensor)
+        return tensor
+
+    def unstack(self, input):
+        '''
+        Unpacks the given dimension of a rank-`R` tensor into rank-`(R-1)` tensors.
+        `unstack` can be used to split tensor into time steps for RNN or whileloop.
+
+        @input: str
+            the name of input tensor
+        '''
+        tensor_array_unstack(input, self.name)
+
+    def write(self, index, value, data_shared=True):
+        '''
+        Write value into index of the TensorArray.
+        If `data_shared` is set to True, then the index-th value in TensorArray will
+        be shared with the tensor passed in.
+
+        @index: str
+            name of a scalar tensor
+        @value: str
+            name of a tensor
+        @data_shared: bool
+        '''
+        tensor_array_write(self.name, index, value, data_shared)
+
+    def read(self, index, output):
+        '''
+        Read the value at location `index` in the `TensorArray`.
+
+        @index: str
+            name of a scalar tensor
+        @output:
+            name of an output variable
+        '''
+        tensor_array_read(self.name, index, output)
+
+
+    def size(self, output):
+        '''
+        Return the number of values.
+
+        @output: str
+            name of a scalar tensor
+        '''
+        tensor_array_size(self.name, output)
+```
 
 ## LoDTensor-related Supports
-The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variant length sequences as input, 
-because each step of RNN could only take a tensor-represented batch of data as input, 
+The `RecurrentGradientMachine` in Paddle serves as a flexible RNN layer; it takes variable-length sequences as input, and outputs sequences too.
+
+Since each step of RNN can only take a tensor-represented batch of data as input, 
 some preprocess should be taken on the inputs such as sorting the sentences by their length in descending order and cut each word and pack to new batches.
 
-Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`.
+Such cut-like operations can be embedded into `TensorArray` as general methods called `unpack` and `pack`,
+these two operations are similar to `stack` and `unstack` except that they operate on variable-length sequences formatted as a LoD tensor rather than a tensor.
+
+Some definitions are like
+
+```python
+def unpack(level, sort_by_length):
+    '''
+    Split LodTensor in some `level` and generate batches, if set `sort_by_length`,
+    will sort by length.
 
-With these two methods, a variant-sentence-RNN can be implemented like
+    Returns:
+        - a new `TensorArray`, whose values are LodTensors and represents batches
+          of data.
+        - an int32 Tensor, which stores the map from the new batch's indices to
+          original LoDTensor
+    '''
+    pass
+
+def pack(level, indices_map):
+    '''
+    Recover the original LoD-arranged LoDTensor with the values in a `TensorArray`
+    and `level` and `indices_map`.
+    '''
+    pass
+```
+
+With these two methods, an RNN that supports variable-length sentences can be implemented like
 
 ```c++
 // input is the varient-length data
@@ -58,16 +269,3 @@ LoDTensor rnn_output = ta.pack(ta, indice_map);
 ```
 the code above shows that by embedding the LoDTensor-related preprocess operations into `TensorArray`,
 the implementation of a RNN that supports varient-length sentences is far more concise than `RecurrentGradientMachine` because the latter mixes all the codes together, hard to read and extend.
-
-
-some details are as follows.
-
-### unpack(level, sort_by_length)
-Split LodTensor in some `level` and generate batches, if set `sort_by_length`, will sort by length.
-
-Returns:
-
-- a new `TensorArray`, whose values are LodTensors and represents batches of data.
-- an int32 Tensor, which stores the map from the new batch's indices to original LoDTensor
-### pack(level, indices_map)
-Recover the original LoD-arranged LoDTensor with the values in a `TensorArray` and `level` and `indices_map`.
diff --git a/doc/design/test.dot b/doc/design/test.dot
new file mode 100644
index 0000000000000000000000000000000000000000..62c69b8fc8010a26a54a6ee8ef1488aad94d747a
--- /dev/null
+++ b/doc/design/test.dot
@@ -0,0 +1,35 @@
+
+digraph Test {
+    z -> generator -> G_img;
+    G_img -> discriminator -> D_f -> d_loss_f;
+    label0 -> d_loss_f -> d_loss;
+
+    img -> discriminator -> D_t -> d_loss_t;
+    label1 -> d_loss_t -> d_loss;
+
+    d_loss -> d_loss_t[color=red, style=dashed];
+    d_loss -> d_loss_f[color=red, style=dashed];
+    d_loss_t -> D_t[color=red, style=dashed];
+    d_loss_f -> D_f[color=red, style=dashed];
+    D_t -> discriminator[color=red, style=dashed];
+    D_f -> discriminator[color=red, style=dashed];
+
+    D_f -> g_loss;
+    label2 -> g_loss;
+
+    g_loss -> D_f[color=green, style=dashed];
+    D_f -> discriminator[color=green, style=dashed];
+    discriminator -> G_img[color=green, style=dashed];
+    G_img -> generator[color=green, style=dashed];
+
+    discriminator [color=red, shape=box];
+    generator [color=green, shape=box];
+    z [shape=diamond];
+    img [shape=diamond];
+    label0 [shape=diamond];
+    label1 [shape=diamond];
+    label2 [shape=diamond];
+
+    d_loss [color=red];
+    g_loss [color=green];
+}
diff --git a/doc/design/test.dot.png b/doc/design/test.dot.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e121a40b9f7b2232d7cdda315bad15926446f55
Binary files /dev/null and b/doc/design/test.dot.png differ
diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md
index bfbbdd0578ebc69ea4b49ade9b041573a9e9ad55..0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f 100644
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
@@ -16,16 +16,23 @@ The computation graph is constructed by Data Node and Operation Node. The concep
 
 ## Definition of VarDesc
 
-A VarDesc should have a name and value, in PaddlePaddle, the value will always be a tensor. Since we use LoDTensor most of the time. We add a LoDTesnorDesc to represent it.
+A VarDesc should have a name and a value. There are two kinds of variable types at compile time: `LoDTensor` and `SelectedRows`.
 
 ```proto
 message VarDesc {
   required string name = 1;
-  optional LoDTensorDesc lod_tensor = 2;
+  enum VarType {
+    LOD_TENSOR = 0;
+    SELECTED_ROWS = 1;
+  }
+  required VarType type = 2;
+  optional LoDTensorDesc lod_desc = 3;
+  optional TensorDesc selected_rows_desc = 4;
+  optional bool persistable = 5 [ default = false ];
 }
 ```
 
-## Definition of LodTensorDesc
+## Definition of TensorDesc
 
 ```proto
 enum DataType {
@@ -38,87 +45,25 @@ enum DataType {
   FP64 = 6;
 }
 
-message LoDTensorDesc {
+message TensorDesc {
   required DataType data_type = 1;
-  repeated int32 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  optional int32 lod_level = 3 [default=0];
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
 }
 ```
 
-## Definition of Variable in Python
-
-In Python API, layer will take Variable as Input, and return Variable as Output. There should be a class `Variable` in python to help create and manage Variable.
-
-```python
-image = Variable(dims=[-1, 640, 480])
-# fc1 and fc2 are both Variable
-fc1 = layer.fc(input=image, output_size=10)
-fc2 = layer.fc(input=fc1, output_size=20)
-```
-### what should class `Variable` Have
-1. `name`.a name of string type is used to mark the value of the Variable.
-1. `initializer`. Since our Tensor does not have value. we will always use some Operator to fullfill it when run. So we should have a initialize method to help add the init operator.
-1. `operator`. Variable should record which operator produce itself. The reaon is:
-  - we use pd.eval(targets=[var1, var2]) to run the related ops to get the value of var1 and var2. var.op is used to trace the dependency of the current variable.
-
-In PaddlePaddle, we use Block to describe Computation Graph, so in the code we will use Block but not Graph.
-
-```python
-import VarDesc
-import LoDTensorDesc
-import framework
-
-def AddInitialOperator(variable, initializer):
-	# add an initialize Operator to block to init this Variable
-
-class Variable(object):
-   def __init__(self, name, dims, type, initializer):
-      self._block = get_default_block()
-      self._name = name
-      self.op = None
-
-      tensor_desc = LoDTensorDesc(data_type=type, dims=dims)
-      _var_desc = VarDesc(name=name, lod_tensor=tensor_desc)
-      self._var = framework.CreateVar(_var_desc)
-      self._block.add_var(self)
+A TensorDesc describes `SelectedRows` and `LoDTensor`. For details of `SelectedRows`, please refer to [`SelectedRows`](./selected_rows.md).
 
-      # add initial op according to initializer
-      if initializer is not None:
-          AddInitialOperator(self, initializer)
-
-   def dims(self):
-      return self._var.dims()
-
-   def data_type(self):
-       return self._var.data_type()
+## Definition of LodTensorDesc
 
-   def to_proto(self):
-       pass
+```proto
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2;
+}
 ```
 
-Then we can use this Variable to create a fc layer in Python.
+A LoDTensorDesc contains a tensor and a lod_level.
 
-```python
-import paddle as pd
-
-def flatten_size(X, num_flatten_dims):
-  prod = 1 # of last num_flatten_dims
-  for i in xrange(num_flatten_dims):
-    prod = prod * X.dims[-i-1]
-  return prod
-
-def layer.fc(X, output_size, num_flatten_dims):
-  W = Variable(pd.random_uniform(), type=FP32, dims=[flatten_size(X, num_flatten_dims), output_size])
-  b = Variable(pd.random_uniform(), type=FP32, dims=[output_size])
-  out = Variable(type=FP32)
-  y = operator.fc(X, W, b, output=out) # fc will put fc op input into out
-  pd.InferShape(y)
-  return out
-
-x = Variable(dims=[-1, 640, 480])
-y = layer.fc(x, output_size=100)
-z = layer.fc(y, output_size=200)
+## Definition of Variable in Python
 
-paddle.eval(targets=[z], ...)
-print(z)
-```
+For Variable in Python, please refer to [`Python API`](./python_api.md).
diff --git a/doc/faq/local/index_cn.rst b/doc/faq/local/index_cn.rst
index 75c4ba028e497e29e9030a86514348726d9c0a80..0e939a2671ace8682c90cdc1c1bb2da1dda0d568 100644
--- a/doc/faq/local/index_cn.rst
+++ b/doc/faq/local/index_cn.rst
@@ -174,7 +174,7 @@ decoder_inputs = paddle.layer.fc(
 1. 两者都是对梯度的截断，但截断时机不同，前者在 :code:`optimzier` 更新网络参数时应用；后者在激活函数反向计算时被调用；
 2. 截断对象不同：前者截断可学习参数的梯度，后者截断回传给前层的梯度;
 
-除此之外，还可以通过减小学习律或者对数据进行归一化处理来解决这类问题。
+除此之外，还可以通过减小学习率或者对数据进行归一化处理来解决这类问题。
 
 5.  如何调用 infer 接口输出多个layer的预测结果
 -----------------------------------------------
diff --git a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
index 90dc84718c9ce1374cda6022de177afeeb60279d..1fc58c37cc9151d5e4d99b939e30c29aa99e04f1 100644
--- a/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
+++ b/doc/howto/cross_compiling/cross_compiling_for_android_cn.md
@@ -1,9 +1,46 @@
 # 构建Android平台上的PaddlePaddle库
 
-用户可通过交叉编译的方式，在用户熟悉的开发平台（Linux，Mac OS X和Windows）上编译Android平台上适用的PaddlePaddle库。
+用户可通过如下两种方式，交叉编译Android平台上适用的PaddlePaddle库：
+- 基于Docker容器的编译方式
+- 基于Linux交叉编译环境的编译方式
+
+## 基于Docker容器的编译方式
+Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行，因此，使用基于Docker容器的编译方式，用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
+
+### 构建PaddlePaddle的Android开发镜像
+我们把PaddlePaddle的交叉编译环境打包成一个镜像，称为开发镜像，里面涵盖了交叉编译Android版PaddlePaddle库需要的所有编译工具。
+
+```bash
+$ git clone https://github.com/PaddlePaddle/Paddle.git
+$ cd Paddle
+$ docker build -t username/paddle-android:dev . -f Dockerfile.android
+```
+
+### 编译PaddlePaddle C-API库
+构建好开发镜像后，即可使用开发镜像来编译Android版PaddlePaddle C-API库。
+Android的Docker开发镜像向用户提供两个可配置的参数：
+
+| Argument        | Optional Values         | Default |
+|-----------------|-------------------------|---------|
+|`ANDROID_ABI`    |`armeabi-v7a, arm64-v8a` | `armeabi-v7a` |
+|`ANDROID_API`    |`>= 21` | `21` |
+
+- 编译`armeabi-v7a`，`Android API 21`的PaddlePaddle库
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
+```
+
+- 编译`arm64-v8a`，`Android API 21`的PaddlePaddle库
+```bash
+$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
+```
+
+执行上述`docker run`命令时，容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置，并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`，`ANDROID_API<21`时，Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节，根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后，PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录，所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+
+## 基于Linux交叉编译环境的编译方式
 本文档将以Linux x86-64平台为例，介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
 
-## 准备交叉编译环境
+### 准备交叉编译环境
 
 从源码交叉编译PaddlePaddle，用户需要提前准备好交叉编译环境。Android平台上使用的C/C++交叉编译工具链为[Android NDK](https://developer.android.com/ndk/downloads/index.html?hl=zh-cn)，用户可自行前往下载预编译好的版本，也可通过以下命令获取：
 
@@ -13,18 +50,27 @@ unzip -q android-ndk-r14b-linux-x86_64.zip
 ```
 
 Android NDK中包含了所有Android API级别、所有架构（arm/arm64/x86/mips）需要用到的编译工具和系统库。用户可根据自己的编译目标架构、所需支持的最低Android API级别，构建[独立工具链](https://developer.android.google.cn/ndk/guides/standalone_toolchain.html?hl=zh-cn)。
-比如：
+
+- 构建`armeabi-v7a`、 `Android API 21`的独立工具链：
 
 ```bash
 your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
-        --arch=arm --platform=android-21 --install-dir=your/path/to/my_standalone_toolchain
+        --arch=arm --platform=android-21 --install-dir=your/path/to/arm_standalone_toolchain
 ```
 
-此命令将在your/path/to/my_standalone_toolchain目录生成一套编译工具链，面向架构为32位ARM架构，支持的最小的Android API级别为21，使用的编译器为arm-linux-androideabi-gcc (GCC) 4.9。
+此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链，面向架构为32位ARM架构，支持的最小的Android API级别为21，支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
 
-注意：**PaddlePaddle要求使用的编译工具链所支持的Andoid API级别不小于21**。
+- 构建`arm64-v8a`、 `Android API 21`的独立工具链：
+```bash
+your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
+        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
+```
 
-## 配置交叉编译参数
+此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链，面向架构为64位ARM64架构，支持的最小Android API级别为21，支持编译器`aarch64-linux-android-gcc (GCC) 4.9`和`clang 3.8`。
+
+注意：**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。
+
+### 配置交叉编译参数
 
 CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置，PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake)，以提供一些默认的编译器和编译参数相关配置。注意，从CMake 3.7版本开始，CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时，将会将用户传进来的配置参数传递CMake系统，交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
 
@@ -36,32 +82,57 @@ CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cm
 Android平台可选配置参数：
 
 - `ANDROID_STANDALONE_TOOLCHAIN`，独立工具链所在的绝对路径，或者相对于构建目录的相对路径。PaddlePaddle的CMake系统将根据该值自动推导和设置需要使用的交叉编译器、sysroot、以及Android API级别；否则，用户需要在cmake时手动设置这些值。无默认值。
-- `ANDROID_ABI`，目标架构ABI。目前只支持`armeabi-v7a`，默认值为`armeabi-v7a`。
+- `ANDROID_TOOLCHAIN`，目标工具链。可设置`gcc/clang`，默认值为`clang`。
+	- CMake 3.7以上，将会始终使用`clang`工具链；CMake 3.7以下，可设置`ANDROID_TOOLCHAIN=gcc`以使用`gcc`工具链。
+	- Android官方提供的`clang`编译器要求系统支持`GLIBC 2.15`以上。
+- `ANDROID_ABI`，目标架构ABI。目前支持`armeabi-v7a`和`arm64-v8a`，默认值为`armeabi-v7a`。
 - `ANDROID_NATIVE_API_LEVEL`，工具链的Android API级别。若没有显式设置，PaddlePaddle将根据`ANDROID_STANDALONE_TOOLCHAIN`的值自动推导得到。
-- `ANROID_ARM_MODE`，是否使用ARM模式。可设置`ON/OFF`，默认值为`ON`。
-- `ANDROID_ARM_NEON`，是否使用NEON指令。目前必须设置成`ON`，默认值为`ON`。
+- `ANDROID_ARM_MODE`，是否使用ARM模式。
+	- `ANDROID_ABI=armeabi-v7a`时，可设置`ON/OFF`，默认值为`ON`；
+	- `ANDROID_ABI=arm64-v8a`时，不需要设置。
+- `ANDROID_ARM_NEON`，是否使用NEON指令。
+	- `ANDROID_ABI=armeabi-v7a`时，可设置`ON/OFF`，默认值为`ON`；
+	- `ANDROID_ABI=arm64-v8a`时，不需要设置。
 
 其他配置参数：
 
+- `USE_EIGEN_FOR_BLAS`，是否使用Eigen库进行矩阵计算。可设置`ON/OFF`，默认值为`OFF`。
 - `HOST_C/CXX_COMPILER`，宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值；若环境变量`CC`没有设置，则设置成`cc`编译器。
 
-一种常用的cmake配置如下：
+常用的cmake配置如下：
 
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=Android \
-      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/my_standalone_toolchain \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
       -DANDROID_ABI=armeabi-v7a \
       -DANDROID_ARM_NEON=ON \
       -DANDROID_ARM_MODE=ON \
+      -DUSE_EIGEN_FOR_BLAS=ON \
       -DCMAKE_INSTALL_PREFIX=your/path/to/install \
       -DWITH_C_API=ON \
       -DWITH_SWIG_PY=OFF \
       ..
 ```
 
+```bash
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
+      -DANDROID_ABI=arm64-v8a \
+      -DUSE_EIGEN_FOR_BLAS=OFF \
+      -DCMAKE_INSTALL_PREFIX=your/path/to/install \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      ..
+```
+
 用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小，可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`；若希望最快的执行速度，则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
 
-## 编译和安装
+**性能TIPS**，为了达到最快的计算速度，在CMake参数配置上，有以下建议：
+- 设置`CMAKE_BUILD_TYPE`为`Release`
+- 使用`clang`编译工具链
+- `armeabi-v7a`时，设置`USE_EIGEN_FOR_BLAS=ON`，使用Eigen进行矩阵计算；`arm64-v8a`时，设置`USE_EIGEN_FOR_BLAS=OFF`，使用OpenBLAS进行矩阵计算
+
+### 编译和安装
 
 CMake配置完成后，执行以下命令，PaddlePaddle将自动下载和编译所有第三方依赖库、编译和安装PaddlePaddle预测库。
 
@@ -72,4 +143,4 @@ make install
 
 注意：如果你曾经在源码目录下编译过其他平台的PaddlePaddle库，请先使用`rm -rf`命令删除`third_party`目录和`build`目录，以确保所有的第三方依赖库和PaddlePaddle代码都是针对新的CMake配置重新编译的。
 
-执行完安装命令后，`your/path/to/install`目录中会包含`include`和`lib`目录，其中`include`中包含C-API的头文件，`lib`中包含一个Android版本的库。自此，PaddlePaddle的已经安装完成，用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中，调用方法见C-API文档。
+执行完安装命令后，`your/path/to/install`目录中会包含`include`、`lib`和`third_party`目录，其中`include`中包含C-API的头文件，`lib`中包含若干个不同Android ABI的PaddlePaddle库，`third_party`中包含所依赖的所有第三方库。自此，PaddlePaddle已经安装完成，用户可将`your/path/to/install`目录下的生成文件用于深度学习相关Android App中，调用方法见C-API文档。
diff --git a/doc/howto/deep_model/rnn/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
index 4d684cf8ad5a8082cf31fb27027119b3d3e700b6..63fa161fafed0f3a8ec8799af21304cbec62d813 100644
--- a/doc/howto/deep_model/rnn/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -21,7 +21,7 @@ wmt14数据的提供文件在 `python/paddle/v2/dataset/wmt14.py <https://github
 
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
 
-.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+.. image:: src/bi_lstm.jpg
       :align: center
 
 一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
@@ -96,7 +96,7 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
 
-.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+.. image:: src/encoder-decoder-attention-model.png
       :align: center
 
 在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
diff --git a/doc/howto/deep_model/rnn/rnn_config_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst
index 2b581290a41005c04cb1d8b6febe57f17d2416d3..f92edd108ff5c10a31b5f181f0f6dcb7a3f119f3 100644
--- a/doc/howto/deep_model/rnn/rnn_config_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -19,7 +19,7 @@ Simple Gated Recurrent Neural Network
 
 Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below.
 
-.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+.. image:: src/bi_lstm.jpg
      :align: center
 
 Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
@@ -78,7 +78,7 @@ Sequence to Sequence Model with Attention
 -----------------------------------------
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
 
-.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+.. image:: src/encoder-decoder-attention-model.png
       :align: center
 
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
diff --git a/doc/tutorials/sentiment_analysis/bi_lstm.jpg b/doc/howto/deep_model/rnn/src/bi_lstm.jpg
similarity index 100%
rename from doc/tutorials/sentiment_analysis/bi_lstm.jpg
rename to doc/howto/deep_model/rnn/src/bi_lstm.jpg
diff --git a/doc/tutorials/text_generation/encoder-decoder-attention-model.png b/doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
similarity index 100%
rename from doc/tutorials/text_generation/encoder-decoder-attention-model.png
rename to doc/howto/deep_model/rnn/src/encoder-decoder-attention-model.png
diff --git a/doc/howto/dev/new_op_cn.md b/doc/howto/dev/new_op_cn.md
index 264b998f50df016da0741d97d4b26f759ee90900..c823d7e9fcd63dd7719ac1403952b03c2d2f03c0 100644
--- a/doc/howto/dev/new_op_cn.md
+++ b/doc/howto/dev/new_op_cn.md
@@ -206,7 +206,7 @@ MulOp(const std::string &type, const framework::VariableNameMap &inputs,
 
     - `REGISTER_OP` ： 注册`ops::MulOp`类，类型名为`mul`，该类的`ProtoMaker`为`ops::MulOpMaker`，注册`ops::MulOpGrad`，类型名为`mul_grad`。
     - `REGISTER_OP_WITHOUT_GRADIENT` ： 用于注册没有反向的Op。
-    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulKernel`类。
+    - `REGISTER_OP_CPU_KERNEL` ：注册`ops::MulKernel`类，并特化模板参数为`paddle::platform::CPUPlace`和`float`类型，同理，注册`ops::MulGradKernel`类。
 
 
 - 在 `.cu`文件中注册GPU Kernel。
@@ -285,41 +285,27 @@ class TestMulGradOp(GradientChecker):
             'Y': np.random.random((84, 100)).astype("float32")
         }
 
-    def test_cpu_gpu_compare(self):
-        self.compare_grad(self.op, self.inputs)
-
-    def test_normal(self):
+    def test_check_grad_normal(self):
         # mul op will enlarge the relative error
-        self.check_grad(
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
 
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 ```
 
 下面解释代码中一些关键的地方:
 
 - 调用`create_op("mul")`创建反向Op对应的前向Op。
-- 调用`compare_grad`函数对比CPU、GPU计算结果。
-- `test_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
-  - 第一个参数`self.op` : 前向Op。
-  - 第二个参数`self.inputs` : 输入词典，词典的Key和`ProtoMaker`定义保持一致。
-  - 第三个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
-  - 第四个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`
-- `test_ignore_x`和`test_ignore_y`分支用来测试只需要计算一个输入梯度的情况。
+- `test_check_grad_normal`中调用`check_grad`使用数值法检测梯度正确性和稳定性。
+  - 第一个参数`["X", "Y"]` : 指定对输入变量`X`、`Y`做梯度检测。
+  - 第二个参数`"Out"` : 指定前向网络最终的输出目标变量`Out`。
+  - 第三个参数`max_relative_error`：指定检测梯度时能容忍的最大错误值。
+- `test_check_grad_ingore_x`和`test_check_grad_ingore_y`分支用来测试只需要计算一个输入梯度的情况。
 
 
 ### 编译和执行单元测试
diff --git a/doc/howto/dev/new_op_en.md b/doc/howto/dev/new_op_en.md
index bad1dbc1de9cc5bd11914fddf397857f0bda7976..1e88e1f5b4df710f1b69f0305d8d8a2921c4249a 100644
--- a/doc/howto/dev/new_op_en.md
+++ b/doc/howto/dev/new_op_en.md
@@ -205,7 +205,7 @@ The definition of its corresponding backward operator, if applicable, is similar
 
     - `REGISTER_OP` registers the `ops::MulOp` class, type named `mul`, its type `ProtoMaker` is `ops::MulOpMaker`, registering `ops::MulOpGrad` as `mul_grad`.
     - `REGISTER_OP_WITHOUT_GRADIENT` registers an operator without gradient.
-    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulKernel`.
+    - `REGISTER_OP_CPU_KERNEL` registers `ops::MulKernel` class and specialized template types `paddle::platform::CPUPlace` and `float`, which also registers `ops::MulGradKernel`.
 
 
 - Registering GPU Kernel in `.cu` files
@@ -293,41 +293,27 @@ class TestMulGradOp(GradientChecker):
             'Y': np.random.random((84, 100)).astype("float32")
         }
 
-    def test_cpu_gpu_compare(self):
-        self.compare_grad(self.op, self.inputs)
-
-    def test_normal(self):
+    def test_check_grad_normal(self):
         # mul op will enlarge the relative error
-        self.check_grad(
-            self.op, self.inputs, ["X", "Y"], "Out", max_relative_error=0.5)
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
 
-    def test_ignore_x(self):
+    def test_check_grad_ingore_x(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["Y"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"X"})
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
 
-    def test_ignore_y(self):
+    def test_check_grad_ingore_y(self):
         self.check_grad(
-            self.op,
-            self.inputs, ["X"],
-            "Out",
-            max_relative_error=0.5,
-            no_grad_set={"Y"})
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
 ```
 
 Some key points in the code above include:
 
 - `create_op("mul")` creates the backward operator's corresponding forward operator.
-- `compare_grad` compares results between utilizing the CPU and the GPU.
 - `test_normal` calls `check_grad` to validate scaling tests' correctness and stability through numeric methods.
-  - The first variable `self.op` denotes the forward operator.
-  - The second variable `self.inputs` denotes the input dictionary, which has its key value identical to its `ProtoMaker` definitions.
-  - The third variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
-  - The fourth variable `"Out"` points to the network's final output target `Out`.
-- `test_ignore_x` and `test_ignore_y`branches test the cases where there is only one scaling input.
+  - The first variable `["X", "Y"]` appoints `X` and `Y` to be scale tested.
+  - The second variable `"Out"` points to the network's final output target `Out`.
+  - The third variable `max_relative_error` specifies the maximum relative error tolerated during scaling tests.
+- `test_check_grad_ingore_x` and `test_check_grad_ingore_y` branches test the cases where there is only one scaling input.
 
 ### Compiling and Running
 
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 274452fbf0c595ad7b4dbeffe85ad9038f12b458..93c5544bcfa911f8bdcdaea39a75b3ab7ef218f8 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -1,135 +1,215 @@
-```eval_rst
-.. _cluster_train:
+# PaddlePaddle分布式训练
+
+* [概述](#概述)
+* [环境准备](#环境准备)
+* [启动参数说明](#启动参数说明)
+  * [启动参数服务器](#启动参数服务器)
+  * [启动计算节点](#启动计算节点)
+  * [准备数据集](#准备数据集)
+  * [准备训练程序](#准备训练程序)
+* [使用分布式计算平台或工具](#使用分布式计算平台或工具)
+  * [使用Fabric启动集群作业](#使用fabric启动集群作业)
+     * [准备一个Linux集群](#准备一个linux集群)
+     * [启动集群作业](#启动集群作业)
+     * [终止集群作业](#终止集群作业)
+     * [检查集群训练结果](#检查集群训练结果)
+     * [检查模型输出](#检查模型输出)
+  * [在OpenMPI集群中提交训练作业](#在openmpi集群中提交训练作业)
+     * [准备OpenMPI集群](#准备openmpi集群)
+     * [启动集群作业](#启动集群作业-1)
+  * [在Kubernetes集群中提交训练作业](#在kubernetes集群中提交训练作业)
+
+# 概述
+本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示：
+
+<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
+
+- 数据分片（Data shard）: 用于训练神经网络的数据，被切分成多个部分，每个部分分别给每个trainer使用。
+- 计算节点（Trainer）: 每个trainer启动后读取切分好的一部分数据，开始神经网络的“前馈”和“后馈”计算，并和参数服务器通信。在完成一定量数据的训练后，上传计算得出的梯度（gradients），然后下载优化更新后的神经网络参数（parameters）。
+- 参数服务器（Parameter server）:每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度，并完成参数优化更新，再将更新后的参数下发到每个计算节点。
+
+这样，通过计算节点和参数服务器的分布式协作，可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降（SGD）和异步随机梯度下降。
+
+在使用同步SGD训练神经网络时，PaddlePaddle使用同步屏障（barrier），使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中，则并不会等待所有trainer提交梯度才更新参数，这样极大地提高了计算的并行性：参数服务器之间不相互依赖，并行地接收梯度和更新参数，参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步，计算节点之间也不会相互依赖，并行地执行模型的训练。可以看出，虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新，在任意时间某一台参数服务器上保存的参数可能比另一台要更新，与同步SGD相比，梯度会有噪声。
+
+# 环境准备
+
+1. 准备您的计算集群。计算集群通常由一组（几台到几千台规模）的Linux服务器组成。服务器之间可以通过局域网（LAN）联通，每台服务器具有集群中唯一的IP地址（或者可被DNS解析的主机名）。集群中的每台计算机通常被称为一个“节点”。
+1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install)的多种安装方式。我们推荐使用[Docker](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)安装方式来快速安装PaddlePaddle。
+
+安装完成之后，执行下面的命令可以查看已经安装的版本（docker安装方式可以进入docker容器执行：`docker run -it paddlepaddle/paddle:[tag] /bin/bash`）：
+```bash
+$ paddle version
+PaddlePaddle 0.10.0, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
 ```
 
-# 运行分布式训练
+下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例，介绍使用PaddlePaddle v2 API完成分布式训练。
 
-在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
+# 启动参数说明
+## 启动参数服务器
+执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+```
 
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
+如果希望可以在后台运行pserver程序，并保存输出到一个日志文件，可以运行：
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+```
 
-## 前提条件
+| 参数  | 是否必选 | 默认值 | 说明 |
+| ------------- | ------------- | ------------- | ------------- |
+| port  | 必选 | 7164 | pserver监听的起始端口，根据ports_num决定<br>总端口个数，从起始端口监听多个端口用于通信  |
+| ports_num  | 必选 | 1 | 监听的端口个数  |
+| ports_num_for_sparse  | 必选 | 1 | 用于稀疏类型参数通信的端口个数  |
+| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+
+## 启动计算节点
+执行以下命令启动使用python编写的trainer程序（文件名为任意文件名，如train.py）
+```bash
+$ python train.py
+```
 
-1. 上述脚本使用 Python 库 [fabric](http://www.fabfile.org/) 来运行 SSH 命令。 我们使用 `pip` 来安装 fabric:
+trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量（https://zh.wikipedia.org/wiki/环境变量 ）或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量，将会优先使用`paddle.init()`中传入的参数。
 
-   ```bash
-   pip install fabric
-   ```
+使用环境变量：
 
-2. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU，需要在 `/usr/local/cuda` 中安装 CUDA; 否则 Paddle 将在运行时报错。
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+```
 
-3. 在 [`cluster_train/conf.py`] 中设置 `ROOT_DIR`， 该 ROOT_DIR 要在所有节点上存在。为了方便起见，我们通常在所有节点上创建一个 Unix 用户 `paddle`，并设置 `ROOT_DIR=/home/paddle`。这样，我们可以将 SSH 公钥写入 `/home/paddle/.ssh/authorized_keys`，以便用户 `paddle` 可以 SSH 到所有节点而不用密码。
+使用参数：
 
-## 准备工作空间
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
+```
 
-我们将放置依赖库、配置等文件的目录视为 *工作空间（workspace）*。
+| 参数  | 是否必选 | 默认 | 说明 |
+| ------------- | ------------- | ------------- | ------------- |
+| use_gpu  | 可选 | False | 是否启用GPU训练 |
+| trainer_count  | 必选 | 1 | 当前训练任务trainer总个数 |
+| port  | 必选 | 7164 | 连接到pserver的端口  |
+| ports_num  | 必选 | 1 | 连接到pserver的端口个数  |
+| ports_num_for_sparse  | 必选 | 1 | 和pserver之间用于稀疏类型参数通信的端口个数  |
+| num_gradient_servers  | 必选 | 1 | 当前训练任务pserver总数 |
+| trainer_id  | 必选 | 0 | 每个trainer的唯一ID，从0开始的整数 |
+| pservers  | 必选 | 127.0.0.1 | 当前训练任务启动的pserver的IP列表，多个IP使用“,”隔开 |
 
-这些 `train/test` 数据应该在启动集群作业之前准备好。 为了满足训练/测试数据放置在工作空间中不同目录的要求，PADDLE 根据在模型配置文件中使用的名为 `train.list/test.list` 的索引文件引用训练/测试数据，所以训练/测试数据也包含 train.list/test.list 两个列表文件。所有本地训练 demo 已经提供了脚本来帮助您创建这两个文件，并且集群作业中的所有节点将在正常情况下处理具有相同逻辑代码的文件。
 
-通常，你可以使用本地训练中的相同模型文件进行集群训练。请记住，在模型文件的 `setting`函数中设置的 `batch_size` 表示在集群作业**每个**节点中的 batch 大小，而不是使用同步 SGD 的总 batch 大小。
+## 准备数据集
 
-以下步骤基于 demo 目录中的 [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation)。
+参考样例数据准备脚本[prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py)，准备训练数据和验证数据集，我们使用paddle.dataset.imikolov数据集，并根据分布式训练并发数（trainer节点个数），在`prepare.py`开头部分指定`SPLIT_COUNT`将数据切分成多份。
 
-你只需完成 demo/recommendation 教程文档到 `Train` 的部分，之后你会得到训练/测试数据和模型配置文件。最后，只需使用 demo/recommendation 作为集群训练的工作空间。
+在线上系统中，通常会使用MapReduce任务的输出结果作为训练数据，这样训练文件的个数会比较多，而且个数并不确定。在trainer中可以使用下面取模的方法为每个trainer分配训练数据文件：
 
-最后，你的工作空间应如下所示：
-```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
+```python
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
 ```
-虽然这些文件并非都需要集群训练，但是也没有必要删除无用的文件。
-
-`trainer_config.py`
-表示模型配置文件。
 
-`train.list` 和 `test.list`
-文件索引。它存储当前节点所有训练/测试数据的所有相对或绝对文件路径。
+示例程序`prepare.py`会把训练集和测试集分别分割成多个文件（例子中为3个，后缀为`-00000`、`-00001`和`-00002`）:
+```
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
+```
 
-`dataprovider.py`
-用于读取训练/测试样本。这与本地训练相同。
+在进行分布式训练时，每个trainer进程需要能够读取属于自己的一份数据。在一些分布式系统中，系统会提供一个分布式存储服务，这样保存在分布式存储中的数据可以被集群中的每个节点读取到。如果不使用分布式存储，则需要手动拷贝属于每个trainer节点的训练数据到对应的节点上。
 
-`data`
-数据目录中的所有文件被 train.list/test.list 引用。
+对于不同的训练任务，训练数据格式和训练程序的`reader()`会大不相同，所以开发者需要根据自己训练任务的实际场景完成训练数据的分割和`reader()`的编写。
 
+## 准备训练程序
 
-## 准备集群作业配置
+我们会为每个训练任务在每个节点上创建一个工作空间（workspace），其中包含了用户的训练程序、程序依赖、挂载或下载的训练数据分片。
 
-以下选项必须在 cluster_train/conf.py 中认真设置
+最后，工作空间应如下所示：
+```
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
+```
 
-`HOSTS`  所有节点运行集群作业的主机名或 IP 。你还可以将用户和 ssh 端口附加到主机名上，例如 root@192.168.100.17:9090。
+- `my_lib.py`：会被`train.py`调用的一些用户定义的库函数，比如PIL库等。
+- `word_dict.pickle`：在`train.py`中会使用到的字典数据文件。
+- `train.py`：训练程序，代码参考[api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py)。***注意：*** 对于本样例代码，在使用不同的分布式计算平台时，您可能需要修改`train.py`开头的部分（如下），以便获得训练数据的位置和获取环境变量配置：
 
-`ROOT_DIR` 用于放置 JOB 工作空间目录的工作空间 ROOT 目录
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
 
-`PADDLE_NIC` 集群通信通道的 NIC(Network Interface Card, 网络接口卡) 接口名称，例如以太网的 eth0，infiniband 的 ib0。
+- `train_data_dir`：包含训练数据的目录，可以是从分布式存储挂载过来的，也可以是在任务启动前下载到本地的。
+- `test_data_dir`：包含测试数据集的目录。
 
-`PADDLE_PORT` 集群通信通道的端口号
+# 使用分布式计算平台或工具
 
-`PADDLE_PORTS_NUM` 用于集群通信通道的端口数。 如果集群节点数量少（少于5〜6个节点），建议将其设置为较大，如2〜8，以获得更好的网络性能。
+PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务，包括：
+- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架，支持大规模集群生产环境的完整集群方案。
+- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
+- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
 
-`PADDLE_PORTS_NUM_FOR_SPARSE` 用于 sparse remote updater 集群通信信道的端口数。如果使用 sparse remote update，则可以像 `PADDLE_PORTS_NUM` 一样设置。
+对于不同的集群平台，会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
 
-`LD_LIBRARY_PATH` 为集群作业设置额外的 LD_LIBRARY_PATH。你可以使用它来设置 CUDA 库的路径。
+在使用分布式计算平台进行训练时，任务被调度在集群中时，分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数，比如节点的ID、IP和任务节点个数等。
 
-默认配置如下：
+## 使用Fabric启动集群作业
 
-```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-工作空间配置
-'''
-
-#工作空间根目录
-ROOT_DIR = "/home/paddle"
-
-'''
-网络配置
-'''
-#pserver NIC
-PADDLE_NIC = "eth0"
-#pserver 端口
-PADDLE_PORT = 7164
-#pserver 端口数
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#集群作业中所有进程的环境设置
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
-```
+### 准备一个Linux集群
+可以在`paddle/scripts/cluster_train_v2/fabric/docker_cluster`目录下，执行`kubectl create -f ssh_servers.yaml`启动一个测试集群，并使用`kubectl get po -o wide`获得这些节点的IP地址。
 
 ### 启动集群作业
-`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为```paddle.py``` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
+
+`paddle.py` 提供了自动化脚本来启动不同节点中的所有 PaddlePaddle 集群进程。默认情况下，所有命令行选项可以设置为 `paddle.py` 命令选项并且 `paddle.py` 将透明、自动地将这些选项应用到 PaddlePaddle 底层进程。
 
 `paddle.py` 为方便作业启动提供了两个独特的命令选项。
 
-`job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 conf.py 中设置的所有节点。  它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
-`job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
+-  `job_dispatch_package`  设为本地 `workspace` 目录，它将被分发到 `conf.py` 中设置的所有节点。它有助于帮助频繁修改和访问工作区文件的用户减少负担，否则频繁的多节点工作空间部署可能会很麻烦。
+-  `job_workspace`  设为已部署的工作空间目录，`paddle.py` 将跳过分发阶段直接启动所有节点的集群作业。它可以帮助减少分发延迟。
 
-`cluster_train/run.sh` 提供了命令样例来运行 `demo/recommendation` 集群工作，只需用你定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
+`cluster_train/run.sh` 提供了命令样例来运行 `doc/howto/usage/cluster/src/word2vec` 集群任务，只需用您定义的目录修改 `job_dispatch_package` 和 `job_workspace`，然后：
 ```
 sh run.sh
 ```
@@ -149,7 +229,7 @@ sh run.sh
 提供 pserver 运行日志，有助于诊断分布式错误。
 
 `server.log`
-提供 pserver 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
+提供 parameter server 进程的 stderr 和 stdout。训练失败时可以检查错误日志。
 
 `train.log`
 提供训练过程的 stderr 和 stdout。训练失败时可以检查错误日志。
@@ -157,3 +237,49 @@ sh run.sh
 ### 检查模型输出
 运行完成后，模型文件将被写入节点 0 的 `output` 目录中。
 工作空间中的 `nodefile` 表示当前集群作业的节点 ID。
+
+## 在OpenMPI集群中提交训练作业
+
+### 准备OpenMPI集群
+
+执行下面的命令以启动3个节点的OpenMPI集群和一个"head"节点：
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+然后可以从head节点ssh无密码登录到OpenMPI的每个节点上。
+
+### 启动集群作业
+
+您可以按照下面的步骤在OpenMPI集群中提交paddle训练任务：
+
+```bash
+# 获得head和node节点的IP地址
+kubectl get po -o wide
+# 将node节点的IP地址保存到machines文件中
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# 拷贝必要的文件到head节点
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# ssh 登录到head节点
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- 以下操作均在head节点中执行 ---------------
+# 准备训练数据
+python prepare.py
+# 拷贝训练程序和字典文件到每台MPI节点
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# 创建日志目录
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# 拷贝训练数据到各自的节点
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# 启动训练任务
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
+
+## 在Kubernetes集群中提交训练作业
+
+此部分的使用方法可以参考[here](../k8s/k8s_distributed_cn.md)。
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index c60876721cbf5565d6e48c8061811aacada748cd..1e8b4d54b9ffa99b3beef35ecaf95bbd0866535f 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,129 +1,220 @@
-# Run Distributed Training
+# PaddlePaddle Distributed Training
+
+* [Introduction](#introduction)
+* [Preparations](#preparations)
+* [Command-line arguments](#command-line-arguments)
+   * [Starting parameter server](#starting-parameter-server)
+   * [Starting trainer](#starting-trainer)
+   * [Prepare Training Dataset](#prepare-training-dataset)
+   * [Prepare Training program](#prepare-training-program)
+* [Use cluster platforms or cluster management tools](#use-cluster-platforms-or-cluster-management-tools)
+   * [Cluster Training Using Fabric](#cluster-training-using-fabric)
+      * [Prepare a Linux cluster](#prepare-a-linux-cluster)
+      * [Launching Cluster Job](#launching-cluster-job)
+      * [Kill Cluster Job](#kill-cluster-job)
+      * [Check Cluster Training Result](#check-cluster-training-result)
+      * [Check Model Output](#check-model-output)
+   * [Cluster Training Using OpenMPI](#cluster-training-using-openmpi)
+      * [Prepare an OpenMPI cluster](#prepare-an-openmpi-cluster)
+      * [Launching Cluster Job](#launching-cluster-job-1)
+   * [Cluster Training Using Kubernetes](#cluster-training-using-kubernetes)
+
+# Introduction
+
+In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed training job:
+
+<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
+
+- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
+- Trainer: each trainer reads its data shard and trains the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer downloads the optimized parameters and continues its training.
+- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
+
+PaddlePaddle can support both synchronous stochastic gradient descent (SGD) and asynchronous SGD.
+
+When training with synchronous SGD, PaddlePaddle uses an internal "synchronization barrier" which makes gradient updates and parameter downloads happen in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish uploading at a single step; this increases the parallelism of distributed training: parameter servers do not depend on each other, and they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noise in the gradient.
+
+# Preparations
+1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
+2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/getstarted/build_and_install) document. We strongly recommend using [Docker installation](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_en.rst).
+
+After installation, you can check the version by typing the below command (run a docker container  if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
+
+```bash
+$ paddle version
+PaddlePaddle 0.10.0rc, compiled with
+    with_avx: ON
+    with_gpu: OFF
+    with_double: OFF
+    with_python: ON
+    with_rdma: OFF
+    with_timer: OFF
+```
 
-In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
+We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
 
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
+# Command-line arguments
 
-## Prerequisite
+## Starting parameter server
 
-1. Aforementioned scripts use a Python library [fabric](http://www.fabfile.org/) to run SSH commands.  We can use `pip` to install fabric:
+Type the below command to start a parameter server which will wait for trainers to connect:
 
-   ```bash
-   pip install fabric
-   ```
+```bash
+$ paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1
+```
 
-1. We need to install PaddlePaddle on all nodes in the cluster.  To enable GPUs, we need to install CUDA in `/usr/local/cuda`; otherwise Paddle would report errors at runtime.
+If you wish to run parameter servers in background, and save a log file, you can type:
+```bash
+$ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num_for_sparse=1 --num_gradient_servers=1 &> pserver.log
+```
 
-1. Set the `ROOT_DIR` variable in [`cluster_train/conf.py`] on all nodes.  For convenience, we often create a Unix user `paddle` on all nodes and set `ROOT_DIR=/home/paddle`.  In this way, we can write public SSH keys into `/home/paddle/.ssh/authorized_keys` so that user `paddle` can SSH to all nodes without password.
+| param  | required | default | description |
+| ------------- | ------------- | ------------- | ------------- |
+| port  | required | 7164 | port which parameter server will listen on. If ports_num is greater than 1, parameter server will listen on multiple ports for more network throughput |
+| ports_num  | required | 1 | total number of ports the parameter server will listen on  |
+| ports_num_for_sparse  | required | 1 | number of ports which serves sparse parameter update  |
+| num_gradient_servers  | required | 1 | total number of gradient servers |
 
-## Prepare Job Workspace
+## Starting trainer
+Type the command below to start the trainer (name the file whatever you want, like "train.py"):
 
-We refer to the directory where we put dependent libraries, config files, etc., as *workspace*.
+```bash
+$ python train.py
+```
 
-These `train/test` data should be prepared before launching cluster job. To  satisfy the requirement that train/test data are placed in different directory from workspace, PADDLE refers train/test data according to index file named as `train.list/test.list` which are used in model config file. So the train/test data also contains train.list/test.list two list file. All local training demo already provides scripts to help you create these two files,  and all nodes in cluster job will handle files with same logical code in normal condition.
+Trainers' network needs to be connected with parameter servers' network to finish the job. Trainers need to know port and IPs to locate parameter servers. You can pass arguments to trainers through [environment variables](https://en.wikipedia.org/wiki/Environment_variable) or pass to `paddle.init()` function. Arguments passed to the `paddle.init()` function will overwrite environment variables.
 
-Generally, you can use same model file from local training for cluster training. What you should have in mind that, the `batch_size` set in `setting` function in model file means batch size in `each` node of cluster job instead of total batch size if synchronization SGD was used.
+Use environment variables:
 
-Following steps are based on [demo/recommendation](https://github.com/PaddlePaddle/Paddle/tree/develop/demo/recommendation) demo in demo directory.
+```bash
+export PADDLE_INIT_USE_GPU=False
+export PADDLE_INIT_TRAINER_COUNT=1
+export PADDLE_INIT_PORT=7164
+export PADDLE_INIT_PORTS_NUM=1
+export PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+export PADDLE_INIT_NUM_GRADIENT_SERVERS=1
+export PADDLE_INIT_TRAINER_ID=0
+export PADDLE_INIT_PSERVERS=127.0.0.1
+python train.py
+```
 
-You just go through demo/recommendation tutorial doc until `Train` section, and at last you will get train/test data and model configuration file. Finaly, just use demo/recommendation as workspace for cluster training.
+Pass arguments:
 
-At last your workspace should look like as follow:
+```python
+paddle.init(
+        use_gpu=False,
+        trainer_count=1,
+        port=7164,
+        ports_num=1,
+        ports_num_for_sparse=1,
+        num_gradient_servers=1,
+        trainer_id=0,
+        pservers="127.0.0.1")
 ```
-.
-|-- common_utils.py
-|-- data
-|   |-- config.json
-|   |-- config_generator.py
-|   |-- meta.bin
-|   |-- meta_config.json
-|   |-- meta_generator.py
-|   |-- ml-1m
-|   |-- ml_data.sh
-|   |-- ratings.dat.test
-|   |-- ratings.dat.train
-|   |-- split.py
-|   |-- test.list
-|   `-- train.list
-|-- dataprovider.py
-|-- evaluate.sh
-|-- prediction.py
-|-- preprocess.sh
-|-- requirements.txt
-|-- run.sh
-`-- trainer_config.py
+
+| param  | required | default | description |
+| ------------- | ------------- | ------------- | ------------- |
+| use_gpu  | optional | False | set to "True" to enable GPU training |
+| trainer_count  | required | 1 | total count of trainers in the training job |
+| port  | required | 7164 | port to connect to parameter server  |
+| ports_num  | required | 1 | number of ports for communication |
+| ports_num_for_sparse  | required | 1 | number of ports for sparse type calculation |
+| num_gradient_servers  | required | 1 | total number of gradient servers |
+| trainer_id  | required | 0 | ID for every trainer, start from 0 |
+| pservers  | required | 127.0.0.1 | list of IPs of parameter servers, separated by "," |
+
+## Prepare Training Dataset
+
+Here's some example code [prepare.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/prepare.py), it will download public `imikolov` dataset and split it into multiple files according to job parallelism (trainers count). Modify `SPLIT_COUNT` at the beginning of `prepare.py` to change the count of output files.
+
+In the real world, we often use `MapReduce` job's output as training data, so there will be lots of files. You can use `mod` to assign training file to trainers:
+
+```python
+import os
+train_list = []
+flist = os.listdir("/train_data/")
+for f in flist:
+  suffix = int(f.split("-")[1])
+  if suffix % TRAINER_COUNT == TRAINER_ID:
+    train_list.append(f)
+```
+
+Example code `prepare.py` will split training data and testing data into 3 files with digital suffix like `-00000`, `-00001` and `-00002`:
+
+```
+train.txt
+train.txt-00000
+train.txt-00001
+train.txt-00002
+test.txt
+test.txt-00000
+test.txt-00001
+test.txt-00002
 ```
-Not all of these files are needed for cluster training, but it's not necessary to remove useless files.
 
-`trainer_config.py`
-Indicates the model config file.
+When the job starts, every trainer needs to get its own part of the data. In some distributed systems a storage service will be provided, so the data under that path can be accessed by all the trainer nodes. Without the storage service, you must copy the training data to each trainer node.
 
-`train.list` and `test.list`
-File index. It stores all relative or absolute file paths of all train/test data at current node.
+Different training jobs may have different data format and `reader()` function, developers may need to write different data prepare scripts and `reader()` functions for their job.
 
-`dataprovider.py`
-used to read train/test samples. It's same as local training.
+## Prepare Training program
 
-`data`
-all files in data directory are refered by train.list/test.list which are refered by data provider.
+We'll create a *workspace* directory on each node, storing your training program, dependencies, mounted or downloaded dataset directory.
 
 
-## Prepare Cluster Job Configuration
+Your workspace may look like:
+```
+.
+|-- my_lib.py
+|-- word_dict.pickle
+|-- train.py
+|-- train_data_dir/
+|   |-- train.txt-00000
+|   |-- train.txt-00001
+|   |-- train.txt-00002
+`-- test_data_dir/
+    |-- test.txt-00000
+    |-- test.txt-00001
+    `-- test.txt-00002
+```
 
-The options below must be carefully set in cluster_train/conf.py
+- `my_lib.py`: user defined libraries, like PIL libs. This is optional.
+- `word_dict.pickle`: dict file for training word embedding.
+- `train.py`: training program. Sample code: [api_train_v2_cluster.py](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py). ***NOTE:*** You may need to modify the head part of `train.py` when using different cluster platform to retrieve configuration environment variables:
 
-`HOSTS`  all nodes hostname or ip that will run cluster job. You can also append user and ssh port with hostname, such as root@192.168.100.17:9090.
+  ```python
+  cluster_train_file = "./train_data_dir/train/train.txt"
+  cluster_test_file = "./test_data_dir/test/test.txt"
+  node_id = os.getenv("OMPI_COMM_WORLD_RANK")
+  if not node_id:
+      raise EnvironmentError("must provied OMPI_COMM_WORLD_RANK")
+  ```
 
-`ROOT_DIR` workspace ROOT directory for placing JOB workspace directory
+- `train_data_dir`: containing training data. Mount from storage service or copy training data here.
+- `test_data_dir`: containing testing data.
 
-`PADDLE_NIC` the NIC(Network Interface Card) interface name for cluster communication channel, such as eth0 for ethternet, ib0 for infiniband.
+# Use cluster platforms or cluster management tools
 
-`PADDLE_PORT` port number for cluster commnunication channel
+PaddlePaddle supports running jobs on several platforms including:
+- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
+- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
+- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
 
-`PADDLE_PORTS_NUM` the number of port used for cluster communication channle. if the number of cluster nodes is small(less than 5~6nodes), recommend you set it to larger, such as 2 ~ 8, for better network performance.
+We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
 
-`PADDLE_PORTS_NUM_FOR_SPARSE` the number of port used for sparse updater cluster commnunication channel. if sparse remote update is used, set it like `PADDLE_PORTS_NUM`
+These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
 
-`LD_LIBRARY_PATH` set addtional LD_LIBRARY_PATH for cluster job. You can use it to set CUDA libraries path.
+## Cluster Training Using Fabric
 
-Default Configuration as follow:
+### Prepare a Linux cluster
 
-```python
-HOSTS = [
-        "root@192.168.100.17",
-        "root@192.168.100.18",
-        ]
-
-'''
-workspace configuration
-'''
-
-#root dir for workspace
-ROOT_DIR = "/home/paddle"
-
-'''
-network configuration
-'''
-#pserver nics
-PADDLE_NIC = "eth0"
-#pserver port
-PADDLE_PORT = 7164
-#pserver ports num
-PADDLE_PORTS_NUM = 2
-#pserver sparse ports num
-PADDLE_PORTS_NUM_FOR_SPARSE = 2
-
-#environments setting for all processes in cluster job
-LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/lib64"
-```
+Run `kubectl create -f ssh_servers.yaml` under the directory `paddle/scripts/cluster_train_v2/fabric/docker_cluster` to launch a demo cluster. Run `kubectl get po -o wide` to get the IP addresses of these nodes.
 
 ### Launching Cluster Job
-`paddle.py` provides automatical scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
+`paddle.py` provides automated scripts to start all PaddlePaddle cluster processes in different nodes. By default, all command line options can be set as `paddle.py` command options and `paddle.py` will transparently and automatically set these options to PaddlePaddle lower level processes.
 
 `paddle.py`provides two distinguished command option for easy job launching.
 
-`job_dispatch_package`  set it with local `workspace`directory, it will be dispatched to all nodes set in conf.py. It could be helpful for frequent hacking workspace files, otherwise frequent mulit-nodes workspace deployment could make your crazy.
-`job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
+- `job_dispatch_package` set it with local `workspace` directory, it will be dispatched to all nodes which are set in `conf.py`. It could be helpful for frequently manipulating workspace files. Otherwise, frequent multi-node workspace deployment is very annoying.
+- `job_workspace`  set it with already deployed workspace directory, `paddle.py` will skip dispatch stage to directly launch cluster job with all nodes. It could help to reduce heavy
 dispatch latency.
 
 `cluster_train/run.sh` provides command line sample to run `demo/recommendation` cluster job, just modify `job_dispatch_package` and `job_workspace` with your defined directory, then:
@@ -134,23 +225,69 @@ sh run.sh
 The cluster Job will start in several seconds.
 
 ### Kill Cluster Job
-`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should mannally kill job if program crashed.
+`paddle.py` can capture `Ctrl + C` SIGINT signal to automatically kill all processes launched by it. So just stop `paddle.py` to kill cluster job. You should manually kill the job if the program crashed.
 
 ### Check Cluster Training Result
 Check log in $workspace/log for details, each node owns same log structure.
 
 `paddle_trainer.INFO`
-It provides almost all interal output log for training,  same as local training. Check runtime model convergence here.
+It provides almost all internal log output for training, the same as in local training. Check runtime model convergence here.
 
 `paddle_pserver2.INFO`
-It provides pserver running log, which could help to diagnose distributed error.
+It provides parameter server running log, which could help to diagnose distributed error.
 
 `server.log`
-It provides stderr and stdout of pserver process. Check error log if training crashs.
+It provides stderr and stdout of parameter server process. Check error log if training crashes.
 
 `train.log`
-It provides stderr and stdout of trainer process. Check error log if training crashs.
+It provides stderr and stdout of trainer process. Check error log if training crashes.
 
 ### Check Model Output
-After one pass finished, model files will be writed in `output` directory in node 0.
+After one pass finishes, model files will be written to the `output` directory on node 0.
 `nodefile` in workspace indicates the node id of current cluster job.
+
+## Cluster Training Using OpenMPI
+
+### Prepare an OpenMPI cluster
+
+Run the following command to start a 3-node MPI cluster and one "head" node.
+
+```bash
+cd paddle/scripts/cluster_train_v2/openmpi/docker_cluster
+kubectl create -f head.yaml
+kubectl create -f mpi-nodes.yaml
+```
+
+Then you can log in to every OpenMPI node using ssh without entering a password.
+
+### Launching Cluster Job
+
+Follow these steps to launch a PaddlePaddle training job in the OpenMPI cluster:
+
+```bash
+# find out node IP addresses
+kubectl get po -o wide
+# generate a "machines" file containing node IP addresses
+kubectl get po -o wide | grep nodes | awk '{print $6}' > machines
+# copy necessary files onto "head" node
+scp -i ssh/id_rsa.mpi.pub machines prepare.py train.py start_mpi_train.sh tutorial@[headIP]:~
+# login to head node using ssh
+ssh -i ssh/id_rsa.mpi.pub tutorial@[headIP]
+# --------------- in head node ---------------
+# prepare training data
+python prepare.py
+# copy training data and dict file to MPI nodes
+cat machines | xargs -i scp word_dict.pickle train.py start_mpi_train.sh machines {}:/home/tutorial
+# create a directory for storing log files
+mpirun -hostfile machines -n 3 mkdir /home/tutorial/logs
+# copy training data to every node
+scp train.txt-00000 test.txt-00000 [node1IP]:/home/tutorial
+scp train.txt-00001 test.txt-00001 [node2IP]:/home/tutorial
+scp train.txt-00002 test.txt-00002 [node3IP]:/home/tutorial
+# start the job
+mpirun -hostfile machines -n 3  /home/tutorial/start_mpi_train.sh
+```
+
+## Cluster Training Using Kubernetes
+
+The details can be found [here](../k8s/k8s_cn.md)
diff --git a/doc/howto/usage/cluster/src/trainer.png b/doc/howto/usage/cluster/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer.png differ
diff --git a/doc/howto/usage/cluster/src/trainer_cn.png b/doc/howto/usage/cluster/src/trainer_cn.png
new file mode 100644
index 0000000000000000000000000000000000000000..f9525739cc8bc6506adde642aafa0a85ae3ebebc
Binary files /dev/null and b/doc/howto/usage/cluster/src/trainer_cn.png differ
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0940f0e56eafa22f8aeb7052c0ddc79d8862917
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2.py
@@ -0,0 +1,100 @@
+import gzip
+import math
+
+import paddle.v2 as paddle
+
+embsize = 32
+hiddensize = 256
+N = 5
+
+
+def wordemb(inlayer):
+    """Return an `embsize`-wide embedding layer built on top of `inlayer`."""
+    proj_attr = paddle.attr.Param(
+        name="_proj",
+        initial_std=0.001,
+        learning_rate=1,
+        l2_rate=0,
+        sparse_update=True)
+    # Every call names its parameter "_proj".
+    return paddle.layer.embedding(
+        input=inlayer, size=embsize, param_attr=proj_attr)
+
+
+def main():
+    """Build and train a model that predicts the fifth word from the first four."""
+    cluster_train = False  # False: local paddle.init(); True: the cluster settings below
+
+    if not cluster_train:
+        paddle.init(use_gpu=False, trainer_count=1)
+    else:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):  # reads `trainer`, which is bound further down in main()
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:  # every 100th batch: save, test, report
+                with gzip.open("batch-" + str(event.batch_id) + ".tar.gz",
+                               'w') as f:
+                    trainer.save_parameter_to_tar(f)
+                result = trainer.test(
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e6d8887124a5524505b097803a60a35478ca644
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/api_train_v2_cluster.py
@@ -0,0 +1,123 @@
+import math
+import os
+import paddle.v2 as paddle
+import pickle
+
+embsize = 32  # size passed to each word embedding layer
+hiddensize = 256  # size of the hidden fully-connected layer
+N = 5
+cluster_train_file = "./train_data_dir/train/train.txt"  # shard prefix; "-%05d" % node_id is appended
+cluster_test_file = "./test_data_dir/test/test.txt"  # shard prefix, same suffix rule
+node_id = os.getenv("OMPI_COMM_WORLD_RANK")  # this process's rank, read from the environment
+if not node_id:
+    raise EnvironmentError("must provide OMPI_COMM_WORLD_RANK")
+
+
+def wordemb(inlayer):
+    """Return an `embsize`-wide embedding layer built on top of `inlayer`."""
+    proj_attr = paddle.attr.Param(
+        name="_proj",
+        initial_std=0.001,
+        learning_rate=1,
+        l2_rate=0,
+        sparse_update=True)
+    # Every call names its parameter "_proj".
+    return paddle.layer.embedding(
+        input=inlayer, size=embsize, param_attr=proj_attr)
+
+
+def cluster_reader_cluster(filename, node_id):
+    """Make a reader yielding int tuples from `<filename>-<node_id:05d>`."""
+    def cluster_reader():
+        shard_path = "%s-%05d" % (filename, int(node_id))
+        with open(shard_path, "r") as shard:
+            for row in shard:
+                yield tuple(int(cell) for cell in row.split(","))
+    return cluster_reader
+
+
+def main():
+    """Configure paddle from PADDLE_* environment variables, then train."""
+
+    # cluster mode only when PADDLE_CLUSTER_TRAIN is one of TRUTH; otherwise local
+    TRUTH = ["true", "True", "TRUE", "1", "yes", "Yes", "YES"]
+    cluster_train = os.getenv('PADDLE_CLUSTER_TRAIN', "False") in TRUTH
+    use_gpu = os.getenv('PADDLE_INIT_USE_GPU', "False")  # NOTE(review): stays a str, not a bool
+
+    if not cluster_train:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")))
+    else:
+        paddle.init(
+            use_gpu=use_gpu,
+            trainer_count=int(os.getenv("PADDLE_INIT_TRAINER_COUNT", "1")),
+            port=int(os.getenv("PADDLE_INIT_PORT", "7164")),
+            ports_num=int(os.getenv("PADDLE_INIT_PORTS_NUM", "1")),
+            ports_num_for_sparse=int(
+                os.getenv("PADDLE_INIT_PORTS_NUM_FOR_SPARSE", "1")),
+            num_gradient_servers=int(
+                os.getenv("PADDLE_INIT_NUM_GRADIENT_SERVERS", "1")),
+            trainer_id=int(os.getenv("PADDLE_INIT_TRAINER_ID", "0")),
+            pservers=os.getenv("PADDLE_INIT_PSERVERS", "127.0.0.1"))
+    fn = open("thirdparty/wuyi_train_thdpty/word_dict.pickle", "r")  # NOTE(review): fixed path
+    word_dict = pickle.load(fn)
+    fn.close()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):  # reads `trainer`, which is bound further down in main()
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:  # every 100th batch: test and report
+                result = trainer.test(
+                    paddle.batch(
+                        cluster_reader_cluster(cluster_test_file, node_id), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adagrad = paddle.optimizer.AdaGrad(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost,
+                                 parameters,
+                                 adagrad,
+                                 is_local=not cluster_train)
+    trainer.train(
+        paddle.batch(cluster_reader_cluster(cluster_train_file, node_id), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/howto/usage/cluster/src/word2vec/prepare.py b/doc/howto/usage/cluster/src/word2vec/prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..24f5c5b26d37ea03de3ab4dc2d967a4bd009eef0
--- /dev/null
+++ b/doc/howto/usage/cluster/src/word2vec/prepare.py
@@ -0,0 +1,41 @@
+import paddle.v2 as paddle
+import tarfile
+import os
+import pickle
+
+SPLIT_COUNT = 3
+N = 5
+
+
+def file_len(fd):
+    """Return the number of lines yielded by iterating `fd` (0 when empty)."""
+    # Counting with sum() avoids the unbound loop variable on empty input.
+    return sum(1 for _ in fd)
+
+
+def split_from_reader_by_line(filename, reader, split_count):
+    """Dump each `reader()` sample to `filename` as a CSV line, then shard it.
+
+    Shards are produced by `split` with the prefix `<filename>-`.
+    """
+    with open(filename, "w") as fn:  # closed even if the reader raises
+        for batch_data in reader():
+            fn.write(",".join([str(d) for d in batch_data]))
+            fn.write("\n")
+    with open(filename, "r") as fn:
+        total_line_count = file_len(fn)
+    per_file_lines = total_line_count // split_count + 1  # floor division
+    cmd = "split -d -a 5 -l %d %s %s-" % (per_file_lines, filename, filename)
+    os.system(cmd)
+
+
+word_dict = paddle.dataset.imikolov.build_dict()
+with open("word_dict.pickle", "w") as dict_f:  # written to the current directory
+    pickle.dump(word_dict, dict_f)
+
+split_from_reader_by_line("train.txt",  # training samples, then sharded
+                          paddle.dataset.imikolov.train(word_dict, N),
+                          SPLIT_COUNT)
+split_from_reader_by_line("test.txt",  # test samples, then sharded
+                          paddle.dataset.imikolov.test(word_dict, N),
+                          SPLIT_COUNT)
diff --git a/doc/tutorials/image_classification/cifar.png b/doc/tutorials/image_classification/cifar.png
deleted file mode 100644
index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/cifar.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/image_classification.png b/doc/tutorials/image_classification/image_classification.png
deleted file mode 100644
index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/image_classification.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/index_cn.md b/doc/tutorials/image_classification/index_cn.md
deleted file mode 100644
index 87f465522a0fa21c8c03754b4be8dcb035c4de81..0000000000000000000000000000000000000000
--- a/doc/tutorials/image_classification/index_cn.md
+++ /dev/null
@@ -1,205 +0,0 @@
-图像分类教程
-==========
-
-在本教程中，我们将使用CIFAR-10数据集训练一个卷积神经网络，并使用这个神经网络来对图片进行分类。如下图所示，卷积神经网络可以辨识图片中的主体，并给出分类结果。
-<center>![Image Classification](./image_classification.png)</center>
-
-## 数据准备
-首先下载CIFAR-10数据集。下面是CIFAR-10数据集的官方网址：
-
-<https://www.cs.toronto.edu/~kriz/cifar.html>
-
-我们准备了一个脚本，可以用于从官方网站上下载CIFAR-10数据集，转为jpeg文件并存入特定的目录。使用这个脚本前请确认已经安装了pillow及相关依赖模块。可以参照下面的命令进行安装：
-
-1. 安装pillow
-
-```bash
-sudo apt-get install libjpeg-dev
-pip install pillow
-```
-
-2. 下载数据集
-
-```bash
-cd demo/image_classification/data/
-sh download_cifar.sh
-```
-
-CIFAR-10数据集包含60000张32x32的彩色图片。图片分为10类，每个类包含6000张。其中50000张图片作为训练集，10000张作为测试集。
-
-下图展示了所有的图片类别，每个类别中随机抽取了10张图片。
-<center>![Image Classification](./cifar.png)</center>
-
-脚本运行完成后，我们应当会得到一个名为cifar-out的文件夹，其下子文件夹的结构如下
-
-
-```
-train
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-test
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-```
-
-cifar-out下包含`train`和`test`两个文件夹，其中分别包含了CIFAR-10中的训练集和测试集。这两个文件夹下各自有10个子文件夹，每个子文件夹下存储相应分类的图片。将图片按照上述结构存储好之后，我们就可以着手对分类模型进行训练了。
-
-## 预处理
-数据下载之后，还需要进行预处理，将数据转换为Paddle的格式。我们可以通过如下命令进行预处理工作：
-
-```
-cd demo/image_classification/
-sh preprocess.sh
-```
-
-其中`preprocess.sh` 调用 `./demo/image_classification/preprocess.py` 对图片进行预处理
-```sh
-export PYTHONPATH=$PYTHONPATH:../../
-data_dir=./data/cifar-out
-python preprocess.py -i $data_dir -s 32 -c 1
-```
-
-`./demo/image_classification/preprocess.py` 使用如下参数：
-
-- `-i` 或 `--input` 给出输入数据所在路径；
-- `-s` 或 `--size` 给出图片尺寸；
-- `-c` 或 `--color` 标示图片是彩色图或灰度图
-
-## 模型训练
-在开始训练之前，我们需要先创建一个模型配置文件。下面我们给出了一个配置示例。**注意**，这里的列出的和`vgg_16_cifar.py`文件稍有差别，因为该文件可适用于预测。
-
-```python
-from paddle.trainer_config_helpers import *
-data_dir='data/cifar-out/batches/'
-meta_path=data_dir+'batches.meta'
-args = {'meta':meta_path, 'mean_img_size': 32,
-        'img_size': 32, 'num_classes': 10,
-        'use_jpeg': 1, 'color': "color"}
-define_py_data_sources2(train_list=data_dir+"train.list",
-                        test_list=data_dir+'test.list',
-                        module='image_provider',
-                        obj='processData',
-                        args=args)
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128))
-
-img = data_layer(name='image', size=3*32*32)
-lbl = data_layer(name="label", size=10)
-# small_vgg is predined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-outputs(classification_cost(input=predict, label=lbl))
-```
-
-在第一行中我们载入用于定义网络的函数。
-```python
-from paddle.trainer_config_helpers import *
-```
-
-之后定义的`define_py_data_sources2`使用Python数据提供器，其中 `args`将在`image_provider.py`进行使用，该文件负责产生图片数据并传递给Paddle系统
- - `meta`: 训练集平均值。
- - `mean_img_size`: 平均特征图的高度及宽度。
- - `img_size`：输入图片的高度及宽度。
- - `num_classes`：类别个数。
- - `use_jpeg`：处理过程中数据存储格式。
- - `color`：标示是否为彩色图片。
- 
- `settings`用于设置训练算法。在下面的例子中，learning rate被设置为0.1除以batch size，而weight decay则为0.0005乘以batch size。
- 
- ```python
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128)
-)
-```
-
-`small_vgg`定义了网络结构。这里我们使用的是一个小的VGG网络。关于VGG卷积神经网络的描述可以参考：[http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/)。
-```python
-# small_vgg is predined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-```
-配置创建完毕后，可以运行脚本train.sh来训练模型。
-
-```bash
-config=vgg_16_cifar.py
-output=./cifar_vgg_model
-log=train.log
-
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=1 \
---save_dir=$output \
-2>&1 | tee $log
-
-python -m paddle.utils.plotcurve -i $log > plot.png
-```
-- 这里我们使用的是GPU模式进行训练。如果你没有GPU环境，可以设置`use_gpu=0`。
-- `./demo/image_classification/vgg_16_cifar.py`是网络和数据配置文件。各项参数的详细说明可以在命令行参数相关文档中找到。
-- 脚本`plotcurve.py`依赖于python的`matplotlib`模块。因此如果这个脚本运行失败，也许是因为需要安装`matplotlib`。
-在训练完成后，训练及测试误差曲线图会被`plotcurve.py`脚本保存在 `plot.png`中。下面是一个误差曲线图的示例：
-
-<center>![Training and testing curves.](./plot.png)</center>
-
-## 预测
-在训练完成后，模型及参数会被保存在路径`./cifar_vgg_model/pass-%05d`下。例如第300个pass的模型会被保存在`./cifar_vgg_model/pass-00299`。
-
-要对一个图片的进行分类预测，我们可以使用`predict.sh`，该脚本将输出预测分类的标签：
-
-```
-sh predict.sh
-```
-
-predict.sh:
-```
-model=cifar_vgg_model/pass-00299/
-image=data/cifar-out/test/airplane/seaplane_s_000978.png
-use_gpu=1
-python prediction.py $model $image $use_gpu
-```
-
-## 练习
-在CUB-200数据集上使用VGG模型训练一个鸟类图片分类模型。相关的鸟类数据集可以从如下地址下载，其中包含了200种鸟类的照片（主要来自北美洲）。
-
-<http://www.vision.caltech.edu/visipedia/CUB-200.html>
-
-
-
-
-## 细节探究
-### 卷积神经网络
-卷积神经网络是一种使用卷积层的前向神经网络，很适合构建用于理解图片内容的模型。一个典型的神经网络如下图所示：
-
-![Convolutional Neural Network](./lenet.png)
-
-一个卷积神经网络包含如下层：
-
-- 卷积层：通过卷积操作从图片或特征图中提取特征
-- 池化层：使用max-pooling对特征图下采样
-- 全连接层：使输入层到隐藏层的神经元是全部连接的。
-
-卷积神经网络在图片分类上有着惊人的性能，这是因为它发掘出了图片的两类重要信息：局部关联性质和空间不变性质。通过交替使用卷积和池化处理， 卷积神经网络能够很好的表示这两类信息。
-
-关于如何定义网络中的层，以及如何在层之间进行连接，请参考Layer文档。
diff --git a/doc/tutorials/image_classification/index_en.md b/doc/tutorials/image_classification/index_en.md
deleted file mode 100644
index 60c81a6a539944634773f38ec4c9a59709dd4afc..0000000000000000000000000000000000000000
--- a/doc/tutorials/image_classification/index_en.md
+++ /dev/null
@@ -1,221 +0,0 @@
-Image Classification Tutorial
-==============================
-
-This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset.
-As shown in the following figure, the convolutional neural network can recognize the main object in images, and output the classification result.
-
-<center>![Image Classification](./image_classification.png)</center>
-
-## Data Preparation
-First, download CIFAR-10 dataset. CIFAR-10 dataset can be downloaded from its official website.
-
-<https://www.cs.toronto.edu/~kriz/cifar.html>
-
-We have prepared a script to download and process CIFAR-10 dataset. The script will download CIFAR-10 dataset from the official dataset.
-It will convert it to jpeg images and organize them into a directory with the required structure for the tutorial. Make sure that you have installed pillow and its dependents.
-Consider the following commands:
-
-1. install pillow dependents
-
-```bash
-sudo apt-get install libjpeg-dev
-pip install pillow
-```
-
-2. download data and preparation
-
-```bash
-cd demo/image_classification/data/
-sh download_cifar.sh
-```
-
-The CIFAR-10 dataset consists of 60000 32x32 color images in 10 classes, with 6000 images per class. There are 50000 training images and 10000 test images.
-
-Here are the classes in the dataset, as well as 10 random images from each:
-<center>![Image Classification](./cifar.png)</center>
-
-
-After downloading and converting, we should find a directory (cifar-out) containing the dataset in the following format:
-
-```
-train
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-test
----airplane
----automobile
----bird
----cat
----deer
----dog
----frog
----horse
----ship
----truck
-```
-
-It has two directories:`train` and `test`. These two directories contain training data and testing data of CIFAR-10, respectively. Each of these two folders contains 10 sub-folders, ranging from `airplane` to `truck`. Each sub-folder contains images with the corresponding label. After the images are organized into this structure, we are ready to train an image classification model.
-
-## Preprocess
-After the data has been downloaded, it needs to be pre-processed into the Paddle format. We can run the following command for preprocessing.
-
-```
-cd demo/image_classification/
-sh preprocess.sh
-```
-
-`preprocess.sh` calls `./demo/image_classification/preprocess.py` to preprocess image data.
-```sh
-export PYTHONPATH=$PYTHONPATH:../../
-data_dir=./data/cifar-out
-python preprocess.py -i $data_dir -s 32 -c 1
-```
-
-`./demo/image_classification/preprocess.py` has the following arguments
-
-- `-i` or `--input` specifes  the input data directory.
-- `-s` or `--size` specifies the processed size of images.
-- `-c` or `--color` specifes whether images are color images or gray images.
-
-
-## Model Training
-We need to create a model config file before training the model. An example of the config file (vgg_16_cifar.py) is listed below. **Note**, it is slightly different from the `vgg_16_cifar.py` which also applies to the prediction.
-
-```python
-from paddle.trainer_config_helpers import *
-data_dir='data/cifar-out/batches/'
-meta_path=data_dir+'batches.meta'
-args = {'meta':meta_path, 'mean_img_size': 32,
-        'img_size': 32, 'num_classes': 10,
-        'use_jpeg': 1, 'color': "color"}
-define_py_data_sources2(train_list=data_dir+"train.list",
-                        test_list=data_dir+'test.list',
-                        module='image_provider',
-                        obj='processData',
-                        args=args)
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128))
-
-img = data_layer(name='image', size=3*32*32)
-lbl = data_layer(name="label", size=10)
-# small_vgg is predined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-outputs(classification_cost(input=predict, label=lbl))
-```
-
-The first line imports python functions for defining networks.
-```python
-from paddle.trainer_config_helpers import *
-```
-
-Then define an `define_py_data_sources2` which use python data provider
-interface. The arguments in `args` are used in `image_provider.py` which
-yeilds image data and transform them to Paddle.
- - `meta`: the mean value of training set.
- - `mean_img_size`: the size of mean feature map.
- - `img_size`: the height and width of input image.
- - `num_classes`: the number of classes.
- - `use_jpeg`: the data storage type when preprocessing.
- - `color`: specify color image.
-
-`settings` specifies the training algorithm. In the following example,
-it specifies learning rate as 0.1, but divided by batch size, and the weight decay
-is 0.0005 and multiplied by batch size.
-```python
-settings(
-    batch_size = 128,
-    learning_rate = 0.1 / 128.0,
-    learning_method = MomentumOptimizer(0.9),
-    regularization = L2Regularization(0.0005 * 128)
-)
-```
-
-The `small_vgg` specifies the network. We use a small version of VGG convolutional network as our network
-for classification. A description of VGG network can be found here [http://www.robots.ox.ac.uk/~vgg/research/very_deep/](http://www.robots.ox.ac.uk/~vgg/research/very_deep/).
-```python
-# small_vgg is predined in trainer_config_helpers.network
-predict = small_vgg(input_image=img, num_channels=3)
-```
-After writing the config, we can train the model by running the script train.sh.
-
-```bash
-config=vgg_16_cifar.py
-output=./cifar_vgg_model
-log=train.log
-
-paddle train \
---config=$config \
---dot_period=10 \
---log_period=100 \
---test_all_data_in_one_period=1 \
---use_gpu=1 \
---save_dir=$output \
-2>&1 | tee $log
-
-python -m paddle.utils.plotcurve -i $log > plot.png
-```
-
-- Here we use GPU mode to train. If you have no gpu environment, just set `use_gpu=0`.
-
-- `./demo/image_classification/vgg_16_cifar.py` is the network and data configuration file. The meaning of the other flags can be found in the documentation of the command line flags.
-
-- The script `plotcurve.py` requires the python module of `matplotlib`, so if it fails, maybe you need to install `matplotlib`.
-
-
-After training finishes, the training and testing error curves will be saved to `plot.png` using `plotcurve.py` script. An example of the plot is shown below:
-
-<center>![Training and testing curves.](./plot.png)</center>
-
-
-## Prediction
-After we train the model, the model file as well as the model parameters are stored in path `./cifar_vgg_model/pass-%05d`. For example, the model of the 300-th pass is stored at `./cifar_vgg_model/pass-00299`.
-
-To make a prediction for an image, one can run `predict.sh` as follows. The script will output the label of the classfiication.
-
-```
-sh predict.sh
-```
-
-predict.sh:
-```
-model=cifar_vgg_model/pass-00299/
-image=data/cifar-out/test/airplane/seaplane_s_000978.png
-use_gpu=1
-python prediction.py $model $image $use_gpu
-```
-
-## Exercise
-Train a image classification of birds using VGG model and CUB-200 dataset. The birds dataset can be downloaded here. It contains an image dataset with photos of 200 bird species (mostly North American).
-
-<http://www.vision.caltech.edu/visipedia/CUB-200.html>
-
-
-
-
-## Delve into Details
-### Convolutional Neural Network
-A Convolutional Neural Network is a feedforward neural network that uses convolution layers. It is very suitable for building neural networks that process and understand images. A standard convolutional neural network is shown below:
-
-![Convolutional Neural Network](./lenet.png)
-
-Convolutional Neural Network contains the following layers:
-
-- Convolutional layer: It uses convolution operation to extract features from an image or a feature map.
-- Pooling layer: It uses max-pooling to downsample feature maps.
-- Fully Connected layer: It uses fully connected connections to transform features.
-
-Convolutional Neural Network achieves amazing performance for image classification because it exploits two important characteristics of images: *local correlation* and *spatial invariance*. By iteratively applying convolution and max-pooing operations, convolutional neural network can well represent these two characteristics of images.
-
-
-For more details of how to define layers and their connections, please refer to the documentation of layers.
diff --git a/doc/tutorials/image_classification/lenet.png b/doc/tutorials/image_classification/lenet.png
deleted file mode 100644
index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/lenet.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/plot.png b/doc/tutorials/image_classification/plot.png
deleted file mode 100644
index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/plot.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png
deleted file mode 100644
index f54a0c58837cb3385b32dc57d02cec92666ef0f1..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/src/cifar.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png
deleted file mode 100644
index 14f255805081c1b4fab27eaf336fd389fa93ca19..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/src/image_classification.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png
deleted file mode 100644
index 1e6f2b32bad797f3fccb929c72a121fc935b0cbb..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/src/lenet.png and /dev/null differ
diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png
deleted file mode 100644
index a31f99791c670e18bb8c62b7604ec8cb0284ffb4..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/image_classification/src/plot.png and /dev/null differ
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
deleted file mode 100644
index 6a27004d58d24cc466d930322be8cdbb2f434c74..0000000000000000000000000000000000000000
--- a/doc/tutorials/index_cn.md
+++ /dev/null
@@ -1,13 +0,0 @@
-# 完整教程
-
-* [快速入门](quick_start/index_cn.rst)
-* [个性化推荐](rec/ml_regression_cn.rst)
-* [图像分类](image_classification/index_cn.md)
-* [情感分析](sentiment_analysis/index_cn.md)
-* [语义角色标注](semantic_role_labeling/index_cn.md)
-* [机器翻译](text_generation/index_cn.md)
-
-## 常用模型
-
-* [ResNet模型](imagenet_model/resnet_model_cn.md)
-* [词向量模型](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
deleted file mode 100644
index 77331a703b6f0fdf92921ebcc476325b7327e976..0000000000000000000000000000000000000000
--- a/doc/tutorials/index_en.md
+++ /dev/null
@@ -1,14 +0,0 @@
-# TUTORIALS
-There are several examples and demos here.
-
-* [Quick Start](quick_start/index_en.md)
-* [MovieLens Regression](rec/ml_regression_en.rst)
-* [Image Classification](image_classification/index_en.md)
-* [Sentiment Analysis](sentiment_analysis/index_en.md)
-* [Semantic Role Labeling](semantic_role_labeling/index_en.md)
-* [Text Generation](text_generation/index_en.md)
-* [Image Auto-Generation](gan/index_en.md)
-
-## Model Zoo
-* [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
-* [Embedding: Chinese Word](embedding_model/index_en.md)
diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md
deleted file mode 100644
index 2207a776f0774e72aba15169e59258dd04583637..0000000000000000000000000000000000000000
--- a/doc/tutorials/rec/ml_dataset_cn.md
+++ /dev/null
@@ -1,105 +0,0 @@
-```eval_rst
-.. _demo_ml_dataset:
-
-```
-
-# MovieLens数据集
-
-[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。
-该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模，该数据集有很多不同的版本。
-我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据
-集，其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。
-
-## 数据集特征
-
-在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。在
-[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件，
-分隔符为"::"。以下我们翻译数据集网站中README文件的描述:
-
-### 评分文件描述(ratings.dat)
-
-
-所有的评分数据都包含在"ratings.dat"文件中，遵循如下的格式:
-
-用户ID::电影ID::评分::时间戳
-
-- 用户ID范围从1到6040
-- 电影ID范围从1到3952
-- 评分被调整为5星的规模(只允许整数的星级)
-- 时间戳表示为从1970-01-01(UTC)来的秒数，与time(2)的返回值一致
-- 每位用户至少有20条评分
-
-### 用户文件描述(users.dat)
-
-所有的用户信息都包含在"users.dat"文件中，遵循如下的格式:
-
-用户ID::性别::年龄::职业::邮编
-
-所有的人口统计学信息由用户自愿提供，没有进行正确性的检查。只有含有人
-口统计学信息的用户才被包含在数据集中。
-
-- 性别，用"M"表示男性，"F"表示女性
-- 年龄从下列列表范围中选取:
-
-	*   1:	"18岁以下"
-	*  18:	"18-24岁"
-	*  25:	"25-34岁"
-	*  35:	"35-44岁"
-	*  45:	"45-49岁"
-	*  50:	"50-55岁"
-	*  56:	"56+"
-
-- 职业从下面所列中选择:
-
-	*   0:  "其他"或不确定
-	*   1:  "学术/教育工作者"
-	*   2:  "艺术家"
-	*   3:  "文书工作/管理员"
-	*   4:  "大学生/研究生"
-	*   5:  "客户服务"
-	*   6:  "医生/医疗保健"
-	*   7:  "行政工作/管理人员"
-	*   8:  "农民"
-	*   9:  "操持家务者"
-	*  10:  "高中毕业生"
-	*  11:  "律师"
-	*  12:  "程序员"
-	*  13:  "退休人员"
-	*  14:  "销售/市场"
-	*  15:  "科学家"
-	*  16:  "自由职业者"
-	*  17:  "技术员/工程师"
-	*  18:  "推销员/手工艺者"
-	*  19:  "无业人士"
-	*  20:  "作家"
-
-### 电影文件描述(movies.dat)
-
-所有的电影信息都包含在"movies.dat"文件中，遵循如下的格式:
-
-电影ID::电影名称::电影类型
-
-- 电影名称（包括发行时间）与IMDB网站提供的一致
-- 电影类型如符合多种用管道符号|分割，选自下列类型:
-
-	*	动作片
-	*	冒险片
-	*	动画片
-	*	儿童片
-	*	喜剧片
-	*	犯罪片
-	*	纪录片
-	*	戏剧
-	*	奇幻片
-	*	黑色电影
-	*	恐怖片
-	*	音乐剧
-	*	悬疑片
-	*	浪漫片
-	*	科幻片
-	*	惊险电影
-	*	战争片
-	*	西部片
-
-- 由于意外的副本记录和测试记录，有些电影ID可能与实际电影不相符合
-- 电影大部分是手工输入数据，因此可能会有一些错误和不一致发生
diff --git a/doc/tutorials/rec/ml_dataset_en.md b/doc/tutorials/rec/ml_dataset_en.md
deleted file mode 100644
index 25dea5c4afbf1ce1c1ac6195cbd245b116459e2e..0000000000000000000000000000000000000000
--- a/doc/tutorials/rec/ml_dataset_en.md
+++ /dev/null
@@ -1,111 +0,0 @@
-```eval_rst
-..  _demo_ml_dataset:
-```
-
-# MovieLens Dataset
-
-The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research.
-The data set contains some user information, movie information, and many movie ratings from \[1-5\].
-The data set comes in several versions, depending on the size of the set.
-We use [MovieLens 1M Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) as a demo dataset, which contains
-1 million ratings from 6000 users on 4000 movies. Released 2/2003.
-
-## Dataset Features
-
-The [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip) contains many features.
-The data files (which have the ".dat" extension) in the [ml-1m Dataset](http://files.grouplens.org/datasets/movielens/ml-1m.zip)
-are essentially CSV files whose delimiter is "::". We quote the description from the README here.
-
-### RATINGS FILE DESCRIPTION(ratings.dat)
-
-
-All ratings are contained in the file "ratings.dat" and are in the
-following format:
-
-UserID::MovieID::Rating::Timestamp
-
-- UserIDs range between 1 and 6040
-- MovieIDs range between 1 and 3952
-- Ratings are made on a 5-star scale (whole-star ratings only)
-- Timestamp is represented in seconds since the epoch as returned by time(2)
-- Each user has at least 20 ratings
-
-### USERS FILE DESCRIPTION(users.dat)
-
-User information is in the file "users.dat" and is in the following
-format:
-
-UserID::Gender::Age::Occupation::Zip-code
-
-All demographic information is provided voluntarily by the users and is
-not checked for accuracy.  Only users who have provided some demographic
-information are included in this data set.
-
-- Gender is denoted by a "M" for male and "F" for female
-- Age is chosen from the following ranges:
-
-	*  1:  "Under 18"
-	* 18:  "18-24"
-	* 25:  "25-34"
-	* 35:  "35-44"
-	* 45:  "45-49"
-	* 50:  "50-55"
-	* 56:  "56+"
-
-- Occupation is chosen from the following choices:
-
-	*  0:  "other" or not specified
-	*  1:  "academic/educator"
-	*  2:  "artist"
-	*  3:  "clerical/admin"
-	*  4:  "college/grad student"
-	*  5:  "customer service"
-	*  6:  "doctor/health care"
-	*  7:  "executive/managerial"
-	*  8:  "farmer"
-	*  9:  "homemaker"
-	* 10:  "K-12 student"
-	* 11:  "lawyer"
-	* 12:  "programmer"
-	* 13:  "retired"
-	* 14:  "sales/marketing"
-	* 15:  "scientist"
-	* 16:  "self-employed"
-	* 17:  "technician/engineer"
-	* 18:  "tradesman/craftsman"
-	* 19:  "unemployed"
-	* 20:  "writer"
-
-### MOVIES FILE DESCRIPTION(movies.dat)
-
-Movie information is in the file "movies.dat" and is in the following
-format:
-
-MovieID::Title::Genres
-
-- Titles are identical to titles provided by the IMDB (including
-year of release)
-- Genres are pipe-separated and are selected from the following genres:
-
-	* Action
-	* Adventure
-	* Animation
-	* Children's
-	* Comedy
-	* Crime
-	* Documentary
-	* Drama
-	* Fantasy
-	* Film-Noir
-	* Horror
-	* Musical
-	* Mystery
-	* Romance
-	* Sci-Fi
-	* Thriller
-	* War
-	* Western
-
-- Some MovieIDs do not correspond to a movie due to accidental duplicate
-entries and/or test entries
-- Movies are mostly entered by hand, so errors and inconsistencies may exist
diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst
deleted file mode 100644
index 9278c9f603b648099f448963bc2246b8dc014ab7..0000000000000000000000000000000000000000
--- a/doc/tutorials/rec/ml_regression_cn.rst
+++ /dev/null
@@ -1,349 +0,0 @@
-MovieLens数据集评分回归模型
-===========================
-
-这里我们在MovieLens数据集描述一种 **余弦相似度回归** 任务。
-该示例将展示paddle如何进行词向量嵌入，处理相似度回归，针对文本
-的单词级别的卷积神经网络，以及paddle如何处理多种类型的输入。
-需要注意的是，该模型网络只是用于进行demo展示paddle如何工作，而
-没有进行结构的微调。
-
-
-**我们非常欢迎您用PADDLEPADDLE构建更好的示例，如果您有好的建议来
-让这个示例变得更好，希望能让我们知晓。**
-
-数据准备
-`````````
-下载并解压数据集
-'''''''''''''''''
-这里我们使用 :ref:`demo_ml_dataset` 。
-要下载和解压数据集，只需要简单的运行下面的命令即可。
-
-.. code-block:: bash
-
-	cd demo/recommendation/data
-	./ml_data.sh
-
-:code:`demo/recommendation/data/ml-1m` 的目录结构为:
-
-.. code-block:: text
-
-	+--ml-1m
-		+--- movies.dat 	# 电影特征
-		+--- ratings.dat 	# 评分
-		+--- users.dat 		# 用户特征
-		+--- README 		# 数据集描述
-
-字段配置文件
-'''''''''''''
-**字段配置文件** 用来具体说明数据集的字段和文件格式，
-例如，说明每个特征文件具体字段是 **什么** 类型。
-
-ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。
-其具体说明了字段类型和文件名称:
-
-1) 用户文件中有四种类型的字段\: 编号，性别，年龄和职业；
-
-2) 文件名称为"users.dat"，文件的分隔符为"::"。
-
-.. include:: ../../../demo/recommendation/data/config.json
-   :code: json
-   :literal:
-
-准备数据
-`````````
-你需要安装python的第三方库。
-**强烈推荐使用VIRTUALENV来创造一个干净的python环境。**
-
-.. code-block:: bash
-
-	pip install -r requirements.txt
-
-预处理数据一般的命令为:
-
-.. code-block:: bash
-
-	cd demo/recommendation
-	./preprocess.sh
-
-下面介绍预处理过程具体的步骤。
-
-提取电影或用户的特征并生成python对象
-'''''''''''''''''''''''''''''''''''''
-
-在movielens 1m数据集中，电影和用户有许多的特征。
-评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。
-我们首先处理电影或用户的特征文件，然后用pickle命令将特征( **Meta** )对象存储为文件。
-
-Meta配置文件
-.............
-
-**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。
-该文件可以从字段配置文件生成，或是手动编辑生成。文件的格式可以
-为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。
-
-要将字段配置文件转化为meta配置文件，只需要运行：
-
-.. code-block:: bash
-
-	cd demo/recommendation/data
-	python config_generator.py config.json > meta_config.json
-
-生成的meta配置文件如下所示：
-
-.. include:: ../../../demo/recommendation/data/meta_config.json
-	:code: json
-	:literal:
-
-在meta文件中有两种特征\: 电影和用户。
-
-* 在电影文件movies.dat中
-	* 我们仅用"::"来分隔每一行
-	* pos 0 代表编号
-	* pos 1 特征：
-		* name是电影名
-		* 利用正则表达式来解析该特征
-		* 基于字母的词嵌入特征
-		* 是序列
-	* pos 2 特征：
-		* name是体裁
-		* type是one hot稠密向量
-		* dictionary由解析自动生成，每一个key由'|'分隔
-* 在用户文件users.dat中
-	* 我们仅用"::"来分隔每一行
-	* pos 0 代表编号
-	* pos 1 特征：
-		* name是性别
-		* 简单的基于字母的词嵌入
-	* pos 2 特征：
-		* name是年龄
-		* 是整个的词嵌入
-		* 嵌入编号会根据单词排序
-	* pos 3 特征：
-		* name是职业
-		* 简单的整个词嵌入
-
-
-Meta文件
-''''''''
-
-有了meta配置文件之后，我们可以生成 **Meta文件** ，该文件是python的pickle对象，
-存储着电影或用户信息。可以运行下面的命令来生成。
-
-.. code-block:: bash
-
-	python meta_generator.py ml-1m meta.bin --config=meta_config.json
-
-meta文件 :code:`meta.bin` 的结构如下：
-
-.. code-block:: text
-
-    +--+ movie
-    |      +--+ __meta__
-    |      |       +--+ raw_meta  # 每个特征的meta配置。列表
-    |      |       |       +
-    |      |       |       |     # 编号字段，我们用编号作为key 
-    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
-    |      |       |       |
-    |      |       |       |     # 电影名字段，嵌入特征字典
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
-    |      |       |       |
-    |      |       |       |     # 体裁字段，体裁字典
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
-    |      |       |
-    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
-    |      |                               # it means there are 2 features for each key.
-    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
-    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
-    |      |
-    |      +--+ 1 # 电影1的特征
-    |      |    +
-    |      |    +---+ [[...], [...]] # title ids, genres dense vector
-    |      |
-    |      +--+ 2
-    |      |
-    |      +--+ ...
-    |
-    +--- user
-           +--+ __meta__
-           |       +
-           |       +--+ raw_meta
-           |       |       +
-           |       |       +--+ id field as user
-           |       |       |
-           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
-           |       |
-           |       +--+ feature_map [1, 2, 3]
-           |
-           +--+ 1 # 用户1的特征
-           |
-           +--+ 2
-           +--+ ...
-
-
-分割训练/测试文件
-''''''''''''''''''
-
-我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是：对于每位用户，我们将评分分成两部分。
-这样的话每位用户在测试文件中将与训练文件含有同样的信息。
-
-用 :code:`split.py` 来分离训练和测试文件。
-
-.. code-block:: bash
-
-	python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
-
-这样就会生成两个文件：:code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.dat.test` 。
-将他们移动到目录 :code:`data` ，然后进行随机打乱，再为paddle的训练过程提供文件列表。
-
-..  code-block:: bash
-
-    shuf ml-1m/ratings.dat.train > ratings.dat.train
-    cp ml-1m/ratings.dat.test .
-    echo "./data/ratings.dat.train" > train.list
-    echo "./data/ratings.dat.test" > test.list
-
-
-神经网络结构配置
-`````````````````
-
-训练器配置文件
-'''''''''''''''
-
-网络结构如下图所示：
-
-..  image:: rec_regression_network.png
-    :align: center
-    :alt: rec_regression_network
-
-该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示：
-
-..  literalinclude:: ../../../demo/recommendation/trainer_config.py
-    :language: python
-    :lines: 15-
-
-在文件 :code:`trainer_config.py` 中，我们仅仅是将每个特征种类映射到一个特征向量中，以下
-展示了如何将每个特征映射到一个向量。
-
-* :code:`id` \: 仅仅是简单的嵌入，然后添加一个全连接层。
-* :code:`embedding` \:
-    - 如果是序列，则先做嵌入，然后再做一次文本卷积网络操作，
-      然后得到平均采样的结果。
-    - 如果不是序列，则先做嵌入，然后添加一个全连接层。
-* :code:`one_hot_dense` \:
-    - 仅仅是两个全连接层。
-
-然后我们利用多输入的 :code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征，
-并且对用户的特征做同样的操作，也得到一个用户特征。然后我们求这两个特征的余弦相似度。
-
-在这些网络中，我们用以下的一些 :ref:`api_trainer_config` 中的接口。
-
-*  数据层， :ref:`api_trainer_config_helpers_layers_data_layer`
-*  全连接层， :ref:`api_trainer_config_helpers_layers_fc_layer`
-*  嵌入层， :ref:`api_trainer_config_helpers_layers_embedding_layer`
-*  文本投影层， :ref:`api_trainer_config_helpers_layers_context_projection`
-*  采样层， :ref:`api_trainer_config_helpers_layers_pooling_layer`
-*  余弦相似度层， :ref:`api_trainer_config_helpers_layers_cos_sim`
-*  文本卷积采样层， :ref:`api_trainer_config_helpers_network_text_conv_pool`
-*  声明Python数据源， :ref:`api_trainer_config_helpers_data_sources` 
-
-数据提供脚本
-'''''''''''''
-
-..  literalinclude:: ../../../demo/recommendation/dataprovider.py
-    :language: python
-    :lines: 15-
-
-数据提供脚本仅仅是读取meta.bin和评分文件，生成训练需要的样本。
-在脚本 :code:`dataprovider.py` 中，我们需要设置：
-
-* obj.slots\: 特征的类型和维度。
-* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。
-* process\: 返回数据的每一条样本给 :code:`paddle` 。
-
-数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider2` 。
-
-训练
-````
-
-准备好数据，配置了网络，编写好数据提供脚本后，现在我们可以开始paddle训练了。
-
-代码 :code:`run.sh` 如下：
-
-..  literalinclude:: ../../../demo/recommendation/run.sh
-    :language: bash
-    :lines: 16-
-
-该脚本仅仅是开始一个paddle训练过程，将日志写入文件 :code:`log.txt` ，然后
-打印在屏幕上。
-
-脚本 :code:`run.sh` 中的每一行命令，请参考页面 :ref:`cmd_line_index` 。
-这些参数的简短介绍如下：
-
-*  config\: 告诉paddle哪个文件是神经网络的配置文件。
-*  save_dir\: 告诉paddle将模型保存在 :code:`./output` 中。
-*  use_gpu\: 是否使用GPU，默认为不使用。
-*  trainer_count\: 一台机器上面的线程数量。
-*  test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则，
-   每个测试周期测试 :code:`batch_size` 批次的数据。
-*  log_period\: 在训练了 :code:`log_period` 批次后打印日志。
-*  dot_period\: 在每训练 :code:`dot_period` 个批次后打印一个 :code:`.` 。
-*  num_passes\: 训练至多 :code:`num_passes` 轮。
-
-如果训练过程启动成功的话，输出应该类似如下：
-
-..  code-block:: text
-
-    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
-
-    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
-
-    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
-
-    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
-
-    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
-    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
-    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
-    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
-    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
-    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
-    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
-
-模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。
-
-模型评估和预测
-```````````````
-
-在训练了几个轮次以后，你可以对模型进行评估，得到最好轮次下的模型。运行下面命令即可：
-
-.. code-block:: bash
-
-    ./evaluate.sh 
-
-你将看到如下的信息：
-
-.. code-block:: text
-
-    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
-    evaluating from pass output/pass-00009
-
-然后，你可以预测任何用户对于任何一部电影的评价，运行下面命令即可：
-
-..  code-block:: bash
-
-    python prediction.py 'output/pass-00009/'
-
-预测程序将读取用户的输入，然后输出预测分数。用户预测的命令行界面如下：
-
-..  code-block:: text
-
-    Input movie_id: 9
-    Input user_id: 4
-    Prediction Score is 2.56
-    Input movie_id: 8
-    Input user_id: 2
-    Prediction Score is 3.13
diff --git a/doc/tutorials/rec/ml_regression_en.rst b/doc/tutorials/rec/ml_regression_en.rst
deleted file mode 100644
index 993b9a516f134ff8b59e8755b721f76c8f32f0fd..0000000000000000000000000000000000000000
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ /dev/null
@@ -1,348 +0,0 @@
-Regression MovieLens Rating
-============================
-
-Here we demonstrate a **Cosine Similarity Regression** job in movie lens dataset.
-This demo will show how paddle does (word) embedding job,
-handles the similarity regression,
-the character-level convolutional networks for text, and how does paddle handle
-multiple types of inputs.
-Note that the model structure is not fine-tuned and just a demo to show how paddle works.
-
-
-YOU ARE WELCOME TO BUILD A BETTER DEMO
-BY USING PADDLEPADDLE, AND LET US KNOW TO MAKE THIS DEMO BETTER.
-
-Data Preparation
-````````````````
-Download and extract dataset
-''''''''''''''''''''''''''''
-We use :ref:`demo_ml_dataset` here. 
-To download and unzip the dataset, simply run the following commands.
-
-..  code-block:: bash
-
-    cd demo/recommendation/data 
-    ./ml_data.sh
-
-And the directory structure of :code:`demo/recommendation/data/ml-1m` is:
-
-..  code-block:: text
-
-    +--ml-1m
-         +--- movies.dat    # movie features
-         +--- ratings.dat   # ratings
-         +--- users.dat     # user features
-         +--- README        # dataset description
-
-Field config file
-'''''''''''''''''
-**Field config file** is used to specify the fields of the dataset and the file format,
-i.e., it specifies **WHAT** type each field is in each feature file.
-
-The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`.
-It specifies the field types and file names: 1) there are four types of fields in the user file\: id, gender, age and occupation;
-2) the filename is "users.dat", and the delimiter of file is "::".
-
-..  include:: ../../../demo/recommendation/data/config.json
-    :code: json
-    :literal:
-
-Preprocess Data
-```````````````
-You need to install python 3rd party libraries.
-IT IS HIGHLY RECOMMENDED TO USE VIRTUALENV TO MAKE A CLEAN PYTHON ENVIRONMENT.
-
-..  code-block:: bash
-
-    pip install -r requirements.txt
-
-The general command for preprocessing the dataset is:
-
-..  code-block:: bash
-
-    cd demo/recommendation
-    ./preprocess.sh
-    
-And the detail steps are introduced as follows.
-
-Extract Movie/User features to python object
-'''''''''''''''''''''''''''''''''''''''''''''
-
-There are many features in movie or user in movielens 1m dataset.
-Each line of rating file just provides a Movie/User id to refer each movie or user.
-We process the movie/user feature file first, and pickle the feature (**Meta**) object as a file.
-
-Meta config file
-................
-
-**Meta config file** is used to specify **HOW** to parse each field in the dataset.
-It could be translated from field config file, or written by hand.
-Its file format could be either json or yaml syntax file. Parser will automatically choose the file format by extension name.
-
-To convert Field config file to meta config file, just run:
-
-..  code-block:: bash
-
-    cd demo/recommendation/data
-    python config_generator.py config.json > meta_config.json
-
-The meta config file shows below:
-
-..  include:: ../../../demo/recommendation/data/meta_config.json
-    :code: json
-    :literal:
-
-There are two kinds of features in meta\: movie and user.
-
-* in movie file, whose name is movies.dat
-   * we just split each line by "::"
-   * pos 0 is id.
-   * pos 1 feature:
-      * name is title.
-      * it uses regex to parse this feature.
-      * it is a char based word embedding feature.
-      * it is a sequence.
-   * pos 2 feature:
-      * name is genres.
-      * type is one hot dense vector.
-      * dictionary is auto generated by parsing, each key is split by '|'
-* in user file, whose name is users.dat
-   * we just split each line by "::"
-   * pos 0 is id.
-   * pos 1 feature:
-       * name is gender
-       * just simple char based embedding.
-   * pos 2 feature:
-       * name is age
-       * just whole word embedding.
-       * embedding ids will be sorted by word.
-   * pos 3 feature:
-       * name is occupation.
-       * just simple whole word embedding.
-
-
-Meta file
-'''''''''
-
-After having meta config file, we can generate **Meta file**, a python pickle object which stores movie/user information.
-The following commands could be run to generate it.
-
-..  code-block:: bash
-
-    python meta_generator.py ml-1m meta.bin --config=meta_config.json
-
-And the structure of the meta file :code:`meta.bin` is:
-
-..  code-block:: text
-
-    +--+ movie
-    |      +--+ __meta__
-    |      |       +--+ raw_meta  # each feature meta config. list
-    |      |       |       +
-    |      |       |       |     # ID Field, we use id as key
-    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
-    |      |       |       |
-    |      |       |       |     # Title field, the dictionary list of embedding.
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
-    |      |       |       |
-    |      |       |       |     # Genres field, the genres dictionary
-    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
-    |      |       |
-    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
-    |      |                               # it means there are 2 features for each key.
-    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
-    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
-    |      |
-    |      +--+ 1 # movie 1 features
-    |      |    +
-    |      |    +---+ [[...], [...]] # title ids, genres dense vector
-    |      |
-    |      +--+ 2
-    |      |
-    |      +--+ ...
-    |
-    +--- user
-           +--+ __meta__
-           |       +
-           |       +--+ raw_meta
-           |       |       +
-           |       |       +--+ id field as user
-           |       |       |
-           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
-           |       |       |
-           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
-           |       |
-           |       +--+ feature_map [1, 2, 3]
-           |
-           +--+ 1 # user 1 features
-           |
-           +--+ 2
-           +--+ ...
-
-
-Split Training/Testing files
-''''''''''''''''''''''''''''
-
-We split :code:`ml-1m/ratings.dat` into a training file and a testing file. For each user, we split that user's
-ratings into two parts, so each user in the testing file also has some rating information in the training file.
-
-Use :code:`split.py` to separate the training and testing files.
-
-..  code-block:: bash
-
-    python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
-
-Then two files will be generated\: :code:`ml-1m/ratings.dat.train` and :code:`ml-1m/ratings.dat.test`.
-Move them to workspace :code:`data`, shuffle the train file, and prepare the file list for paddle train.
-
-..  code-block:: bash
-
-    shuf ml-1m/ratings.dat.train > ratings.dat.train
-    cp ml-1m/ratings.dat.test .
-    echo "./data/ratings.dat.train" > train.list
-    echo "./data/ratings.dat.test" > test.list
-
-
-Neural Network Configuration
-````````````````````````````
-
-Trainer Config File
-'''''''''''''''''''
-
-The network structure shows below.
-
-..  image:: rec_regression_network.png
-    :align: center
-    :alt: rec_regression_network
-
-The demo's neural network config file :code:`trainer_config.py` is shown below.
-
-..  literalinclude:: ../../../demo/recommendation/trainer_config.py
-    :language: python
-    :lines: 15-
-
-In this :code:`trainer_config.py`, we just map each feature type to
-a feature vector. The following shows how each feature type is mapped to a vector.
-
-* :code:`id`\: Just simple embedding, and then add to fully connected layer.
-* :code:`embedding`\:
-    - if is_sequence, get the embedding and do a text convolutional operation,
-      get the average pooling result.
-    - if not sequence, get the embedding and add to fully connected layer.
-* :code:`one_hot_dense`\:
-    - just two fully connected layer.
-
-Then we combine each features of movie into one movie feature by a
-:code:`fc_layer` with multiple inputs, and do the same thing to user features,
-get one user feature. Then we calculate the cosine similarity of these two
-features.
-
-In these networks, we use several APIs in :ref:`api_trainer_config` . There are
-
-*  Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer`
-*  Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
-*  Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer`
-*  Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection`
-*  Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer`
-*  Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim`
-*  Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool`
-*  Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`.
-
-Data Provider
-'''''''''''''
-
-..  literalinclude:: ../../../demo/recommendation/dataprovider.py
-    :language: python
-    :lines: 15-
-
-The data provider just reads the meta.bin and rating files, and yields each sample for training.
-In this :code:`dataprovider.py`, we should set\:
-
-* obj.slots\: The feature types and dimension.
-* use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not.
-* process\: Return each sample of data to :code:`paddle`.
-
-The data provider details document see :ref:`api_pydataprovider2`.
-
-Train
-`````
-
-After preparing the data, configuring the network, and writing the data provider, we can now run paddle training.
-
-The :code:`run.sh` script is shown as follows:
-
-..  literalinclude:: ../../../demo/recommendation/run.sh
-    :language: bash
-    :lines: 16-
-
-It just starts a paddle training process, writes the log to :code:`log.txt`,
-and then prints it on screen.
-
-For each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. A short description of these arguments is as follows.
-
-*  config\: Tell paddle which file is the neural network configuration.
-*  save_dir\: Tell paddle to save the model into :code:`./output`.
-*  use_gpu\: Use gpu or not. Default is false.
-*  trainer_count\: The compute thread in one machine.
-*  test_all_data_in_one_period\: Test All Data during one test period. Otherwise,
-   will test a :code:`batch_size` data in one test period.
-*  log_period\: Print log after training :code:`log_period` batches.
-*  dot_period\: Print a :code:`.` after training :code:`dot_period` batches.
-*  num_passes\: Train for at most :code:`num_passes` passes.
-
-If the training process starts successfully, the output looks like the following:
-
-..  code-block:: text
-
-    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
-
-    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
-
-    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
-
-    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
-
-    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
-    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
-    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
-    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
-    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
-    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
-    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
-
-The model is saved in :code:`output/` directory. You can use :code:`Ctrl-C` to stop training whenever you want.
-
-Evaluate and Predict
-````````````````````
-
-After training several passes, you can evaluate them and get the best pass. Just run
-
-.. code-block:: bash
-
-    ./evaluate.sh 
-
-You will see messages like this:
-
-.. code-block:: text
-
-    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
-    evaluating from pass output/pass-00009
-
-Then, you can predict what any user will rate a movie. Just run
-
-..  code-block:: bash
-
-    python prediction.py 'output/pass-00009/'
-
-Predictor will read user input, and predict scores. It has a command-line user interface as follows:
-
-..  code-block:: text
-
-    Input movie_id: 9
-    Input user_id: 4
-    Prediction Score is 2.56
-    Input movie_id: 8
-    Input user_id: 2
-    Prediction Score is 3.13
diff --git a/doc/tutorials/rec/rec_regression_network.png b/doc/tutorials/rec/rec_regression_network.png
deleted file mode 100644
index 7d2b54d4fcf560cd5b667628f0012c3822efd9b2..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/rec/rec_regression_network.png and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/feature.jpg b/doc/tutorials/semantic_role_labeling/feature.jpg
deleted file mode 100644
index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/semantic_role_labeling/feature.jpg and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md
deleted file mode 100644
index f6061766c038a7bb6e4ae376685a10cd5669d2ed..0000000000000000000000000000000000000000
--- a/doc/tutorials/semantic_role_labeling/index_cn.md
+++ /dev/null
@@ -1,201 +0,0 @@
-# 语义角色标注教程 #
-
-语义角色标注（Semantic role labeling, SRL）是浅层语义解析的一种形式，其目的是在给定的输入句子中发现每个谓词的谓词论元结构。 SRL作为很多自然语言处理任务中的中间步骤是很有用的，如信息提取、文档自动分类和问答。 实例如下 [1]:
-
- [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
-
-- V: 动词
-- A0: 接受者
-- A1: 接受的东西
-- A2: 从……接受
-- A3: 属性
-- AM-MOD: 情态动词 
-- AM-NEG: 否定
-
-给定动词“accept”，句子中的组块将会扮演某些语义角色。这里，标签方案来自 Penn Proposition Bank。
-
-到目前为止，大多数成功的SRL系统是建立在某种形式的句法分析结果之上的，使用了基于句法结构的预定义特征模板。 本教程将介绍使用深度双向长短期记忆（DB-LSTM）模型[2]的端到端系统来解决SRL任务，这在很大程度上优于先前的最先进的系统。 这个系统将SRL任务视为序列标注问题。
-
-## 数据描述
-相关论文[2]采用 CoNLL-2005＆2012 共享任务中设置的数据进行训练和测试。由于数据许可的原因，演示采用 CoNLL-2005 的测试数据集，可以在网站上找到。
-
-用户只需执行以下命令就可以下载并处理原始数据：
-
-```bash
-cd data
-./get_data.sh
-```
-`data `目录会出现如下几个新的文件：
-```bash
-conll05st-release：the test data set of CoNll-2005 shared task 
-test.wsj.words：the Wall Street Journal data sentences
-test.wsj.props:  the propositional arguments
-feature: the extracted features from data set
-```
-
-## 训练
-### DB-LSTM
-请参阅情感分析的演示以了解有关长期短期记忆单元的更多信息。
-
-与在 Sentiment Analysis 演示中使用的 Bidirectional-LSTM 不同，DB-LSTM 采用另一种方法来堆叠LSTM层。首先，标准LSTM以正向处理该序列。该 LSTM 层的输入和输出作为下一个 LSTM 层的输入，并被反向处理。这两个标准 LSTM 层组成一对 LSTM。然后我们堆叠一对对的 LSTM 层后得到深度 LSTM 模型。
-
-下图展示了时间扩展的2层 DB-LSTM 网络。
-<center>
-![pic](./network_arch.png)
-</center>
-
-### 特征
-两个输入特征在这个流程中起着至关重要的作用：predicate（pred）和argument（arguments）。 还采用了两个其他特征：谓词上下文（ctx-p）和区域标记（mr）。 因为单个谓词不能精确地描述谓词信息，特别是当相同的词在句子中出现多于一次时。 使用谓词上下文，可以在很大程度上消除歧义。类似地，如果它位于谓词上下文区域中，则使用区域标记 m<sub>r</sub> = 1 来表示参数位置，反之则 m<sub>r</sub> = 0。这四个简单的特征是我们的SRL系统所需要的。上下文大小设置为1的一个样本的特征如下[2]所示：
-<center>
-![pic](./feature.jpg)
-</center>
-
-在这个示例中，相应的标记句子是：
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
-
-在演示中, 我们采用上面的特征模板, 包括：  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` 并使用 `B/I/O` 方案来标记每个参数。这些特征和标签存储在 `feature` 文件中, 用`\t`分割。
-
-### 数据提供
-
-`dataprovider.py` 是一个包装数据的 Python 文件。 函数 `hook()` 定义了网络的数据槽。六个特征和标签都是索引槽。
-```
-def hook(settings, word_dict, label_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-相应的数据迭代器如下：
-```
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-函数 `process` 返回8个特征list和1个标签list。
-
-### 神经网络配置
-
-`db_lstm.py` 是在训练过程中加载字典并定义数据提供程序模块和网络架构的神经网络配置文件。
-
-九个 `data_layer` 从数据提供程序加载实例。八个特征分别转换为向量，并由`mixed_layer`混合。 深度双向LSTM层提取softmax层的特征。目标函数是标签的交叉熵。
-
-### 训练 
-训练的脚本是 `train.sh`，用户只需执行:
-```bash
-  ./train.sh
-```
-`train.sh` 中的内容：
-```
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py : 网络配置文件
--  \--use_gpu=false: 使用 CPU 训练（如果已安装 PaddlePaddle GPU版本并想使用 GPU 训练可以设置为true，目前 crf_layer 不支持 GPU）
--  \--log_period=500: 每20个batch输出日志
--  \--trainer_count=1: 设置线程数（或 GPU 数）
--  \--show_parameter_stats_period=5000: 每100个batch显示参数统计
--  \--save_dir=./output: 模型输出路径
--  \--num_passes=10000: 设置数据遍历次数，一个pass意味着PaddlePaddle训练数据集中的所有样本被遍历一次
--  \--average_test_period=10000000:  每个 average_test_period 批次对平均参数进行测试
--  \--init_model_path=./data: 参数初始化路径
--  \--load_missing_parameter_strategy=rand: 随机初始不存在的参数
--  \--test_all_data_in_one_period=1: 在一个周期内测试所有数据
-
-
-训练后，模型将保存在目录`output`中。 我们的训练曲线如下：
-<center>
-![pic](./src/curve.jpg)
-</center>
-
-### 测试
-测试脚本是 `test.sh`, 执行:
-```bash
-  ./test.sh
-```
-`tesh.sh` 的主要部分：
-```
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: 网络配置文件
-  - \--model_list=$model_list.list: 模型列表文件
-  - \--job=test: 指示测试任务
-  - \--config_args=is_test=1: 指示测试任务的标记
-  - \--test_all_data_in_one_period=1: 在一个周期内测试所有数据
-  
-
-### 预测
-预测脚本是 `predict.sh`，用户只需执行：
-```bash
-  ./predict.sh
-  
-```
-在`predict.sh`中，用户应该提供网络配置文件，模型路径，标签文件，字典文件，特征文件。
-```
-python predict.py 
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` 是主要的可执行python脚本，其中包括函数：加载模型，加载数据，数据预测。网络模型将输出标签的概率分布。 在演示中，我们使用最大概率的标签作为结果。用户还可以根据概率分布矩阵实现柱搜索或维特比解码。
-
-预测后，结果保存在 `predict.res` 中。
-
-## 引用
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/tutorials/semantic_role_labeling/index_en.md b/doc/tutorials/semantic_role_labeling/index_en.md
deleted file mode 100644
index 92d7c634832119c718711a57c16f69492d405f28..0000000000000000000000000000000000000000
--- a/doc/tutorials/semantic_role_labeling/index_en.md
+++ /dev/null
@@ -1,204 +0,0 @@
-```eval_rst
-..  _semantic_role_labeling:
-```
-
-# Semantic Role labeling Tutorial #
-
-Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering.  An instance is as following [1]:
-
- [ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ]. 
-
-- V: verb
-- A0: acceptor
-- A1: thing accepted
-- A2: accepted-from
-- A3: Attribute
-- AM-MOD: modal 
-- AM-NEG: negation
-
-Given the verb "accept", the chunks in sentence would play certain semantic roles. Here, the label scheme is from Penn Proposition Bank. 
-
-To this date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards SRL task as the sequence labelling problem. 
-
-## Data Description
-The relevant paper[2] takes the data set in CoNLL-2005&2012 Shared Task for training and testing. Accordingto data license,  the demo adopts the test data set of CoNLL-2005, which can be reached on website.
-
-To download and process the original data, user just need to execute the following command:
-
-```bash
-cd data
-./get_data.sh
-```
-Several new files appear in the `data `directory as follows.
-```bash
-conll05st-release：the test data set of CoNll-2005 shared task 
-test.wsj.words：the Wall Street Journal data sentences
-test.wsj.props:  the propositional arguments
-feature: the extracted features from data set
-```
-
-## Training
-### DB-LSTM
-Please refer to the Sentiment Analysis demo to learn more about the long short-term memory unit. 
-
-Unlike Bidirectional-LSTM that used in Sentiment Analysis demo,  the DB-LSTM adopts another way to stack LSTM layer. First a standard LSTM processes the sequence in forward direction. The input and output of this LSTM layer are taken by the next LSTM layer as input, processed in reversed direction. These two standard LSTM layers compose a pair of LSTM. Then we stack LSTM layers pair after pair to obtain the deep LSTM model. 
-
-The following figure shows a temporal expanded 2-layer DB-LSTM network.
-<center>
-![pic](./src/network_arch.png)
-</center>
-
-### Features
-Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote the argument position if it locates in the predicate context region, or m<sub>r</sub> = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
-<center>
-![pic](./src/feature.jpg)
-</center>
-
-In this sample, the coresponding labelled sentence is:
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
-
-In the demo, we adopt the feature template as above, consists of :  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` and use `B/I/O` scheme to label each argument. These features and labels are stored in `feature` file, and separated by `\t`.
-
-### Data Provider
-
-`dataprovider.py` is the python file to wrap data. `hook()` function is to define the data slots for network. The  Six features and label are all IndexSlots.
-```
-def hook(settings, word_dict, label_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-The corresponding data iterator is as following:
-```
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-The `process`function yield 9 lists which are 8 features and label.
- 
-### Neural Network Config
-`db_lstm.py` is the neural network config file to load the dictionaries and define the  data provider module and network architecture during the training procedure. 
-
-Nine `data_layer` load instances from data provider. Eight features are transformed into embedddings respectively, and mixed by `mixed_layer` .  Deep bidirectional LSTM layers extract features for the softmax layer. The objective function is cross entropy of labels.
-
-### Run Training 
-The script for training is `train.sh`, user just need to execute:
-```bash
-  ./train.sh
-```
-The content in `train.sh`:
-```
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py : network config file.
--  \--use_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train, until now crf_layer do not support GPU
--  \--log_period=500: print log every 20 batches.
--  \--trainer_count=1: set thread number (or GPU count).
--  \--show_parameter_stats_period=5000: show parameter statistic every 100 batches.
--  \--save_dir=./output: output path to save models.
--  \--num_passes=10000: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
--  \--average_test_period=10000000:  do test on average parameter every average_test_period batches
--  \--init_model_path=./data: parameter initialization path 
--  \--load_missing_parameter_strategy=rand: random initialization unexisted parameters
--  \--test_all_data_in_one_period=1: test all data in one period
-
-
-After training, the models  will be saved in directory `output`. Our training curve is as following:
-<center>
-![pic](./src/curve.jpg)
-</center>
-
-### Run testing
-The script for testing is `test.sh`, user just need to execute:
-```bash
-  ./test.sh
-```
-The main part in `tesh.sh`
-```
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: network config file
-  - \--model_list=$model_list.list: model list file
-  - \--job=test: indicate the test job
-  - \--config_args=is_test=1: flag to indicate test
-  - \--test_all_data_in_one_period=1: test all data in 1 period
-  
-
-### Run prediction
-The script for prediction is `predict.sh`, user just need to execute:
-```bash
-  ./predict.sh
-  
-```
-In `predict.sh`, user should offer the network config file, model path, label file, word dictionary file, feature file
-```
-python predict.py 
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` is the main executable python script, which includes functions: load model, load data, data prediction. The network model will output the probability distribution of labels. In the demo, we take the label with maximum probability as result. User can also implement the beam search or viterbi decoding upon the probability distribution matrix.
-
-After prediction,  the result is saved in `predict.res`.
-
-## Reference
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/tutorials/semantic_role_labeling/network_arch.png b/doc/tutorials/semantic_role_labeling/network_arch.png
deleted file mode 100644
index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/semantic_role_labeling/network_arch.png and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/src/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg
deleted file mode 100644
index baa35ae7f0a0b6c246f3a0d331735477ab8bcd70..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/semantic_role_labeling/src/curve.jpg and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg
deleted file mode 100644
index 0e3310e4ace5613917e7779d3198ccbb3cdc5ada..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/semantic_role_labeling/src/feature.jpg and /dev/null differ
diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png
deleted file mode 100644
index 4ae7864212f2a0a38102ee7ff600527ea99fec82..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/semantic_role_labeling/src/network_arch.png and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/index_cn.md b/doc/tutorials/sentiment_analysis/index_cn.md
deleted file mode 100644
index 1323ec1a6abb2e7b5eeb2fbfff9cce5fe78a2c06..0000000000000000000000000000000000000000
--- a/doc/tutorials/sentiment_analysis/index_cn.md
+++ /dev/null
@@ -1,325 +0,0 @@
-# 情感分析教程
-
-情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
-
-情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
-
-另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
-
-本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
-
-## 数椐准备
-
-### IMDB 数椐介绍
-
-训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-如果数椐获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: 从外部网站上下载的原始数椐集。
-* imdb: 仅包含训练和测试数椐集。
-* mosesdecoder-master: Moses 工具。
-
-IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下：
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: 训练数椐集。
-* test : 测试数椐集。
-* imdb.vocab: 字典文件。
-* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
-* README: 数椐说明文档。
-
-测试集和训练集目录包含下面的文件:
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* unsup: 未标记的评价样本，包含50,000个txt文件。
-* urls_xx.txt: 每个评论的网址。
-* xxBow.feat: 用于统计词频的Bow模型特征。
-
-### IMDB 数椐准备
-
-在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i data_dir
-```
-
-* data_dir: 输入数椐所在目录。
-* preprocess.py: 预处理脚本。
-
-运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
-* train.list and test.list: 训练集和测试集文件列表。
-* dict.txt: 利用训练集生成的字典。
-* labels.txt: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
-
-### 用户自定义数椐预处理
-
-如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: 一级目录。
-* train, test: 二级目录。
-* class1,class2,...: 三级目录。
-* text_files: 文本格式的实例文件。
-
-所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
-
-## 训练模型
-
-在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。
-
-<center>![LSTM](src/lstm.png)</center>
-<center>图表 1. LSTM [3]</center>
-
-情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
-
-在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
-
-#### 双向LSTM
-
-图2是双向LSTM网络，后面连全连接层和softmax层。
-
-<center>![BiLSTM](src/bi_lstm.jpg)</center>
-<center>图 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
-
-<center>![StackedLSTM](src/stacked_lstm.jpg)</center>
-<center>图 3. Stacked-LSTM for sentiment analysis </center>
-
-**配置**
-
-进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **数椐定义**:
-   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
-   * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档
-
-* **算法配置**:
-   * 使用随机梯度下降（sgd）算法。
-   * 使用 adam 优化。
-   * 设置batch size大小为128。
-   * 设置平均sgd窗口。
-   * 设置全局学习率。
-* **网络配置**:
-   * dict_dim: 获取字典维度。
-   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
-   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
-   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
-
-**训练**
-
-首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: 设置网络配置。
-* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
-* \--job=train: 设置工作模式为训练。
-* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
-* \--trainer\_count=4:设置线程数（或GPU个数）。
-* \--num\_passes=15: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
-* \--log\_period=20: 每20个batch打印一次日志。
-* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
-* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
-
-如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: 表示训练了xx个Batch。
-- samples=xx: 表示训练了xx个样本。。
-- AvgCost=xx: 从第0个batch到当前batch的平均损失。
-- CurrentCost=xx: 最新log_period个batch处理的当前损失。
-- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
-- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
-- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
-
-默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
-
-## 测试模型
-
-测试模型是指使用训练出的模型评估已标记的验证集。
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job = test`和模型路径，即`--model_list = $model_list`。如果运行成功，日志将保存在“demo / sentiment / test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output / pass-00002`，分类误差是0.115645，如下：
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## 预测
-
-`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
-```
-
-* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。
-* `predict.py` : 预测接口脚本。
-* `--tconf=$config` : 设置网络配置。
-* `--model=$model` : 设置模型路径。
-* `--label=$label` : 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
-* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。
-* `--batch_size=1` : 设置batch size。
-
-注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。
-
-本示例的预测结果：
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-我们真诚地感谢您的关注，并欢迎您来参与贡献。
-
-## 参考文档
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/tutorials/sentiment_analysis/index_en.md b/doc/tutorials/sentiment_analysis/index_en.md
deleted file mode 100644
index bb7681db44ca6f286ad6935ddfecb9becb429192..0000000000000000000000000000000000000000
--- a/doc/tutorials/sentiment_analysis/index_en.md
+++ /dev/null
@@ -1,328 +0,0 @@
-# Sentiment Analysis Tutorial
-
-Sentiment analysis has many applications. A basic task in sentiment analysis is classifying the polarity of a given text at the document, sentence or feature/aspect level. One simple example is to classify the customer reviews in a shopping website, a tourism website, and group buying websites like Amazon, TaoBao, Tmall etc.
-
-Sentiment analysis is also used to monitor social media based on large amount of reviews or blogs. For example, the researchers analyzed several surveys on consumer confidence and political opinion, found they correlate to sentiment word frequencies in contemporaneous Twitter messages [1]. Another example is to forecast stock movements through analyzing the text content of a daily Twitter blog [2].
-
-On the other hand, grabbing the user comments of products and analyzing their sentiment are useful to understand user preferences for companies, products, even competing products.
-
-This tutorial will guide you through the process of training a Long Short Term Memory (LSTM) Network to classify the sentiment of sentences from [Large Movie Review Dataset](http://ai.stanford.edu/~amaas/data/sentiment/), sometimes known as the Internet Movie Database (IMDB). This dataset contains movie reviews along with their associated binary sentiment polarity labels, namely positive and negative. So randomly guessing yields 50% accuracy.
-
-## Data Preparation
-
-### IMDB Data Introduction
-
-Before training models, we need to preprocess the data and build a dictionary. First, you can use following script to download IMDB dataset and [Moses](http://www.statmt.org/moses/) tool, which is a statistical machine translation system. We provide a data preprocessing script, which is capable of handling not only IMDB data, but also other user-defined data. In order to use the pre-written script, it needs to move labeled train and test samples to another path, which has been done in `get_imdb.sh`.
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-If the data is obtained successfully, you will see the following files at ```./demo/sentiment/data```:
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: raw dataset downloaded from website.
-* imdb: only contains train and test data.
-* mosesdecoder-master: Moses tool.
-
-IMDB dataset contains 25,000 highly polar movie reviews for training, and 25,000 for testing. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. After running `./get_imdb.sh`, we can find the dataset has the following structure in `aclImdb`.
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: train sets.
-* test : test sets.
-* imdb.vocab: dictionary.
-* imdbEr.txt: expected rating for each token in imdb.vocab.
-* README: data documentation.
-
-The file in train set directory is as follows. The test set also contains them except `unsup` and `urls_unsup.txt`.
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: positive samples, contains 12,500 txt files, each file is one movie review.
-* neg: negative samples, contains 12,500 txt files, each file is one movie review.
-* unsup: unlabeled samples, contains 50,000 txt files.
-* urls_xx.txt: urls of each reviews.
-* xxBow.feat: already-tokenized bag of words (BoW) features.
-
-### IMDB Data Preparation
-
-In this demo, we only use the labeled train and test sets and do not use imdb.vocab as the dictionary. By default, the dictionary is built on the train set. The train set is shuffled and the test set is not. `tokenizer.perl` in the Moses tool is used to tokenize the words and punctuation. Simply execute the following command to preprocess the data.
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i data_dir
-```
-
-* data_dir: input data directory.
-* preprocess.py: preprocess script.
-
-If running successfully, you will see `demo/sentiment/data/pre-imdb` directory as follows:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: all labeled test and train sets. Train sets have been shuffled.
-* train.list and test.list: train and test file lists.
-* dict.txt: dictionary generated on train sets by default.
-* labels.list: neg 0, pos 1, meaning label 0 is a negative review and label 1 is a positive review.
-
-### User-defined Data Preparation
-
-If you perform another sentiment classification task, you can prepare data as follows. We have provided the scripts to build the dictionary and preprocess data, so just organize your data as follows.
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: 1st directory.
-* train, test: 2nd directory.
-* class1,class2,...: 3rd directory.
-* text_files: samples with text file format.
-
-All samples in text file format under the same folder belong to the same category. Each text file contains one or more samples and each line is one sample. In order to shuffle fully, the preprocessing is a little different for data with multiple lines in one text file, which requires setting `-m True` in `preprocess.sh`. `tokenizer.perl` is used by default. If you don't need it, just set `-t False` in `preprocess.sh`.
-
-## Training
-
-In this task, we use a Recurrent Neural Network (RNN) with the LSTM architecture to train the sentiment analysis model. The LSTM model was introduced primarily in order to overcome the problem of vanishing gradients. An LSTM network resembles a standard recurrent neural network with a hidden layer, but each ordinary node in the hidden layer is replaced by a memory cell. Each memory cell contains four main elements: an input gate, a neuron with a self-recurrent connection, a forget gate and an output gate. More details can be found in the literature [4]. The biggest advantage of the LSTM architecture is that it learns to memorize information over long time intervals without the loss of short time memory. At each time step with a new coming word, historical information stored in the memory block is updated to iteratively learn the sequence representation.
-
-<center>![LSTM](./lstm.png)</center>
-<center>Figure 1. LSTM [3]</center>
-
-Sentiment analysis is among the most typical problems in natural language understanding. It aims at predicting the attitude expressed in a sequence. Usually, only some key words, like adjectives and adverbs words, play a major role in predicting the sentiment of sequences or paragraphs. However, some review or comment contexts are very long, such as IMDB dataset. We use LSTM to perform this task for its improved design with the gate mechanism. First, it is able to summarize the representation from word level to context level with variable context length which is adapted by the gate values. Second, it can utilize the expanded context at the sentence level, while most methods are good at utilizing n-gram level knowledge. Third, it learns the paragraph representation directly rather than combining the context level information. This results in this end-to-end framework.
-
-In this demo we provide two networks, namely a bidirectional LSTM and a three-layer stacked LSTM.
-
-#### Bidirectional-LSTM
-
-One is a bidirectional LSTM network, connected by fully connected layer and softmax, as shown in Figure 2.
-
-<center>![BiLSTM](./bi_lstm.jpg)</center>
-<center>Figure 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-The other is the three-layer LSTM structure in Figure 3. The bottom of the figure is word embedding. Next, three LSTM-Hidden layers are connected and the second LSTM is reversed. Then extract the maximum hidden vectors of all time step of hidden and LSTM layer as the representation for the entire sequence. Finally, a fully connected feed forward layer with softmax activation is used to perform the classification task. This network follows paper [5].
-
-<center>![StackedLSTM](./stacked_lstm.jpg)</center>
-<center>Figure 3. Stacked-LSTM for sentiment analysis </center>
-
-**Config**
-
-Switch into the `demo/sentiment` directory. The `trainer_config.py` file is an example of the config, containing the algorithm and network configuration. The first line imports predefined networks from `sentiment_net.py`.
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  average_window=0.5,
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **Data Definition**:
-   * get\_config\_arg(): get arguments set by `--config_args=xx` on the command line.
-   * Define data provider, here using Python interface to load data. For details, you can refer to the document of PyDataProvider2.
-
-* **Algorithm Configuration**:
-   * set batch size of 128.
-   * set global learning rate.
-   * use adam optimization.
-   * set average sgd window.
-   * set L2 regularization.
-   * set gradient clipping threshold.
-* **Network Configuration**:
-   * dict_dim: dictionary dimension.
-   * class_dim: category number, IMDB has two label, namely positive and negative label.
-   * `stacked_lstm_net`: predefined network as shown in Figure 3, use this network by default.
-   * `bidirectional_lstm_net`: predefined network as shown in Figure 2.
-
-**Training**
-
-Install PaddlePaddle first if necessary. Then you can use script `train.sh` as follows to launch local training.
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: set network config.
-* \--save\_dir=$output: set output path to save models.
-* \--job=train: set job mode to train.
-* \--use\_gpu=false: use CPU to train, set true, if you install GPU version of PaddlePaddle and want to use GPU to train.
-* \--trainer\_count=4: set thread number (or GPU count).
-* \--num\_passes=10: set pass number, one pass in PaddlePaddle means training all samples in dataset one time.
-* \--log\_period=20: print log every 20 batches.
-* \--show\_parameter\_stats\_period=100: show parameter statistic every 100 batches.
-* \--test\_all_data\_in\_one\_period=1: test all data every testing.
-
-If the run succeeds, the output log is saved in path of `demo/sentiment/train.log` and model is saved in path of `demo/sentiment/model_output/`. The output log is explained as follows.
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: means passing xx batches.
-- samples=xx: means passing xx samples.
-- AvgCost=xx: averaged cost from 0-th batch to current batch.
-- CurrentCost=xx: current cost of latest log_period batches.
-- Eval: classification\_error\_evaluator=xx: means classification error from 0-th batch to current batch.
-- CurrentEval: classification\_error\_evaluator: current classification error of the latest log_period batches.
-- Pass=0: Going through all training set one time is called one pass. 0 means going through training set first time.
-
-By default, we use the `stacked_lstm_net` network, which converges at a faster rate than `bidirectional_lstm_net` when passing same sample number. If you want to use bidirectional LSTM, just remove comment in the last line and comment `stacked_lstm_net`.
-
-## Testing
-
-Testing means evaluating the labeled validation set using trained model.
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-The function `get_best_pass` gets the best model by classification error rate for testing. In this example, We use test dataset of IMDB as validation by default. Unlike training, it needs to specify `--job=test` and model path, namely `--model_list=$model_list` here. If running successfully, the log is saved in path of `demo/sentiment/test.log`. For example, in our test, the best model is `model_output/pass-00002`, the classification error is 0.115645 as follows.
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## Prediction
-
-`predict.py` provides a predicting interface. You should install python api of PaddlePaddle before using it. One example to predict unlabeled review of IMDB is as follows. Simply running:
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002, you should make sure the model path
-#exists or change the model path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
-     --tconf=$config\
-     --model=$model \
-     --label=$label \
-     --dict=./data/pre-imdb/dict.txt \
-     --batch_size=1
-```
-
-* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample.
-* `predict.py` : predicting interface.
-* `--tconf=$config` : set network configure.
-* ` --model=$model` : set model path.
-* `--label=$label` : set dictionary about corresponding relation between integer label and string label.
-* `--dict=data/pre-imdb/dict.txt` : set dictionary.
-* `--batch_size=1` : set batch size.
-
-Note you should make sure the default model path `model_output/pass-00002`
-exists or change the model path.
-
-Predicting result of this example:
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-We sincerely appreciate your interest and welcome your contributions.
-
-## Reference
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/tutorials/sentiment_analysis/lstm.png b/doc/tutorials/sentiment_analysis/lstm.png
deleted file mode 100644
index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/sentiment_analysis/lstm.png and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
deleted file mode 100644
index adec1606d64d6e35ffe7e62abfa9a09309b05c84..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png
deleted file mode 100644
index aaf1fc690da2ffb8418cde5ed81848ddb5263030..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/sentiment_analysis/src/lstm.png and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
deleted file mode 100644
index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/stacked_lstm.jpg
deleted file mode 100644
index 4239055050966e0095e188a8c81d860711bce29d..0000000000000000000000000000000000000000
Binary files a/doc/tutorials/sentiment_analysis/stacked_lstm.jpg and /dev/null differ
diff --git a/doc/tutorials/text_generation/index_cn.md b/doc/tutorials/text_generation/index_cn.md
deleted file mode 100644
index 41a87b926db399d692d677e5278e7d5a0b7b5594..0000000000000000000000000000000000000000
--- a/doc/tutorials/text_generation/index_cn.md
+++ /dev/null
@@ -1,339 +0,0 @@
-# 文本生成教程 #
-
-在语言生成领域中，“序列到序列”（sequence to sequence）的方法已被证明是一种强大的模型。它可以被应用于进行机器翻译（machine translation）、query改写（query rewriting）、图像描述（image captioning）等等。
-
-本篇教程将会指导你通过训练一个“序列到序列”的神经网络机器翻译（NMT）模型来将法语翻译成英语。
-
-我们遵循 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) 这篇文章，其中详细说明了模型架构，以及在WMT-14数据集上得到良好表现的训练过程。本篇教程在PaddlePaddle中重现了这一良好的训练结果。
-
-我们感谢@caoying的pull request，其中定义了模型架构和solver配置。
-
-## 数据准备 ##
-### 下载与解压缩 ###
-从该链接 [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/) 下载WMT-14数据集，然后解压，并将Develop和Test数据分别放入不同的文件夹。
-
-- **Train data**: [bitexts (选择过后的)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)
-- **Develop and Test data**: [dev 与 test 数据](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)
-
-在Linux下，只需要简单地运行以下命令。否则你需要自己下载、解压、拆分到不同文件夹、并且分别重命名文件后缀。
-
-```bash
-cd demo/seqToseq/data
-./wmt14_data.sh
-```
-
-我们会发现数据集 `wmt14` 中包含如下表所示的3个文件夹。
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-
-<thead>
-<tr>
-<th scope="col" class="left">folder name</th>
-<th scope="col" class="left">French-English parallel corpora file</th>
-<th scope="col" class="left">number of total file</th>
-<th scope="col" class="left">size</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">train_data</td>
-<td class="left">ccb2_pc30.src, ccb2_pc30.trg, etc</td>
-<td class="left">12</td>
-<td class="left">3.55G</td>
-</tr>
-
-<tr>
-<td class="left">test_data</td>
-<td class="left">ntst1213.src, ntst1213.trg</td>
-<td class="left">2</td>
-<td class="left">1636k</td>
-</tr>
-
-<tr>
-<td class="left">gen_data</td>
-<td class="left">ntst14.src, ntst14.trg</td>
-<td class="left">2</td>
-<td class="left">864k</td>
-</tr>
-</tbody>
-</table>
-<br/>
-
-- 每个文件夹都包含法语到英语的平行语料库
-- **XXX.src** 是原始法语文件；**XXX.trg** 是目标英语文件
-- **XXX.src** 和 **XXX.trg** 的行数应该一致
-- 每行都是一个法语或者英语的句子
-- **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都有着一一对应的关系
-
-### 用户自定义数据集 ###
-
-如果你想进行诸如语义转述（Paraphrasing）等其他“序列到序列”的任务，你只需要按照如下方式组织数据，并将它们放在`demo/seqToseq/data`目录下：
-
-    dataset
-      train
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      test
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      gen
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-  
-- 一级目录：数据集文件夹名称
-- 二级目录：train、test和gen这三个文件夹是固定的
-- 三级目录：源语言到目标语言的平行语料库文件
-  - **XXX.src** 是源语言的文件，**XXX.trg** 是目标语言的文件
-  - 文件中的每行都必须是一个句子
-  - **XXX.src** 和 **XXX.trg** 中任意第i行的句子之间都必须有着一一对应的关系
-
-## 数据预处理 ##
-### 预处理工作流程 ###
-- 将每个源语言到目标语言的平行语料库文件合并为一个文件：
-  - 合并每个 **XXX.src** 和 **XXX.trg** 文件为 **XXX**
-  - **XXX** 中的第i行 = **XXX.src** 中的第i行 + '\t' + **XXX.trg**中的第i行
-- 创建训练数据的“源字典”和“目标字典”，每个字典都有DICTSIZE个单词，包括：
-  - 词频最高的（DICTSIZE - 3）个单词
-  - 3个特殊符号
-  - `<s>`：序列的开始
-  - `<e>`：序列的结束
-  - `<unk>`：未包含在字典中的单词
-
-### 预处理命令和结果
-对数据集进行预处理的基本命令是：
-
-```python
-cd demo/seqToseq/
-python preprocess.py -i INPUT [-d DICTSIZE] [-m]
-```
-
-- `-i INPUT`：输入的原始数据集路径
-- `-d DICTSIZE`：指定的字典单词数，如果没有设置，字典会包含输入数据集中的所有单词
-- `-m --mergeDict`：合并 “源字典”和“目标字典”，使得两个字典有相同的上下文
-
-你将会看到如下消息：
-
-    concat parallel corpora for dataset
-    build source dictionary for train data
-    build target dictionary for train data
-    dictionary size is XXX
-
-然后你只需要运行以下命令：
-
-```python
-python preprocess.py -i data/wmt14 -d 30000
-```
-
-这将花费数分钟的时间，并且将预处理好的数据集存放在`demo/seqToseq/data/pre-wmt14`目录下。目录结构如下：
-
-    train test gen train.list test.list gen.list src.dict trg.dict
-
-- **train, test, gen**：分别包含了法语到英语的平行语料库的训练数据、测试数据和生成数据。文件夹中的每个文件的每一行包含两部分，首先是法语序列，然后是对应的英语序列。
-- **train.list, test.list, gen.list**：分别为train，test，gen文件夹中的文件列表
-- **src.dict, trg.dict**：源（法语）/目标（英语）字典，每个字典包含总共30000个单词：29997个最高频单词和3个特殊符号
-
-## 模型训练 ##
-### 简介###
-
-神经网络机器翻译（NMT）旨在建立一个可以被协同调至最优翻译效果的单神经元网络。近期提出的NMT模型通常都属于编解码模型（encoder–decoder models）的一种。编解码模型将一个源语句编码为一个定长的向量，然后解码器通过这个向量生成一个目标语句。
-
-在这个任务中，我们使用了一个编解码模型的扩展，它同时学习排列(align)与翻译。每当模型在翻译过程中生成了一个单词，它就会在源语句中搜索出最相关信息的位置的集合。解码器根据上下文向量预测出一个目标单词，这个向量与源中搜索出的位置和所有之前生成的目标单词有关。如想了解更多详细的解释，可以参考 [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473)。
-
-这个模型对于编解码模型来说，最不同的特色是它并没有将输入语句编码为一个单独的定长向量。相反，它将输入语句编码为向量的序列，其中每个向量对应输入语句中的一个元素。然后在解码被翻译的语句时，会自适应地从这些向量中选择一个子集出来。这使得NMT模型得以解放出来，不必再将任意长度源语句中的所有信息压缩至一个定长的向量中。该模型在长语句翻译的场景下效果提升更加明显，在任意长度语句翻译的场景下都可以观察到其效果的提升。
-<center>![](./encoder-decoder-attention-model.png)</center>
-<center>Figure 1. Encoder-Decoder-Attention-Model</center>
-
-### 使用PaddlePaddle训练模型 ###
-我们在训练之前需要创建一个模型配置文件，这里是一个例子`demo/seqToseq/translation/train.conf`。前三行import了定义network，job_mode和attention_mode的python函数。
-
-```python
-from seqToseq_net import *
-is_generating = False
-
-### Data Definiation
-train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                             is_generating = is_generating)
-
-### Algorithm Configuration
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating)
-```
-
-1. **Data Definiation**：在示例中我们定义了一个序列到序列的训练和测试数据。它返回train_conf作为配置，其输入参数如下：
-  - data_dir：训练数据和测试数据的目录
-  - is_generating：这个配置是否用来生成，这里设置为False
-2. **Algorithm Configuration**：在示例中我们使用SGD训练算法（默认），和ADAM学习方法，指定batch_size为50，learning_rate为5e-4
-3. **Network Architecture**：在示例中我们使用attention版本的GRU编解码网络。它包括了一个双向的GRU作为编码器和解码器，它模拟了解码翻译过程中在源语句中的搜索。
-
-### 训练模型的命令与结果###
-写完模型配置之后，我们可以通过以下命令来训练模型：
-
-```bash
-cd demo/seqToseq/translation
-./train.sh
-```
-
-`train.sh` 的内容如下所示：
-
-```bash
-paddle train \
---config='translation/train.conf' \
---save_dir='translation/model' \
---use_gpu=false \
---num_passes=16 \
---show_parameter_stats_period=100 \
---trainer_count=4 \
---log_period=10 \
---dot_period=5 \
-2>&1 | tee 'translation/train.log'
-```
-- config: 设置神经网络的配置文件
-- save_dir: 设置保存模型的输出路径
-- use_gpu: 是否使用GPU训练，这里设置为使用CPU
-- num_passes: 设置passes的数量。paddle中的一条pass表示训练数据集中所有的样本一次
-- show_parameter_stats_period: 这里每隔100个batch显示一次参数统计信息
-- trainer_count: 设置CPU线程数或者GPU设备数
-- log_period: 这里每隔10个batch打印一次日志
-- dot_period: 这里每隔5个batch打印一个点"."
-
-训练的损失函数默认每隔10个batch打印一次，你将会看到如下消息：
-
-    I0719 19:16:45.952062 15563 TrainerInternal.cpp:160]  Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155  CurrentEval: classification_error_evaluator=0.737155
-    I0719 19:17:56.707319 15563 TrainerInternal.cpp:160]  Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392  CurrentEval: classification_error_evaluator=0.659065
-    .....
-- AvgCost：从第0个batch到当前batch的平均cost
-- CurrentCost：当前batch的cost
-- classification\_error\_evaluator(Eval)：从第0个评估到当前评估中，每个单词的预测错误率
-- classification\_error\_evaluator(CurrentEval)：当前评估中，每个单词的预测错误率
-
-当classification\_error\_evaluator的值低于0.35时，模型就训练成功了。
-
-## 文本生成 ##
-### 简介###
-
-一般而言，NMT模型受制于源语句的编码，并且通过给出当前目标单词来预测下一个目标单词。在训练过程中，当前单词在相比之下总是被当作真值（ground truth）。在生成过程中，当前单词是解码器最后一步的输出，这来自于PaddlePaddle的内存中。
-
-而且，我们使用集束搜索（Beam Search）来生成序列。集束搜索使用广度优先搜索来构建搜索树。对于树的每一层，生成当前层的所有后继状态，并将它们按照启发代价（heuristic cost）升序排列。但是这种方法在每层只保存预设数量的最优状态（这个数量称为beam size）。
-
-### 预训练的模型 ###
-我们在拥有50个节点的集群中训练模型，每个节点有两个6核CPU。我们在5天里训练了16个pass，其中每条pass花费了7个小时。model_dir中有16个子目录，每个里面都包含202MB的全部的模型参数。然后我们发现pass-00012的模型有着最高的BLEU值27.77（参考文献[BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)）。要下载解压这个模型，只需在linux下运行如下命令：
-
-```bash
-cd demo/seqToseq/data
-./wmt14_model.sh
-```
-
-### 使用PaddlePaddle生成模型 ###
-在翻译法语句子之前，我们需要创建模型配置文件。这里是一个例子`demo/seqToseq/translation/gen.conf`。前三行import了定义network，job_mode和attention_mode的python函数。
-
-```python
-from seqToseq_net import *
-is_generating = True
-
-################## Data Definiation #####################
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-
-############## Algorithm Configuration ##################
-settings(
-  learning_method = AdamOptimizer(),
-  batch_size = 1,
-  learning_rate = 0)
-
-################# Network configure #####################
-gru_encoder_decoder(gen_conf, is_generating)
-```
-
-1. **Data Definiation**：在示例中我们定义了一个序列到序列的生成数据。它返回gen_conf作为配置，其输入参数如下：
-  - data_dir：生成数据的目录
-  - is_generating：这个配置是否用来生成，这里设置为True
-  - gen_result：保存生成结果的文件
-2. **Algorithm Configuration**：在生成过程中我们使用SGD训练算法，并指定batch_size为1（每次生成1个序列），learning_rate为0
-3. **Network Architecture**：本质上与训练模型一样
-
-### 生成模型的命令与结果 ###
-写完模型配置之后，我们可以通过以下命令来进行从法语到英语的文本翻译：
-
-```bash
-cd demo/seqToseq/translation
-./gen.sh
-```
-
- `gen.sh` 的内容如下所示。与训练模型不同的是，这里有一些不同的参数需要指定：
-
-```bash
-paddle train \
---job=test \
---config='translation/gen.conf' \
---save_dir='data/wmt14_model' \
---use_gpu=true \
---num_passes=13 \
---test_pass=12 \
---trainer_count=1 \
-2>&1 | tee 'translation/gen.log'
-```
-- job：设置任务的模式为测试
-- save_dir：存储模型的路径
-- num_passes and test_pass：从test_pass到（num_passes - 1）加载模型参数，这里只加载 `data/wmt14_model/pass-00012`
-
-你将会看到这样的消息：
-
-    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
-    I0706 14:48:40.012039 31441 Tester.cpp:125]  Batch=100 samples=100 AvgCost=0
-    I0706 14:48:48.898632 31441 Tester.cpp:125]  Batch=200 samples=200 AvgCost=0
-    ...
-
-然后在`demo/seqToseq/translation/gen_result`中的生成结果如下所示：
-
-    0
-    0       -11.1314         The <unk> <unk> about the width of the seats while large controls are at stake <e>
-    1       -11.1519         The <unk> <unk> on the width of the seats while large controls are at stake <e>
-    2       -11.5988         The <unk> <unk> about the width of the seats while large controls are at stake . <e>
-
-    1
-    0       -24.4149         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of the Dubai <unk> . <e>
-    1       -26.9524         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s <unk> . <e>
-    2       -27.9574         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s Dubai <unk> . <e>
-    ...
-
-- 这是集束搜索的结果，其中beam size是3
-- 第一行的“0”和第6行的“1”表示生成数据的序列id
-- 其他六行列出了集束搜索的结果
-  - 第二列是集束搜索的得分（从大到小）
-  - 第三列是生成的英语序列
-- 有两个特殊标识：
-  - `<e>`：序列的结尾
-  - `<unk>`：不包含在字典中的单词
-
-### BLEU评估 ###
-对机器翻译的人工评估工作很广泛但也很昂贵。一篇论文 [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) 展示了一种方法，当需要快速或者频繁的评估时，使用自动的替补来替代经验丰富的人工评判。[Moses](http://www.statmt.org/moses/) 是一个统计学的机器翻译系统，我们使用其中的 [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) 来做BLEU评估。运行以下命令来下载这个脚本：
-
-```bash
-cd demo/seqToseq/translation
-./moses_bleu.sh
-```
-
-由于标准的翻译结果已经下载到这里`data/wmt14/gen/ntst14.trg`，我们可以运行以下命令来做BLEU评估。
-
-```bash
-cd demo/seqToseq/translation
-./eval_bleu.sh FILE BEAMSIZE
-```
-
-- FILE：生成的结果文件
-- BEAMSIZE：集束搜索中的扩展广度
diff --git a/doc/tutorials/text_generation/index_en.md b/doc/tutorials/text_generation/index_en.md
deleted file mode 100644
index 5d8e667c20bd1fda64a6e11a88517d52112b72fa..0000000000000000000000000000000000000000
--- a/doc/tutorials/text_generation/index_en.md
+++ /dev/null
@@ -1,338 +0,0 @@
-# Text generation Tutorial #
-
-Sequence to sequence has been proven to be a powerful model for language generation. It can be used for machine translation, query rewriting, image captioning, etc.
-
-This tutorial guides you through training a sequence to sequence model for neural machine translation (NMT) network that translates French to English.
-
-We follow the paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473) , which details the model architecture and training procedure for good performance on WMT-14 dataset. This tutorial reproduces this result in PaddlePaddle.
-
-We thank @caoying for the pull request that defines the model architecture and solver configurations.
-
-## Data Preparation ##
-### Download and Extract ###
-Download the WMT-14 dataset from [http://www-lium.univ-lemans.fr/~schwenk/cslm\_joint\_paper/](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/), extract it, and divide Develop and Test data into separate folder.
-
-- **Train data**: [bitexts (after selection)](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/bitexts.tgz)
-- **Develop and Test data**: [dev+test data](http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz)
-
-To do this, simply run the following commands in linux, otherwise, you need to download, extract, divide, and rename the file suffix respectively.
-
-```bash
-cd demo/seqToseq/data
-./wmt14_data.sh
-```
-
-We should find that the dataset `wmt14` has three folders as shown in the following table.
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-<colgroup>
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-<col  class="left" />
-</colgroup>
-
-<thead>
-<tr>
-<th scope="col" class="left">folder name</th>
-<th scope="col" class="left">French-English parallel corpora file</th>
-<th scope="col" class="left">number of total file</th>
-<th scope="col" class="left">size</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">train_data</td>
-<td class="left">ccb2_pc30.src, ccb2_pc30.trg, etc</td>
-<td class="left">twelve</td>
-<td class="left">3.55G</td>
-</tr>
-
-<tr>
-<td class="left">test_data</td>
-<td class="left">ntst1213.src, ntst1213.trg</td>
-<td class="left">two</td>
-<td class="left">1636k</td>
-</tr>
-
-<tr>
-<td class="left">gen_data</td>
-<td class="left">ntst14.src, ntst14.trg</td>
-<td class="left">two</td>
-<td class="left">864k</td>
-</tr>
-</tbody>
-</table>
-<br/>
-
-- Each folder has French-English parallel corpora
-- **XXX.src** are source French files; **XXX.trg** are target English files.
-- The number of lines of **XXX.src** and **XXX.trg** should be the same.
-- Each line is a French/English sentence.
-- There is a one-to-one correspondence between the sentence at the i-th line of **XXX.src** and **XXX.trg**.
-
-### User Defined Dataset ###
-
-If you need to do other sequence-to-sequence tasks, such as Paraphrasing, you only need to organize the data as follows, and place them in `demo/seqToseq/data`:
-
-    dataset
-      train
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      test
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-      gen
-        file1.src file1.trg
-        file2.src file2.trg
-        ......
-- 1st directory: dataset folder name
-- 2nd directory: folder of train, test, and gen. The names of these three folders are fixed.
-- 3rd file: Source-Target parallel corpora files.
-  - **XXX.src** are source files, **XXX.trg** are target files.
-  - Each line of the file must be a sequence.
-  - There should be a one-to-one correspondence between the i-th sequence of **XXX.src** and **XXX.trg**.
-
-## Data Preprocess ##
-### Preprocessing Workflow ###
-- Concat each Source-Target parallel corpora to be one file:
-  - concat each **XXX.src** and **XXX.trg** to be **XXX**.
-  - the i-th line of **XXX** = the i-th line of **XXX.src** + '\t' + the i-th line of **XXX.trg**
-- Build source and target dictionary of train data, each dictionary has DICTSIZE words:
-  - the most frequent (DICTSIZE-3) words
-  - 3 special token:
-    - `<s>`: the start of a sequence
-    - `<e>`: the end of a sequence
-    - `<unk>`: a word not included in dictionary
-
-### Preprocessing Command and Result
-The general command for preprocessing the dataset is:
-
-```python
-cd demo/seqToseq/
-python preprocess.py -i INPUT [-d DICTSIZE] [-m]
-```
-
-- `-i INPUT`: the path of input original dataset
-- `-d DICTSIZE`: the specified word count of dictionary, if not set, dictionary will contain all the words in input dataset
-- `-m --mergeDict`: merge source and target dictionary, thus, two dictionaries have the same context
-
-And you will see messages like this:
-
-    concat parallel corpora for dataset
-    build source dictionary for train data
-    build target dictionary for train data
-    dictionary size is XXX
-
-Here, you can simply run the command:
-
-```python
-python preprocess.py -i data/wmt14 -d 30000
-```
-
-It will take several minutes, and store the preprocessed dataset in `demo/seqToseq/data/pre-wmt14`, the directory has following structure.
-
-    train test gen train.list test.list gen.list src.dict trg.dict
-
-- **train, test, gen**: folder contains French-English parallel corpora of train data, test data and gen data respectively. Each line of file in folder contains two parts, the former is a French sequence, and the latter is a corresponding English sequence.
-- **train.list, test.list, gen.list**: text contains a file list in train folder, test folder and gen folder respectively
-- **src.dict, trg.dict**: source (French) / target (English) dictionary, each dictionary has 30000 words: the most frequent 29997 words and 3 special token
-
-## Model Training ##
-### Introduction ###
-
-Neural machine translation (NMT) aims at building a single neural network that can be jointly tuned to maximize translation performance. Recently proposed NMT models often belong to a family of encoder–decoder models. Encoder-Decoder models encode a source sentence into a fixed-length vector from which a decoder generates a target sentence.
-
-In this task, we use an extension to the encoder–decoder model which learns to align and translate jointly. Each time the model generates a word in a translation, it searches for a set of positions in the source sentence for the most relevant information.  The decoder predicts a target word based on the context vectors associated with these source positions and all the previous generated target words. For more detailed explanation, readers can refer to paper [Neural Machine Translation by Jointly Learning to Align and Translate](http://arxiv.org/abs/1409.0473).
-
-The most distinguishing feature of this model is that it doesn't encode an input sentence into a single ﬁxed-length vector. Instead, it encodes the input sentence into a sequence of vectors, where one vector corresponds to an input element. A subset of these vectors is chosen adaptively while decoding the translated sentence. This frees a NMT model from having to squash all the information of a source sentence, regardless of its length, into a ﬁxed-length vector. The improvement of this model is more apparent for longer sentences, but the improvement can be observed for sentences of any length.
-<center>![](./encoder-decoder-attention-model.png)</center>
-<center>Figure 1. Encoder-Decoder-Attention-Model</center>
-
-### Training Model in PaddlePaddle ###
-We need to create a model config file before training. Here is an example `demo/seqToseq/translation/train.conf`. The first three lines import python function for defining network, and define the job_mode and attention_mode.
-
-```python
-from seqToseq_net import *
-is_generating = False
-
-### Data Definiation
-train_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                             is_generating = is_generating)
-
-### Algorithm Configuration
-settings(
-    learning_method = AdamOptimizer(),
-    batch_size = 50,
-    learning_rate = 5e-4)
-
-### Network Architecture
-gru_encoder_decoder(train_conf, is_generating)
-```
-
-1. **Data Definiation**: We define a SeqToSeq train and test data in our example. It returns train_conf as the configuration, following is its input arguments:
-   - data_dir: directory of train data and test data
-   - is\_generating: whether this config is used for generating, here is false
-2. **Algorithm Configuration**: We use the SGD training algorithm (default), ADAM learning method in our example, specify batch_size as 50, and learning rate as 5e-4.
-3. **Network Architecture**: We use an attention version of GRU Encoder-Decoder network in our example. It consists a bidirectional GRU as an encoder and a decoder that emulates searching through a source sentence during decoding a translation.
-
-### Training Command and Result###
-After writing the model config, we can train the model by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./train.sh
-```
-
-The `train.sh` is shown as follows:
-
-```bash
-paddle train \
---config='translation/train.conf' \
---save_dir='translation/model' \
---use_gpu=false \
---num_passes=16 \
---show_parameter_stats_period=100 \
---trainer_count=4 \
---log_period=10 \
---dot_period=5 \
-2>&1 | tee 'translation/train.log'
-```
-- config: set config of neural network
-- save_dir: set output path to save models
-- use_gpu: whether to use GPU to train, here use CPU
-- num_passes: set number of passes. One pass in paddle means training all samples in dataset one time
-- show_parameter_stats_period: here show parameter statistic every 100 batches
-- trainer_count: set number of CPU threads or GPU devices
-- log_period: here print log every 10 batches
-- dot_period: here print '.' every 5 batches
-
-The training loss function is printed every 10 batch by default, and you will see messages like this:
-
-    I0719 19:16:45.952062 15563 TrainerInternal.cpp:160]  Batch=10 samples=500 AvgCost=198.475 CurrentCost=198.475 Eval: classification_error_evaluator=0.737155  CurrentEval: classification_error_evaluator=0.737155
-    I0719 19:17:56.707319 15563 TrainerInternal.cpp:160]  Batch=20 samples=1000 AvgCost=157.479 CurrentCost=116.483 Eval: classification_error_evaluator=0.698392  CurrentEval: classification_error_evaluator=0.659065
-    .....
-- AvgCost: Average Cost from 0th batch to current batch
-- CurrentCost: Cost in current batch
-- classification\_error\_evaluator(Eval): False prediction rate for each word from 0th evaluation to current evaluation
-- classification\_error\_evaluator(CurrentEval): False prediction rate for each word in current evaluation
-
-And when the classification\_error\_evaluator is less than 0.35, the model is trained sucessfully.
-
-## Text Generation ##
-### Introduction ###
-
-Generally speaking, the NMT model is conditioned on the encodings of the source sentence, and then to predict the next target word by given the current target word. In the training process, the current word is always knowns as the ground truth, by contrast. In the generating process, the current word is the output of the decoder in last time step, which is accessed to from a memory in PaddlePaddle.
-
-Besides, we use Beam Search to generate sequences. Beam search uses breadth-first search to build its search tree. At each level of the tree, it generates all successors of the states at the current level, sorting them in increasing order of heuristic cost. However, it only stores a predetermined number of best states at each level (called the beam size).
-
-### Pretrained model ###
-We trained the model on a cluster with 50 nodes, each node has two 6-core CPUs. We trained 16 passes in 5 days, where each pass takes 7 hours. The model_dir has 16 sub-folder, each of which contains the whole model parameters with 202MB size. And we find pass-00012 model has the highest BLEU 27.77 (see paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf)). To download and extract this model, simply run the following commands in linux.
-
-```bash
-cd demo/seqToseq/data
-./wmt14_model.sh
-```
-
-### Generating Model in PaddlePaddle ###
-We need to create a model config file before translating French sequence. Here is an example `demo/seqToseq/translation/gen.conf`, the first three lines import python function for defining network, and define the job\_mode and attention\_mode.
-
-```python
-from seqToseq_net import *
-is_generating = True
-
-################## Data Definiation #####################
-gen_conf = seq_to_seq_data(data_dir = "./data/pre-wmt14",
-                           is_generating = is_generating,
-                           gen_result = "./translation/gen_result")
-
-############## Algorithm Configuration ##################
-settings(
-  learning_method = AdamOptimizer(),
-  batch_size = 1,
-  learning_rate = 0)
-
-################# Network configure #####################
-gru_encoder_decoder(gen_conf, is_generating)
-```
-
-1. **Data Definiation**: We defines an SeqToSeq gen data in our example. It returns gen_conf as the configuration, following is its input arguments:
-   - data\_dir: directory of gen data
-   - is\_generating: whether this config is used for generating, here is true
-   - gen\_result: file to store the generation result
-2. **Algorithm Configuration**: We use SGD traing algorithm in generation, and specify batch_size as 1 (each time generate one sequence), and learning rate as 0.
-3. **Network Architecture**: Essentially the same as the training model.
-
-### Generating Command and Result ###
-After writing the model config, we can do text translation from French to English by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./gen.sh
-```
-
-The `gen.sh` is shown as follows, unlike training, there are some different arguments to specify:
-
-```bash
-paddle train \
---job=test \
---config='translation/gen.conf' \
---save_dir='data/wmt14_model' \
---use_gpu=true \
---num_passes=13 \
---test_pass=12 \
---trainer_count=1 \
-2>&1 | tee 'translation/gen.log'
-```
-- job: set job mode to test
-- save_dir: the path of saved models
-- num_passes and test_pass: loading model parameters from test_pass to (num_passes - 1), here only loads `data/wmt14_model/pass-00012`
-
-You will see messages like this:
-
-    I0706 14:48:31.178915 31441 GradientMachine.cpp:143] Loading parameters from data/wmt14_model/pass-00012
-    I0706 14:48:40.012039 31441 Tester.cpp:125]  Batch=100 samples=100 AvgCost=0
-    I0706 14:48:48.898632 31441 Tester.cpp:125]  Batch=200 samples=200 AvgCost=0
-    ...
-
-And the generating result in `demo/seqToseq/translation/gen_result` likes:
-
-    0
-    0       -11.1314         The <unk> <unk> about the width of the seats while large controls are at stake <e>
-    1       -11.1519         The <unk> <unk> on the width of the seats while large controls are at stake <e>
-    2       -11.5988         The <unk> <unk> about the width of the seats while large controls are at stake . <e>
-
-    1
-    0       -24.4149         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of the Dubai <unk> . <e>
-    1       -26.9524         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s <unk> . <e>
-    2       -27.9574         The dispute is between the major aircraft manufacturers about the width of the tourist seats on the <unk> flights , paving the way for a <unk> confrontation during the month of Dubai &apos; s Dubai <unk> . <e>
-    ...
-
-- This is the beam search result, where beam size is 3
-- '0' in 1st-line and '1' in 6th-line mean the sequence-id in gen data
-- Other six lines list the beam search results
-  - The 2nd-column is the score of beam search (from large to small)
-  - The 3rd-colunm is the generating English sequence
-- There is 2 special tokens:
-  - `<e>`: the end of a sequence
-  - `<unk>`: a word not included in dictionary
-
-### Bleu Evalutaion ###
-Human evaluations of machine translation are extensive but expensive. Paper [BLEU: a Method for Automatic Evaluation of Machine Translation](http://www.aclweb.org/anthology/P02-1040.pdf) presents a method as an automated understudy to skilled human judges which substitutes for them when there is need for quick or frequent evaluations. [Moses](http://www.statmt.org/moses/) is a statistical machine translation system, and we use [multi-bleu.perl](https://github.com/moses-smt/mosesdecoder/blob/master/scripts/generic/multi-bleu.perl) of it to do Bleu Evalution. To download this script, simply run the following command:
-
-```bash
-cd demo/seqToseq/translation
-./moses_bleu.sh
-```
-
-Since the standard translation is alrealy downloaded as `data/wmt14/gen/ntst14.trg`, we can do Bleu Evalution by running the command:
-
-```bash
-cd demo/seqToseq/translation
-./eval_bleu.sh FILE BEAMSIZE
-```
-
-- FILE: the generation result file
-- BEAMSIZE: expand width in beam search
diff --git a/doc/v1_api_tutorials/README.md b/doc/v1_api_tutorials/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..071b8da61fbcab3e88819273008b4526546202ad
--- /dev/null
+++ b/doc/v1_api_tutorials/README.md
@@ -0,0 +1,5 @@
+The tutorials in v1_api_tutorials currently use the v1 API and will be upgraded to the v2 API later.
+v1_api_tutorials is therefore a temporary directory; we have decided not to maintain it and will delete it in the future.
+
+Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and
+[PaddlePaddle/models](https://github.com/PaddlePaddle/models) to learn PaddlePaddle.
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/v1_api_tutorials/embedding_model/index_cn.md
similarity index 100%
rename from doc/tutorials/embedding_model/index_cn.md
rename to doc/v1_api_tutorials/embedding_model/index_cn.md
diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/v1_api_tutorials/embedding_model/index_en.md
similarity index 100%
rename from doc/tutorials/embedding_model/index_en.md
rename to doc/v1_api_tutorials/embedding_model/index_en.md
diff --git a/doc/tutorials/embedding_model/neural-n-gram-model.png b/doc/v1_api_tutorials/embedding_model/neural-n-gram-model.png
similarity index 100%
rename from doc/tutorials/embedding_model/neural-n-gram-model.png
rename to doc/v1_api_tutorials/embedding_model/neural-n-gram-model.png
diff --git a/doc/tutorials/gan/gan.png b/doc/v1_api_tutorials/gan/gan.png
similarity index 100%
rename from doc/tutorials/gan/gan.png
rename to doc/v1_api_tutorials/gan/gan.png
diff --git a/doc/tutorials/gan/index_en.md b/doc/v1_api_tutorials/gan/index_en.md
similarity index 100%
rename from doc/tutorials/gan/index_en.md
rename to doc/v1_api_tutorials/gan/index_en.md
diff --git a/doc/tutorials/gan/mnist_sample.png b/doc/v1_api_tutorials/gan/mnist_sample.png
similarity index 100%
rename from doc/tutorials/gan/mnist_sample.png
rename to doc/v1_api_tutorials/gan/mnist_sample.png
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/v1_api_tutorials/gan/uniform_sample.png
similarity index 100%
rename from doc/tutorials/gan/uniform_sample.png
rename to doc/v1_api_tutorials/gan/uniform_sample.png
diff --git a/doc/tutorials/imagenet_model/resnet_block.jpg b/doc/v1_api_tutorials/imagenet_model/resnet_block.jpg
similarity index 100%
rename from doc/tutorials/imagenet_model/resnet_block.jpg
rename to doc/v1_api_tutorials/imagenet_model/resnet_block.jpg
diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/v1_api_tutorials/imagenet_model/resnet_model_cn.md
similarity index 100%
rename from doc/tutorials/imagenet_model/resnet_model_cn.md
rename to doc/v1_api_tutorials/imagenet_model/resnet_model_cn.md
diff --git a/doc/tutorials/imagenet_model/resnet_model_en.md b/doc/v1_api_tutorials/imagenet_model/resnet_model_en.md
similarity index 100%
rename from doc/tutorials/imagenet_model/resnet_model_en.md
rename to doc/v1_api_tutorials/imagenet_model/resnet_model_en.md
diff --git a/doc/tutorials/quick_start/index_cn.rst b/doc/v1_api_tutorials/quick_start/index_cn.rst
similarity index 100%
rename from doc/tutorials/quick_start/index_cn.rst
rename to doc/v1_api_tutorials/quick_start/index_cn.rst
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/v1_api_tutorials/quick_start/index_en.md
similarity index 100%
rename from doc/tutorials/quick_start/index_en.md
rename to doc/v1_api_tutorials/quick_start/index_en.md
diff --git a/doc/tutorials/quick_start/src/NetContinuous_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetContinuous_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/NetContinuous_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/NetContinuous_cn.jpg
diff --git a/doc/tutorials/quick_start/src/NetContinuous_en.png b/doc/v1_api_tutorials/quick_start/src/NetContinuous_en.png
similarity index 100%
rename from doc/tutorials/quick_start/src/NetContinuous_en.png
rename to doc/v1_api_tutorials/quick_start/src/NetContinuous_en.png
diff --git a/doc/tutorials/quick_start/src/NetConv_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetConv_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/NetConv_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/NetConv_cn.jpg
diff --git a/doc/tutorials/quick_start/src/NetConv_en.png b/doc/v1_api_tutorials/quick_start/src/NetConv_en.png
similarity index 100%
rename from doc/tutorials/quick_start/src/NetConv_en.png
rename to doc/v1_api_tutorials/quick_start/src/NetConv_en.png
diff --git a/doc/tutorials/quick_start/src/NetLR_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetLR_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/NetLR_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/NetLR_cn.jpg
diff --git a/doc/tutorials/quick_start/src/NetLR_en.png b/doc/v1_api_tutorials/quick_start/src/NetLR_en.png
similarity index 100%
rename from doc/tutorials/quick_start/src/NetLR_en.png
rename to doc/v1_api_tutorials/quick_start/src/NetLR_en.png
diff --git a/doc/tutorials/quick_start/src/NetRNN_cn.jpg b/doc/v1_api_tutorials/quick_start/src/NetRNN_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/NetRNN_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/NetRNN_cn.jpg
diff --git a/doc/tutorials/quick_start/src/NetRNN_en.png b/doc/v1_api_tutorials/quick_start/src/NetRNN_en.png
similarity index 100%
rename from doc/tutorials/quick_start/src/NetRNN_en.png
rename to doc/v1_api_tutorials/quick_start/src/NetRNN_en.png
diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/PipelineNetwork_cn.jpg
diff --git a/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineNetwork_en.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
rename to doc/v1_api_tutorials/quick_start/src/PipelineNetwork_en.jpg
diff --git a/doc/tutorials/quick_start/src/PipelineTest_cn.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineTest_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/PipelineTest_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/PipelineTest_cn.jpg
diff --git a/doc/tutorials/quick_start/src/PipelineTest_en.png b/doc/v1_api_tutorials/quick_start/src/PipelineTest_en.png
similarity index 100%
rename from doc/tutorials/quick_start/src/PipelineTest_en.png
rename to doc/v1_api_tutorials/quick_start/src/PipelineTest_en.png
diff --git a/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg b/doc/v1_api_tutorials/quick_start/src/PipelineTrain_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/PipelineTrain_cn.jpg
diff --git a/doc/tutorials/quick_start/src/PipelineTrain_en.png b/doc/v1_api_tutorials/quick_start/src/PipelineTrain_en.png
similarity index 100%
rename from doc/tutorials/quick_start/src/PipelineTrain_en.png
rename to doc/v1_api_tutorials/quick_start/src/PipelineTrain_en.png
diff --git a/doc/tutorials/quick_start/src/Pipeline_cn.jpg b/doc/v1_api_tutorials/quick_start/src/Pipeline_cn.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/Pipeline_cn.jpg
rename to doc/v1_api_tutorials/quick_start/src/Pipeline_cn.jpg
diff --git a/doc/tutorials/quick_start/src/Pipeline_en.jpg b/doc/v1_api_tutorials/quick_start/src/Pipeline_en.jpg
similarity index 100%
rename from doc/tutorials/quick_start/src/Pipeline_en.jpg
rename to doc/v1_api_tutorials/quick_start/src/Pipeline_en.jpg
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 739c4c01e02b10f46c36b997f8c4700150da2a26..f57db1c0a0107c4fd74b81aedaf4a58ff2a132ec 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -25,9 +25,8 @@ import (
 	"strings"
 	"time"
 
+	log "github.com/inconshreveable/log15"
 	"github.com/namsral/flag"
-	log "github.com/sirupsen/logrus"
-	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -41,16 +40,20 @@ func main() {
 	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
 	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
 	logLevel := flag.String("log-level", "info",
-		"log level, possible values: debug, info, warning, error, fatal, panic")
+		"log level, possible values: debug, info, warn, error, crit")
 	flag.Parse()
 
-	level, e := log.ParseLevel(*logLevel)
-	candy.Must(e)
+	lvl, err := log.LvlFromString(*logLevel)
+	if err != nil {
+		panic(err)
+	}
 
-	log.SetLevel(level)
+	log.Root().SetHandler(
+		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
+	)
 
 	if *endpoints == "" {
-		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
+		log.Warn("-endpoints not set, fault tolerance not be enabled.")
 	}
 
 	var store master.Store
@@ -58,23 +61,25 @@ func main() {
 		eps := strings.Split(*endpoints, ",")
 		ip, err := networkhelper.GetExternalIP()
 		if err != nil {
-			log.Fatal(err)
+			log.Crit("get external ip error", log.Ctx{"error": err})
+			panic(err)
 		}
 
 		addr := fmt.Sprintf("%s:%d", ip, *port)
 		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
 		if err != nil {
-			log.Fatal(err)
+			log.Crit("error creating etcd client.", log.Ctx{"error": err})
+			panic(err)
 		}
 	} else {
 		store = &master.InMemStore{}
 	}
 
 	shutdown := func() {
-		log.Infoln("shutting down gracefully")
+		log.Info("shutting down gracefully")
 		err := store.Shutdown()
 		if err != nil {
-			log.Errorln(err)
+			log.Error("shutdown error", log.Ctx{"error": err})
 		}
 	}
 
@@ -86,24 +91,28 @@ func main() {
 
 	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
 	if err != nil {
-		log.Fatal(err)
+		log.Crit("error creating new service.", log.Ctx{"error": err})
+		panic(err)
 	}
 
 	err = rpc.Register(s)
 	if err != nil {
-		log.Fatal(err)
+		log.Crit("error registering to etcd.", log.Ctx{"error": err})
+		panic(err)
 	}
 
 	rpc.HandleHTTP()
 	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
 	if err != nil {
-		log.Fatal(err)
+		log.Crit("error listing to port", log.Ctx{"error": err, "port": *port})
+		panic(err)
 	}
 
 	go func() {
 		err = http.Serve(l, nil)
 		if err != nil {
-			log.Fatal(err)
+			log.Crit("error serving HTTP", log.Ctx{"error": err})
+			panic(err)
 		}
 	}()
 
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index bec5775d540729000ab2dd3002600f0a92619d70..90f9cf3fcf209457b2746ab746c437d82dfc65aa 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -27,11 +27,11 @@ import (
 	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 func main() {
-	port := flag.Int("port", 0, "port of the pserver")
+	port := flag.Int("port", 8001, "port of the pserver")
 	index := flag.Int("index", -1, "index of the pserver, set to -1 if use etcd for auto pserver index registry")
 	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
 		"comma separated endpoint string for pserver to connect to etcd")
@@ -41,13 +41,17 @@ func main() {
 	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
 	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
-		"log level, possible values: debug, info, warning, error, fatal, panic")
+		"log level, possible values: debug, info, warn, error, crit")
 	flag.Parse()
 
-	level, err := log.ParseLevel(*logLevel)
-	candy.Must(err)
+	lvl, err := log.LvlFromString(*logLevel)
+	if err != nil {
+		panic(err)
+	}
 
-	log.SetLevel(level)
+	log.Root().SetHandler(
+		log.LvlFilterHandler(lvl, log.CallerStackHandler("%+v", log.StderrHandler)),
+	)
 
 	var idx int
 
@@ -63,7 +67,7 @@ func main() {
 		cp, err = pserver.LoadCheckpoint(e, idx)
 		if err != nil {
 			if err == pserver.ErrCheckpointNotFound {
-				log.Infof("Could not find the pserver checkpoint.")
+				log.Info("Could not find the pserver checkpoint.")
 			} else {
 				panic(err)
 			}
@@ -71,10 +75,10 @@ func main() {
 	}
 
 	shutdown := func() {
-		log.Infoln("shutting down gracefully")
+		log.Info("shutting down gracefully")
 		sErr := e.Shutdown()
 		if sErr != nil {
-			log.Errorln(sErr)
+			log.Error("error shutting down", log.Ctx{"error": sErr})
 		}
 	}
 
@@ -95,7 +99,7 @@ func main() {
 	candy.Must(err)
 
 	go func() {
-		log.Infof("start pserver at port %d", *port)
+		log.Info("starting pserver", log.Ctx{"port": *port})
 		err = http.Serve(l, nil)
 		candy.Must(err)
 	}()
diff --git a/go/glide.lock b/go/glide.lock
index 1ecdd217520e0a62b546b4c7048a25f4316d3f37..ce654d36364f8078a493651d8d8b141532eea26d 100644
--- a/go/glide.lock
+++ b/go/glide.lock
@@ -1,6 +1,8 @@
-hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582
-updated: 2017-08-07T23:37:48.867469328Z
+hash: 51d9e2e46d7fd9173ff11ecada40f7b7728756be18d5e2f032535f66465e6e15
+updated: 2017-10-24T15:04:09.987751592-07:00
 imports:
+- name: github.com/alecthomas/gometalinter
+  version: bae2f1293d092fd8167939d5108d1b025eaef9de
 - name: github.com/beorn7/perks
   version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
   subpackages:
@@ -10,7 +12,7 @@ imports:
 - name: github.com/cockroachdb/cmux
   version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
 - name: github.com/coreos/etcd
-  version: d0d1a87aa96ae14914751d42264262cb69eda170
+  version: f1d7dd87da3e8feab4aaf675b8e29c6a5ed5f58b
   subpackages:
   - alarm
   - auth
@@ -97,6 +99,8 @@ imports:
   version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
 - name: github.com/ghodss/yaml
   version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
+- name: github.com/go-stack/stack
+  version: 817915b46b97fd7bb80e8ab6b69f01a53ac3eebf
 - name: github.com/gogo/protobuf
   version: 909568be09de550ed094403c2bf8a261b5bb730a
   subpackages:
@@ -118,8 +122,14 @@ imports:
   - runtime
   - runtime/internal
   - utilities
+- name: github.com/inconshreveable/log15
+  version: 0decfc6c20d9ca0ad143b0e89dcaa20f810b4fb3
 - name: github.com/jonboulle/clockwork
   version: 2eee05ed794112d45db504eb05aa693efd2b8b09
+- name: github.com/mattn/go-colorable
+  version: 5411d3eea5978e6cdc258b30de592b60df6aba96
+- name: github.com/mattn/go-isatty
+  version: 57fdcb988a5c543893cc61bce354a6e24ab70022
 - name: github.com/matttproud/golang_protobuf_extensions
   version: c12348ce28de40eed0136aa2b644d0ee0650e56c
   subpackages:
@@ -149,7 +159,7 @@ imports:
 - name: github.com/satori/go.uuid
   version: 879c5887cd475cd7864858769793b2ceb0d44feb
 - name: github.com/sirupsen/logrus
-  version: a3f95b5c423586578a4e099b11a46c2479628cac
+  version: f006c2ac4710855cf0f916dd6b77acf6b048dc6e
 - name: github.com/topicai/candy
   version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
 - name: github.com/ugorji/go
@@ -159,12 +169,13 @@ imports:
 - name: github.com/xiang90/probing
   version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
 - name: golang.org/x/crypto
-  version: 1351f936d976c60a0a48d728281922cf63eafb8d
+  version: 9419663f5a44be8b34ca85f08abc5fe1be11f8a3
   repo: https://github.com/golang/crypto.git
   vcs: git
   subpackages:
   - bcrypt
   - blowfish
+  - ssh/terminal
 - name: golang.org/x/net
   version: c8c74377599bd978aee1cf3b9b63a8634051cec2
   subpackages:
@@ -176,11 +187,12 @@ imports:
   - lex/httplex
   - trace
 - name: golang.org/x/sys
-  version: 0f826bdd13b500be0f1d4004938ad978fcc6031e
+  version: e48874b42435b4347fc52bdee0424a52abc974d7
   repo: https://github.com/golang/sys.git
   vcs: git
   subpackages:
   - unix
+  - windows
 - name: golang.org/x/text
   version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
   repo: https://github.com/golang/text.git
diff --git a/go/glide.yaml b/go/glide.yaml
index a90e71b615de92d64c79823e2a04c46001963932..ba253f8bebef0ddab810a8303ab1fbe541defbdf 100644
--- a/go/glide.yaml
+++ b/go/glide.yaml
@@ -24,3 +24,9 @@ import:
   vcs: git
 - package: github.com/satori/go.uuid
   version: v1.1.0
+- package: github.com/alecthomas/gometalinter
+  version: v1.2.1
+- package: github.com/inconshreveable/log15
+  version: v2.13
+- package: github.com/go-stack/stack
+  version: v1.6.0
diff --git a/go/master/c/client.go b/go/master/c/client.go
index b5759c30b1d7f7dc33e162e959c7de165e02e1da..9a59337108d1aa33929abb480af686a96514655b 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -35,13 +35,19 @@ import (
 	"unsafe"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
 
+// init sends log15 output to stderr with caller stacks attached, dropping records below warning level.
+func init() {
+	stackHandler := log.CallerStackHandler("%+v", log.StderrHandler)
+	log.Root().SetHandler(log.LvlFilterHandler(log.LvlWarn, stackHandler))
+}
+
 func add(c *master.Client) C.paddle_master_client {
 	mu.Lock()
 	defer mu.Unlock()
@@ -117,7 +123,7 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	}
 	err := c.SetDataset(paths)
 	if err != nil {
-		log.Errorln(err)
+		log.Error("error set dataset", log.Ctx{"error": err})
 		return C.PADDLE_MASTER_ERROR
 	}
 
@@ -167,7 +173,7 @@ func paddle_request_save_model(client C.paddle_master_client, trainerID string,
 	c := get(client)
 	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
 	if err != nil {
-		log.Errorln(err)
+		log.Error("error request save model", log.Ctx{"error": err})
 		return C.PADDLE_MASTER_ERROR
 	}
 
diff --git a/go/master/client.go b/go/master/client.go
index f04cf50ce3cf765a79cbe555d3edb68f3dbb911e..5d657548c9039dfdacf61dd1145deb9777596d9f 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -21,7 +21,7 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
 	"github.com/coreos/etcd/clientv3"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 // Client is the client of the master server.
@@ -75,7 +75,7 @@ func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
 		for {
 			err := f()
 			if err != nil {
-				log.Warningln(err)
+				log.Warn("create etcd client error", log.Ctx{"error": err})
 			} else {
 				break
 			}
@@ -135,13 +135,13 @@ func (c *Client) getRecords(passID int) {
 				time.Sleep(time.Second * 3)
 				continue
 			}
-			log.Errorf("getTask error: %s", err)
+			log.Error("getTask error.", log.Ctx{"error": err})
 		}
 
 		for _, chunk := range t.Chunks {
 			f, e := os.Open(chunk.Path)
 			if e != nil {
-				log.Errorln(e)
+				log.Error("error open chunk", log.Ctx{"error": e})
 				continue
 			}
 
@@ -152,12 +152,15 @@ func (c *Client) getRecords(passID int) {
 
 			if s.Err() != nil {
 				c.ch <- record{nil, s.Err()}
-				log.Errorln(err, chunk.Path)
+				log.Error(
+					"error scan chunk", // report the scanner's own error, not the unrelated outer err
+					log.Ctx{"error": s.Err(), "path": chunk.Path},
+				)
 			}
 
 			err = f.Close()
 			if err != nil {
-				log.Errorln(err)
+				log.Error("error close record file", log.Ctx{"error": err})
 			}
 		}
 
@@ -166,7 +169,7 @@ func (c *Client) getRecords(passID int) {
 		// correct, but a reasonable approximation.
 		err = c.taskFinished(t.Meta.ID)
 		if err != nil {
-			log.Errorln(err)
+			log.Error("task finish callback error.", log.Ctx{"error": err})
 		}
 	}
 }
@@ -179,12 +182,12 @@ func (c *Client) monitorMaster(addrCh <-chan string) {
 			if curMaster == "" {
 				err := c.conn.Close()
 				if err != nil {
-					log.Errorln(err)
+					log.Error("close old master addr error", log.Ctx{"error": err})
 				}
 			} else {
 				err := c.conn.Connect(curMaster)
 				if err != nil {
-					log.Errorln(err)
+					log.Error("connect to new master addr error", log.Ctx{"error": err})
 
 					// connect to addr failed, set
 					// to last known addr in order
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
index d5f3d79464655540a29eaa6395057aa5795c4615..2f13fd0dcda85ee10669133ed011f47ce418b61c 100644
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -25,8 +25,6 @@ import (
 	"testing"
 	"time"
 
-	log "github.com/sirupsen/logrus"
-
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
 )
@@ -36,10 +34,6 @@ const (
 	chunkPerTask = 10
 )
 
-func init() {
-	log.SetLevel(log.ErrorLevel)
-}
-
 func TestGetFinishTask(t *testing.T) {
 	const path = "/tmp/master_client_test_0"
 
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
index 94848d887e8bc4b055a7c8b89b9b7f26a39229d1..2a41d36949cb19d9076c0ed00c8db6e235f1296c 100644
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -20,7 +20,7 @@ import (
 
 	"github.com/coreos/etcd/clientv3"
 	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 const (
@@ -44,7 +44,7 @@ type EtcdClient struct {
 
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
-	log.Debugf("Connecting to etcd at %v", endpoints)
+	log.Debug("Connecting to etcd", log.Ctx{"endpoint": endpoints})
 	cli, err := clientv3.New(clientv3.Config{
 		Endpoints:   endpoints,
 		DialTimeout: dialTimeout,
@@ -64,12 +64,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	// one master running, but split-brain problem may cause
 	// multiple master servers running), and the cluster management
 	// software will kill one of them.
-	log.Infof("Trying to acquire lock at %s.", lockPath)
+	log.Info("Trying to acquire lock.", log.Ctx{"path": lockPath})
 	err = lock.Lock(context.TODO())
 	if err != nil {
 		return nil, err
 	}
-	log.Infof("Successfully acquired lock at %s.", lockPath)
+	log.Info("Successfully acquired lock.", log.Ctx{"path": lockPath}) // log15 messages are not printf-formatted; the path goes in the context
 
 	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
@@ -78,7 +78,8 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	}
 
 	if !resp.Succeeded {
-		log.Fatal("No longer owns the master lock. Exiting.")
+		log.Crit("No longer owns the master lock. Exiting.")
+		panic("No longer owns the master lock. Exiting.")
 	}
 
 	e := &EtcdClient{
@@ -102,7 +103,7 @@ func (e *EtcdClient) Save(state []byte) error {
 	}
 
 	if !resp.Succeeded {
-		log.Errorln("No longer owns the lock, trying to lock again")
+		log.Error("No longer owns the lock, trying to lock again")
 		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
 		err := e.lock.Lock(ctx)
 		cancel()
@@ -116,9 +117,10 @@ func (e *EtcdClient) Save(state []byte) error {
 			// to kill current master server. The current
 			// state is not saved, but the trainer's RPC
 			// call will fail, so the trainer will retry.
-			log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
+			log.Crit("Could not acquire the lock. Exiting.", log.Ctx{"path": e.lockPath, "error": err}) // details live in the context, not in printf verbs
+			panic("Could not acquire the lock. Exiting.")
 		}
-		log.Infof("Successfully acquired lock at %s.", e.lockPath)
+		log.Info("Successfully acquired lock.", log.Ctx{"path": e.lockPath}) // log15 expects key/value context, not a bare argument
 		return e.Save(state)
 	}
 
@@ -136,7 +138,7 @@ func (e *EtcdClient) Load() ([]byte, error) {
 	}
 
 	if !resp.Succeeded {
-		log.Errorln("No longer owns the lock, trying to lock and load again.")
+		log.Error("No longer owns the lock, trying to lock and load again.")
 		err = e.lock.Lock(context.Background())
 		if err != nil {
 			return nil, err
@@ -163,7 +165,7 @@ func (e *EtcdClient) Shutdown() error {
 		if err == nil {
 			err = newErr
 		} else {
-			log.Errorln(newErr)
+			log.Error("shutdown error", log.Ctx{"error": newErr})
 		}
 	}
 
@@ -192,7 +194,7 @@ func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
 	for wresp := range rch {
 		for _, ev := range wresp.Events {
 			// if received event is DELETE, the value will be an empty string
-			log.Infof("received event %s, %q : %q\n", ev.Type, ev.Kv.Key, ev.Kv.Value)
+			log.Info("received event.", log.Ctx{"type": ev.Type, "key": ev.Kv.Key, "value": ev.Kv.Value})
 			valChan <- string(ev.Kv.Value)
 		}
 	}
diff --git a/go/master/service.go b/go/master/service.go
index df7c6860e6ae13a5be7d0425273812208685ee9d..f3501028800c850a521d4b08db323cb70fe926d2 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -25,7 +25,7 @@ import (
 	"sync"
 	"time"
 
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 
 	"github.com/PaddlePaddle/recordio"
 )
@@ -170,11 +170,11 @@ func (s *Service) recover() (bool, error) {
 	}
 
 	if state == nil {
-		log.Infoln("No state exists, not recovered.")
+		log.Info("No state exists, not recovered.")
 		return false, nil
 	}
 
-	log.Infof("Loaded snapshot of size: %d bytes.", len(state))
+	log.Info("Loaded snapshot.", log.Ctx{"size": len(state)})
 	gr, err := gzip.NewReader(bytes.NewReader(state))
 	if err != nil {
 		return false, err
@@ -191,11 +191,11 @@ func (s *Service) recover() (bool, error) {
 	if err != nil {
 		// Only close failed, recover actually succeed, so
 		// just log error.
-		log.Errorln(err)
+		log.Error("error close recover file.", log.Ctx{"error": err})
 	}
 
 	s.state = tqs
-	log.WithFields(s.logFields()).Infof("Master recovered from snapshot, scheduling pending task timeout check.")
+	log.Info("Master recovered from snapshot, scheduling pending task timeout check.", s.logCtx())
 	for _, t := range s.state.Pending {
 		time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	}
@@ -224,7 +224,7 @@ func (s *Service) snapshot() error {
 	}
 
 	state := buf.Bytes()
-	log.Infof("Saving snapshot of size: %d bytes.", len(state))
+	log.Info("Saving snapshot.", log.Ctx{"size bytes": len(state)})
 	return s.store.Save(state)
 }
 
@@ -260,7 +260,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}
 
 		count := index.NumChunks()
-		log.Infof("readChunks: file %s has %d chunks", path, count)
+		log.Info("reading chunks.", log.Ctx{"path": path, "num chunks": count})
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path:  path,
@@ -300,7 +300,7 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error {
 
 	err = s.snapshot()
 	if err != nil {
-		log.Errorln(err)
+		log.Error("snapshot error", log.Ctx{"error": err})
 		return err
 	}
 	close(s.ready)
@@ -320,7 +320,7 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 	defer func() {
 		err := s.snapshot()
 		if err != nil {
-			log.Errorln(err)
+			log.Error("snapshot error", log.Ctx{"error": err})
 		}
 	}()
 
@@ -328,12 +328,12 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 
 	t.NumFailure++
 	if t.NumFailure > s.failureMax {
-		log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+		log.Warn("Task failed to many times, discard.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
 		s.state.Failed = append(s.state.Failed, t)
 		return
 	}
 
-	log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
+	log.Warn("Task failed, re-dispatch.", log.Ctx{"task": t.Task, "num failed": t.NumFailure})
 	s.state.Todo = append(s.state.Todo, t)
 	return
 }
@@ -353,8 +353,8 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 }
 
 // must be called with lock held.
-func (s *Service) logFields() log.Fields {
-	return log.Fields{
+func (s *Service) logCtx() log.Ctx {
+	return log.Ctx{
 		"todoLen":    len(s.state.Todo),
 		"pendingLen": len(s.state.Pending),
 		"doneLen":    len(s.state.Done),
@@ -383,10 +383,10 @@ func (s *Service) GetTask(passID int, task *Task) error {
 
 	if len(s.state.Todo) == 0 {
 		if len(s.state.Done) == 0 && len(s.state.Pending) == 0 {
-			log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass")
+			log.Warn("All tasks failed, may start next pass", s.logCtx())
 			return ErrAllTaskFailed
 		}
-		log.WithFields(s.logFields()).Warningln("No more available task.")
+		log.Warn("No more available task.", s.logCtx())
 		return ErrNoMoreAvailable
 	}
 
@@ -400,8 +400,9 @@ func (s *Service) GetTask(passID int, task *Task) error {
 	}
 
 	*task = t.Task
-	log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)
-
+	ctx := s.logCtx()
+	ctx["task meta"] = t.Task.Meta
+	log.Info("Task dispatched.", ctx)
 	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	return nil
 }
@@ -417,7 +418,9 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 
 	t, ok := s.state.Pending[taskID]
 	if !ok {
-		log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
+		ctx := s.logCtx()
+		ctx["task id"] = taskID
+		log.Warn("Pending task not found.", ctx)
 		return nil
 	}
 
@@ -426,7 +429,9 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	s.state.Done = append(s.state.Done, t)
 	delete(s.state.Pending, taskID)
 
-	log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
+	ctx := s.logCtx()
+	ctx["task id"] = taskID
+	log.Info("Task finished.", ctx)
 	if len(s.state.Todo) == 0 && len(s.state.Pending) == 0 {
 		// increase master side pass count if all tasks finished
 		s.state.CurPass++
@@ -434,12 +439,14 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 		s.state.Done = []taskEntry{}
 		// TODO(typhoonzero): deal with failed tasks
 		s.state.Failed = []taskEntry{}
-		log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.state.CurPass)
+		ctx := s.logCtx()
+		ctx["new pass"] = s.state.CurPass
+		log.Warn("all task finished, add new pass data.", ctx)
 	}
 
 	err := s.snapshot()
 	if err != nil {
-		log.Errorln(err)
+		log.Error("snapshot error", log.Ctx{"error": err})
 	}
 	return err
 }
@@ -455,7 +462,7 @@ func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
 
 	t, ok := s.state.Pending[meta.ID]
 	if !ok {
-		log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta)
+		log.Warn("TaskFailed:Pending task not found.", log.Ctx{"task": meta}) // t is not a valid entry when !ok, so log the requested meta
 		return nil
 	}
 
diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go
index a49cd01522b8b49a74f21fcb97e9eeb1fbb2d272..2eeec1b6b3c28556e02780e40ae5d6b693dce484 100644
--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -45,9 +45,15 @@ import (
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	"github.com/PaddlePaddle/Paddle/go/pserver/client"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
+// init sends log15 output to stderr with caller stacks attached, dropping records below warning level.
+func init() {
+	stackHandler := log.CallerStackHandler("%+v", log.StderrHandler)
+	log.Root().SetHandler(log.LvlFilterHandler(log.LvlWarn, stackHandler))
+}
+
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -164,10 +170,13 @@ func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter,
 
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
+			log.Warn(
+				"parameter already initialized, treat paddle_init_param as successful.",
+				log.Ctx{"parameter": name},
+			)
 			return C.PSERVER_OK
 		}
-		log.Errorln(err)
+		log.Error("error init param", log.Ctx{"error": err})
 		return C.PSERVER_ERROR
 	}
 
@@ -180,11 +189,11 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
+			log.Warn("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}
 
-		log.Errorln(err)
+		log.Error("error finish init params", log.Ctx{"error": err})
 		return C.PSERVER_ERROR
 	}
 
@@ -205,7 +214,7 @@ func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient
 	c := get(client)
 	err := c.SendGrads(gs)
 	if err != nil {
-		log.Errorln(err)
+		log.Error("error send grads", log.Ctx{"error": err})
 		return C.PSERVER_ERROR
 	}
 
@@ -222,7 +231,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 	c := get(client)
 	ps, err := c.GetParams(ns)
 	if err != nil {
-		log.Errorln(err)
+		log.Error("error get params", log.Ctx{"error": err})
 		return C.PSERVER_ERROR
 	}
 
@@ -231,7 +240,13 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		for i, p := range ps {
 			pn[i] = p.Name
 		}
-		log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", "))
+		log.Error(
+			"pserver returned wrong number of parameters.",
+			log.Ctx{
+				"Requested": strings.Join(ns, ", "), // ns holds the names passed to GetParams
+				"Returned":  strings.Join(pn, ", "), // pn holds the names of the parameters that came back
+			},
+		)
 		return C.PSERVER_ERROR
 	}
 
@@ -241,7 +256,13 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 			for i, p := range ps {
 				pn[i] = p.Name
 			}
-			log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", "))
+			log.Error(
+				"pserver returned wrong parameters, or not in requested order.",
+				log.Ctx{
+					"Requested": strings.Join(ns, ", "), // ns holds the names passed to GetParams
+					"Returned":  strings.Join(pn, ", "), // pn holds the names of the parameters that came back
+				},
+			)
 			return C.PSERVER_ERROR
 		}
 	}
@@ -251,13 +272,19 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
 
 		if unsafe.Pointer(param) == nil {
-			log.Errorln("must pre-allocate parameter.")
+			log.Error("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}
 
 		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
-				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
+				log.Error(
+					"the pre-allocated content len does not match parameter content len.",
+					log.Ctx{
+						"Pre-allocated len": param.content_len,
+						"Returned len":      len(p.Content),
+					},
+				)
 				return C.PSERVER_ERROR
 			}
 		}
diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go
index 20d91e77034e1a0c6825bc401175e6dc1afec52f..18fce34b376a8f60900700c588e30f92ef3514ed 100644
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -22,7 +22,7 @@ import (
 
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/Paddle/go/pserver"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 // TODO(helin): add RPC call retry logic
@@ -84,7 +84,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 			if curServers[i].Addr == "" {
 				err := c.pservers[i].Close()
 				if err != nil {
-					log.Errorln(err)
+					log.Error("error closing connection to pserver", log.Ctx{"error": err})
 				}
 
 				continue
@@ -92,7 +92,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 
 			err := c.pservers[i].Connect(curServers[i].Addr)
 			if err != nil {
-				log.Errorln(err)
+				log.Error("error connecting to pserver", log.Ctx{"error": err})
 
 				// connect to addr failed, set
 				// to last known addr in order
@@ -137,7 +137,7 @@ func (c *Client) FinishInitParams() error {
 			return err
 		}
 	}
-	return nil
+	return c.sel.Done()
 }
 
 // SendGrads sends gradients to parameter servers for updating
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
index c3d88e926d7cb5f3027be26a270bee6f2db65f31..ec832305ee8e24967b06b6b621c44cde30c09e55 100644
--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -30,7 +30,7 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	"github.com/PaddlePaddle/Paddle/go/pserver/client"
 	"github.com/coreos/etcd/clientv3"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 const (
@@ -90,7 +90,7 @@ func initEtcdClient() {
 		DialTimeout: time.Second * time.Duration(1),
 	})
 	if err != nil {
-		log.Errorf("err %v", err)
+		log.Error("error init etcd client", log.Ctx{"error": err})
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	_, err = client.Delete(ctx, pserver.PsDesired)
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
index f9071caaa8f5ac32d426b1d4344a30262202b96d..16d0c3b943050f05c54a3e010054fd7c2f33b6d6 100644
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -25,7 +25,7 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	"github.com/coreos/etcd/clientv3"
 	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 const (
@@ -54,26 +54,29 @@ func (e *Etcd) Desired() int {
 		resp, err := e.client.Get(ctx, pserver.PsDesired)
 		cancel()
 		if err != nil {
-			log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
+			log.Error(
+				"Get ps dresire number failed! reconnecting...",
+				log.Ctx{"error": err},
+			)
 			time.Sleep(e.timeout)
 			continue
 		}
 
 		kvs := resp.Kvs
 		if len(kvs) == 0 {
-			log.Infoln("Waiting for ps desired registered ...")
+			log.Info("Waiting for ps desired registered ...")
 			time.Sleep(e.timeout)
 			continue
 		}
 
 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
-			log.Errorf("psDesired %d invalid %v", psDesired, err)
+			log.Error("atoi failed", log.Ctx{"error": err})
 			time.Sleep(e.timeout)
 			continue
 		}
 
-		log.Debugf("Get psDesired number: %d", psDesired)
+		log.Debug("Got psDesired", log.Ctx{"psDesired": psDesired})
 		break
 	}
 	return psDesired
@@ -88,17 +91,20 @@ func (e *Etcd) List() []Server {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 			psKey := pserver.PsPath + strconv.Itoa(i)
-			log.Debugf("checking %s", psKey)
+			log.Debug("looking for pserver", log.Ctx{"ps key": psKey})
 			resp, err := e.client.Get(ctx, psKey)
 			cancel()
 			if err != nil {
-				log.Infof("Get psKey= %s error, %v", psKey, err)
+				log.Info(
+					"Get psKey error",
+					log.Ctx{"ps key": psKey, "error": err},
+				)
 				time.Sleep(e.timeout)
 				continue
 			}
 			kvs := resp.Kvs
 			if len(kvs) == 0 {
-				log.Infof("Waiting for ps addr registered ...")
+				log.Info("Waiting for ps addr registered ...")
 				time.Sleep(e.timeout)
 				continue
 			}
@@ -106,11 +112,17 @@ func (e *Etcd) List() []Server {
 			psAddr := string(resp.Kvs[0].Value)
 			// TODO(Longfei) check the ps address
 			if psAddr == "" {
-				log.Infof("Get psKey = %s, psAddr is empty", psKey)
+				log.Info(
+					"Value under psKey is empty",
+					log.Ctx{"psKey": psKey},
+				)
 				time.Sleep(e.timeout)
 				continue
 			}
-			log.Debugf("got value (%s) for key: %s", psAddr, psKey)
+			log.Debug(
+				"got psAddr given psKey",
+				log.Ctx{"psAddr": psAddr, "psKey": psKey},
+			)
 			servers[i].Index = i
 			servers[i].Addr = psAddr
 		}
@@ -130,13 +142,13 @@ func NewEtcd(endpoints string) *Etcd {
 			DialTimeout: defaultEtcdTimeout,
 		})
 		if err != nil {
-			log.Errorf("Init etcd connection failed: %v", err)
+			log.Error("Init etcd connection failed", log.Ctx{"error": err})
 			time.Sleep(defaultEtcdTimeout)
 			continue
 		}
 		break
 	}
-	log.Infof("Connected to etcd: %s\n", endpoints)
+	log.Info("Connected to etcd endpoint", log.Ctx{"endpoint": endpoints})
 	client := &Etcd{
 		client:    cli,
 		timeout:   defaultEtcdTimeout,
@@ -154,7 +166,7 @@ func (e *Etcd) Select() (bool, error) {
 	}
 
 	lock := concurrency.NewMutex(sess, initLockPath)
-	log.Infof("Trying to acquire lock at %s.", initLockPath)
+	log.Info("Trying to acquire lock", log.Ctx{"lock path": initLockPath})
 	// Do not use timeout context here, since we don't know how
 	// long does it take for other trainers to initialize the
 	// parameters.
@@ -162,7 +174,7 @@ func (e *Etcd) Select() (bool, error) {
 	if err != nil {
 		return false, err
 	}
-	log.Infof("Successfully acquired lock at %s.", initLockPath)
+	log.Info("Successfully acquired lock", log.Ctx{"lock path": initLockPath})
 
 	get := clientv3.OpGet(initDonePath)
 	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
@@ -181,17 +193,17 @@ func (e *Etcd) Select() (bool, error) {
 	if len(resp.Kvs) == 0 {
 		// Key value not set, select current trainer.
 		e.lock = lock
-		log.Infoln("Trainer selected.")
+		log.Info("Trainer selected.")
 		return true, nil
 	}
 
 	if string(resp.Kvs[0].Value) == initDoneVal {
-		log.Infoln("Initialization is already done.")
+		log.Info("Initialization is already done.")
 		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
 		err = lock.Unlock(ctx)
 		cancel()
 		if err != nil {
-			log.Errorln(err)
+			log.Error("error unlocking", log.Ctx{"error": err})
 		}
 		return false, nil
 	}
@@ -221,7 +233,7 @@ func (e *Etcd) Done() error {
 	err = e.lock.Unlock(ctx)
 	cancel()
 	if err != nil {
-		log.Errorln(err)
+		log.Error("error unlocking", log.Ctx{"error": err})
 	} else {
 		e.lock = nil
 	}
@@ -244,7 +256,7 @@ func (e *Etcd) Close() error {
 	cErr := e.client.Close()
 	if cErr != nil {
 		if err != nil {
-			log.Errorln(cErr)
+			log.Error("error closing etcd client", log.Ctx{"error": cErr})
 			return err
 		}
 		return cErr
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
index 41f0640fc09a3265c0e11c06255c7ee834983203..08ddb247f26379da80d485b1a6059f793864b786 100644
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -24,7 +24,7 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
 	"github.com/coreos/etcd/clientv3"
 	"github.com/coreos/etcd/clientv3/concurrency"
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 const (
@@ -82,19 +82,19 @@ func (e *EtcdClient) Register(port int) (int, error) {
 			DialTimeout: e.dialTimeout,
 		})
 		if err != nil {
-			log.Errorf("connect to etcd error: %v", err)
+			log.Error("connect to etcd error", log.Ctx{"error": err})
 			time.Sleep(retryTimeout)
 			continue
 		}
 		e.client = cli
 		sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec))
 		if err != nil {
-			log.Errorf("create etcd session error: %v", err)
+			log.Error("create etcd session error", log.Ctx{"error": err})
 			time.Sleep(retryTimeout)
 			continue
 		}
 		e.sess = sess
-		log.Debugf("inited client to %s", e.endpoints)
+		log.Debug("connected to etcd", log.Ctx{"endpoint": e.endpoints})
 		break
 	}
 	// init /ps_desired using transaction, for multiple pservers may want to write
@@ -104,7 +104,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
 		_, err := e.initDesiredPservers(ctx, e.numPservers)
 		cancel()
 		if err != nil {
-			log.Warn(err)
+			log.Warn("pserver init error", log.Ctx{"error": err, "num pservers": e.numPservers})
 			time.Sleep(retryTimeout)
 			continue
 		}
@@ -119,14 +119,17 @@ func (e *EtcdClient) Register(port int) (int, error) {
 		resp, err := e.client.Get(ctx, PsDesired)
 		cancel()
 		if err != nil {
-			log.Errorf("getting %s error: %v", PsDesired, err)
+			log.Error("get etcd key error", log.Ctx{"key": PsDesired, "error": err})
 			time.Sleep(retryTimeout)
 			continue
 		}
 		if len(resp.Kvs) != 0 {
 			e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 			if err != nil {
-				log.Errorf("value of %s invalid %v\n", PsDesired, err)
+				log.Error(
+					"psDesired atoi error",
+					log.Ctx{"error": err, "value": string(resp.Kvs[0].Value)},
+				)
 				time.Sleep(retryTimeout)
 				// NOTE: wait util ps_desired value change
 				continue
@@ -143,7 +146,7 @@ func (e *EtcdClient) Register(port int) (int, error) {
 		pserverIdx, err = e.registerPserverEtcd(ctx, port)
 		cancel()
 		if err != nil {
-			log.Warn(err)
+			log.Warn("register pserver on etcd error", log.Ctx{"error": err})
 			time.Sleep(retryTimeout)
 			continue
 		}
@@ -170,16 +173,17 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
 		registered := false
 		for i := 0; i < e.desired; i++ {
 			psKey := PsPath + strconv.Itoa(i)
-			log.Debugf("checking %s", psKey)
 			ps := c.Get(psKey)
-			log.Debugf("got value (%s) for key: %s", ps, psKey)
+			log.Debug(
+				"register pserver got value",
+				log.Ctx{"value": ps, "key": psKey},
+			)
 
 			if ps == "" {
 				// find the first id and write info
 				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
 				c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease()))
-				log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
-				log.Debug("register finished")
+				log.Debug("register finished", log.Ctx{"key": psKey, "value": pserverAddr})
 				idx = i
 				registered = true
 				break
@@ -239,7 +243,7 @@ func (e *EtcdClient) Shutdown() error {
 		newErr := e.client.Close()
 		if newErr != nil {
 			if err != nil {
-				log.Errorln(newErr)
+				log.Error("shutdown error", log.Ctx{"error": newErr})
 			} else {
 				err = newErr
 			}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index ae7359073494bd9cb6b70b12af4daca064179556..e04c86de0a9317a63bbf3216ee32091ab564e369 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -25,7 +25,7 @@ import (
 	"fmt"
 	"unsafe"
 
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 type optimizer struct {
@@ -56,12 +56,12 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer
 	c := paramWithConfigs.Config
 	s := State
 	paramBufferSize := C.size_t(len(p.Content))
-	log.WithFields(log.Fields{
+	log.Info("New Optimizer Created with config", log.Ctx{
 		"ElementType": p.ElementType,
 		"ParamSize":   paramBufferSize,
 		"ConfigSize":  len(c),
 		"StateSize":   len(s),
-	}).Info("New Optimizer Created with config:")
+	})
 	var cbuffer unsafe.Pointer
 	cbuffer = C.malloc(paramBufferSize)
 
@@ -72,21 +72,34 @@ func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer
 	}
 
 	o.config = c
-	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
-		C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
+	o.opt = C.paddle_create_optimizer(
+		(*C.uchar)(&c[0]),
+		C.int(len(c)),
+		C.paddle_element_type(p.ElementType),
+		cbuffer,
+		C.int(paramBufferSize),
+		(*C.char)(cstate),
+		C.int(len(s)),
+	)
 	return o
 }
 
 func (o *optimizer) GetWeights() []byte {
 	var buffer unsafe.Pointer
+	// we do not own the buffer, no need to free later.
 	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
 	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }
 
 func (o *optimizer) GetStates() []byte {
 	var cbuffer *C.char
+	// We own the state buffer: copy it into a Go slice, then free it below.
 	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
-	return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+	buf := cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+	cpy := make([]byte, len(buf))
+	copy(cpy, buf)
+	C.free(unsafe.Pointer(cbuffer))
+	return cpy
 }
 
 func (o *optimizer) UpdateParameter(g Gradient) error {
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
index d001e6993e6aed2f5829c1b86928af30f4900c8a..565f56dc286d214e7e9a3ddce389d92d21569cd5 100644
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -15,8 +15,12 @@
 package pserver
 
 import (
+	"encoding/binary"
 	"io/ioutil"
+	"math"
 	"testing"
+
+	"github.com/stretchr/testify/assert"
 )
 
 func TestOptimizerCreateRelease(t *testing.T) {
@@ -36,3 +40,39 @@ func TestOptimizerCreateRelease(t *testing.T) {
 	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
+
+// float32Bytes returns the 4-byte little-endian encoding of float's bits.
+func float32Bytes(float float32) []byte {
+	encoded := make([]byte, 4)
+	binary.LittleEndian.PutUint32(encoded, math.Float32bits(float))
+	return encoded
+}
+
+// TestOptimizerState checks that an optimizer rebuilt from a saved state
+// reports the same state and weights as the optimizer that produced it.
+func TestOptimizerState(t *testing.T) {
+	weights := float32Bytes(100)
+	p := Parameter{
+		Name:        "a",
+		ElementType: Int32,
+		Content:     weights,
+	}
+	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
+	if err != nil {
+		// Report the underlying error so a missing fixture is easy to diagnose.
+		t.Fatalf("read optimizer proto failed: %v", err)
+	}
+	param := ParameterWithConfig{Param: p, Config: config}
+	o := newOptimizer(param, nil)
+	s := o.GetStates()
+
+	// Overwrite the param content; the restored state must win over it.
+	param.Param.Content = float32Bytes(300)
+	o1 := newOptimizer(param, s)
+	s1 := o1.GetStates()
+	assert.Equal(t, s, s1)
+	assert.Equal(t, weights, o.GetWeights())
+	assert.Equal(t, weights, o1.GetWeights())
+	o.Cleanup()
+	o1.Cleanup()
+}
diff --git a/go/pserver/service.go b/go/pserver/service.go
index 25751540a9a2dff043c14e0912bfab1aaa938ab4..6f66faaf27bf41133783888369ed9b4cec7edea0 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -32,7 +32,7 @@ import (
 
 	uuid "github.com/satori/go.uuid"
 
-	log "github.com/sirupsen/logrus"
+	log "github.com/inconshreveable/log15"
 )
 
 // ElementType is the type of elements of a Parameter.
@@ -124,6 +124,9 @@ func loadMeta(e *EtcdClient, idx int) (meta checkpointMeta, err error) {
 
 // LoadCheckpoint loads checkpoint from file.
 func LoadCheckpoint(e *EtcdClient, idx int) (Checkpoint, error) {
+	log.Info("Loading checkpoint", "pserver index", idx)
+	defer traceTime(time.Now(), "load checkpoint")
+
 	cpMeta, err := loadMeta(e, idx)
 	if err != nil {
 		return nil, err
@@ -178,6 +181,7 @@ func NewService(idx int, interval time.Duration, path string, client *EtcdClient
 func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
 	select {
 	case <-s.initialized:
+		log.Warn("init param called but parameters already initialized.")
 		return errors.New(AlreadyInitialized)
 	default:
 	}
@@ -191,6 +195,13 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
 	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
+	log.Info(
+		"init parameter",
+		"name", paramWithConfigs.Param.Name,
+		"config len", len(paramWithConfigs.Config),
+		"param len", len(paramWithConfigs.Param.Content),
+		"type", paramWithConfigs.Param.ElementType,
+	)
 	return nil
 }
 
@@ -199,6 +210,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error
 func (s *Service) FinishInitParams(_ int, _ *int) error {
 	select {
 	case <-s.initialized:
+		log.Warn("finished init param called but parameters already initialized.")
 		return errors.New(AlreadyInitialized)
 	default:
 	}
@@ -209,10 +221,12 @@ func (s *Service) FinishInitParams(_ int, _ *int) error {
 		for range t {
 			err := s.checkpoint()
 			if err != nil {
-				log.Errorln(err)
+				log.Error("finish init params error", log.Ctx{"error": err})
 			}
 		}
 	}()
+
+	log.Info("init parameter finished.")
 	return nil
 }
 
@@ -222,6 +236,7 @@ func (s *Service) SendGrad(g Gradient, _ *int) error {
 	select {
 	case <-s.initialized:
 	default:
+		log.Warn("received gradient before initialization.", "name", g.Name, "size", len(g.Content), "type", g.ElementType)
 		return errors.New(Uninitialized)
 	}
 
@@ -233,6 +248,7 @@ func (s *Service) SendGrad(g Gradient, _ *int) error {
 		return fmt.Errorf("parameter: %s does not exist", g.Name)
 	}
 
+	log.Info("received gradient from trainer, updating gradient.", "name", g.Name, "size", len(g.Content), "type", g.ElementType)
 	return o.UpdateParameter(g)
 }
 
@@ -244,6 +260,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 
 	opt, ok := s.optMap[name]
 	if !ok {
+		log.Warn("trainer wants to get a parameter that does not exist.", "name", name)
 		return fmt.Errorf("parameter: %s does not exist", name)
 	}
 
@@ -257,12 +274,13 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
+	log.Info("sending parameter to the trainer", "name", parameter.Name, "size", len(parameter.Content), "type", parameter.ElementType)
 	return nil
 }
 
 func traceTime(start time.Time, name string) {
 	elapsed := time.Since(start)
-	log.Infof("%s took %v", name, elapsed)
+	log.Info("time elapsed", log.Ctx{"name": name, "elapsed": elapsed})
 }
 
 // checkpoint saves checkpoint to disk.
@@ -270,7 +288,7 @@ func traceTime(start time.Time, name string) {
 // checkpoint should be only called after the parameters are
 // initialized.
 func (s *Service) checkpoint() (err error) {
-	log.Infoln("Begin save checkpoint.")
+	log.Info("Begin save checkpoint.")
 	defer traceTime(time.Now(), "save checkpoint")
 
 	s.mu.Lock()
@@ -297,6 +315,13 @@ func (s *Service) checkpoint() (err error) {
 		return
 	}
 
+	if _, err = os.Stat(s.checkpointPath); os.IsNotExist(err) {
+		err = os.MkdirAll(s.checkpointPath, os.ModePerm)
+		if err != nil {
+			return
+		}
+	}
+
 	id := uuid.NewV4().String()
 	p := path.Join(s.checkpointPath, id)
 	f, err := os.Create(p)
@@ -308,7 +333,7 @@ func (s *Service) checkpoint() (err error) {
 		closeErr := f.Close()
 		if closeErr != nil {
 			if err != nil {
-				log.Errorln(closeErr)
+				log.Error("error close checkpoint file", log.Ctx{"error": closeErr})
 			} else {
 				// Set closeErr as return value.
 				err = closeErr
@@ -329,7 +354,7 @@ func (s *Service) checkpoint() (err error) {
 
 	oldMeta, err := loadMeta(s.client, s.idx)
 	if err == ErrCheckpointNotFound {
-		log.Infoln("Do not have existing checkpoint.")
+		log.Info("Do not have existing checkpoint.")
 		err = nil
 	}
 
@@ -361,7 +386,7 @@ func (s *Service) checkpoint() (err error) {
 		if rmErr != nil {
 			// log error, but still treat checkpoint as
 			// successful.
-			log.Errorln(rmErr)
+			log.Error("remove old meta file error", log.Ctx{"error": rmErr})
 		}
 	}
 
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index b435de80a224571d16efdee168541aa301c3f73a..7d2becbdd772747d77890321fce6721d8d17fb30 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,27 +1,32 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
-add_subdirectory(testing)
 add_subdirectory(math)
-add_subdirectory(parameter)
 add_subdirectory(gserver)
-add_subdirectory(pserver)
-add_subdirectory(trainer)
-add_subdirectory(scripts)
-add_subdirectory(string)
-
-if(Boost_FOUND)
-  add_subdirectory(memory)
-  add_subdirectory(platform)
-  add_subdirectory(framework)
-  add_subdirectory(operators)
-  add_subdirectory(pybind)
-endif()
+add_subdirectory(parameter)
+add_subdirectory(testing)
 
-if(WITH_C_API)
+if(MOBILE_INFERENCE)  # mobile inference: only capi beyond the dirs above
   add_subdirectory(capi)
-endif()
+else()  # otherwise: add the remaining subdirectories, some behind options
+  add_subdirectory(pserver)
+  add_subdirectory(trainer)
+  add_subdirectory(string)
+  add_subdirectory(scripts)
+
+  if(WITH_C_API)
+    add_subdirectory(capi)
+  endif()
+
+  if(Boost_FOUND)
+    add_subdirectory(memory)
+    add_subdirectory(platform)
+    add_subdirectory(framework)
+    add_subdirectory(operators)
+    add_subdirectory(pybind)
+  endif()
 
-if(WITH_SWIG_PY)
-  add_subdirectory(api)
+  if(WITH_SWIG_PY)
+    add_subdirectory(api)
+  endif()
 endif()
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index d7b3d2bdec1687425df804c0d56d568241f9e8b0..d6b8464100d4497876aa3f6f7cbc666aafae4bfc 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -26,7 +26,7 @@ FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 
 SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign")
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-parentheses-equality -Wno-missing-field-initializers -Wno-self-assign -ftls-model=global-dynamic")
 
 SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
     paddle_parameter
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index d369df5d4e04b4a8d822db0e72a8051150868ce6..11bd05c09d1ecbbcec6b6596c16416c26635a072 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -47,7 +47,7 @@ bool isUsingGpu() { return FLAGS_use_gpu; }
 void setUseGpu(bool useGpu) { FLAGS_use_gpu = useGpu; }
 
 bool isGpuVersion() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
index b9bbe58951c643f1b1649858880fbd2ba3a2a7b7..e767856d5012fd205f6b57f9721d0cbca8dc46ed 100644
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -28,25 +28,37 @@ add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
 
 add_dependencies(paddle_capi paddle_proto)
 
-# combine all paddle static libraries together, into libpaddle_capi_whole.a
-# user should use PaddleCAPI as -lpaddle_capi_whole
-set(PADDLE_CAPI_INFER_LIBS
-    paddle_utils
-    paddle_parameter
-    paddle_math
-    paddle_cuda
-    paddle_function
-    paddle_gserver
-    paddle_proto
-    paddle_pserver
-    paddle_network)
-
+# TODO: paddle_capi_whole will be removed.
+#
+# PADDLE_CAPI_INFER_LIBS lists the libraries that the cc_library() call below
+# combines with paddle_capi into paddle_capi_whole. Mobile inference builds
+# get a shorter list than regular builds.
+#
+# Libraries used by every configuration:
+set(PADDLE_CAPI_INFER_LIBS
+    paddle_utils
+    paddle_parameter
+    paddle_math
+    paddle_cuda
+    paddle_function
+    paddle_gserver
+    paddle_proto)
+
+# Libraries added only when MOBILE_INFERENCE is off:
+if(NOT MOBILE_INFERENCE)
+    list(APPEND PADDLE_CAPI_INFER_LIBS
+        paddle_pserver
+        paddle_network)
+endif()
 cc_library(paddle_capi_whole DEPS paddle_capi ${PADDLE_CAPI_INFER_LIBS})
 
-# No shared library for iOS
+# Link the static library for inference
+cc_library(paddle_capi_engine DEPS paddle_capi paddle_utils paddle_parameter paddle_math paddle_cuda paddle_proto)
+cc_library(paddle_capi_layers DEPS paddle_function paddle_gserver)
+
+# Link the shared library for inference
 if(NOT IOS)
-  set(LINK_FLAGS " -Wl,--retain-symbols-file ${CMAKE_CURRENT_SOURCE_DIR}/export.sym -Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/export.map")
-  # TODO: merge mkl into paddle_capi_shared
+  set(LINK_FLAGS "-Wl,--version-script ${CMAKE_CURRENT_SOURCE_DIR}/paddle_capi.map")
   add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
   set_target_properties(paddle_capi_shared	PROPERTIES LINK_FLAGS "${LINK_FLAGS}")
   target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
@@ -55,9 +67,10 @@ endif()
 
 # install library & headers.
 install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES paddle_capi.map DESTINATION include/paddle)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
 if(ANDROID)
-  install(TARGETS paddle_capi_whole paddle_capi_shared
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers paddle_capi_shared
           ARCHIVE DESTINATION lib/${ANDROID_ABI}
           LIBRARY DESTINATION lib/${ANDROID_ABI})
   execute_process(
@@ -82,7 +95,7 @@ if(ANDROID)
       )"
   )
 else(ANDROID)
-  install(TARGETS paddle_capi_whole ARCHIVE DESTINATION lib)
+  install(TARGETS paddle_capi_whole paddle_capi_engine paddle_capi_layers ARCHIVE DESTINATION lib)
   if(NOT IOS)
     install(TARGETS paddle_capi_shared DESTINATION lib)
   endif()
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
index d898ebe2612d749ca261d35139d1cd45bd355eef..4547afaf1dc9af8bc7909a684db766fdd7b159c0 100644
--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -46,7 +46,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
   if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
   paddle::real* buf = ptr->mat->getRowBuf(rowID);
   size_t width = ptr->mat->getWidth();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
 #else
   std::copy(rowArray, rowArray + width, buf);
diff --git a/paddle/capi/export.sym b/paddle/capi/export.sym
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/paddle/capi/export.map b/paddle/capi/paddle_capi.map
similarity index 100%
rename from paddle/capi/export.map
rename to paddle/capi/paddle_capi.map
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt
index 8208808b94f54f2ddaf4d426a65b8db562b36aca..bb38ace62808db5ce95a1a57ff465e8edc059213 100644
--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -4,11 +4,12 @@ add_unittest(capi_test_mats test_Vector.cpp
 target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
 target_link_libraries(capi_test_mats paddle_capi)
 
-
-add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
-target_include_directories(capi_test_gradientMachine PUBLIC
-  ${PADDLE_CAPI_INC_PATH})
-target_link_libraries(capi_test_gradientMachine paddle_capi)
-add_test(NAME capi_test_gradientMachine
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
+if(NOT MOBILE_INFERENCE)
+    add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
+    target_include_directories(capi_test_gradientMachine PUBLIC
+      ${PADDLE_CAPI_INC_PATH})
+    target_link_libraries(capi_test_gradientMachine paddle_capi)
+    add_test(NAME capi_test_gradientMachine
+      COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
+endif()
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 4aaa43d79612111856dd4dfc954ca2bfd8f4fa63..85374a476d51dc4c0e22793e8b53d6d7ba21c8da 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -1,4 +1,7 @@
 # ddim lib
+proto_library(framework_proto SRCS framework.proto)
+proto_library(saver_proto SRCS framework.proto saver.proto)
+
 cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
@@ -7,8 +10,8 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
 
-cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor)
-cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
+cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor saver_proto framework_proto)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor paddle_memory)
 nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
 
 cc_test(variable_test SRCS variable_test.cc)
@@ -16,20 +19,18 @@ cc_test(variable_test SRCS variable_test.cc)
 cc_library(scope SRCS scope.cc)
 cc_test(scope_test SRCS scope_test.cc DEPS scope)
 
-proto_library(framework_proto SRCS framework.proto)
 
 cc_library(attribute SRCS attribute.cc DEPS framework_proto)
-cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute)
+cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope)
+cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS attribute ddim op_info operator)
 
-cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS operator)
-cc_library(op_registry SRCS op_registry.cc DEPS grad_op_builder op_proto_maker op_info)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator glog proto_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
-cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
 
 py_proto_compile(framework_py_proto SRCS framework.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
@@ -43,3 +44,16 @@ add_custom_command(TARGET framework_py_proto POST_BUILD
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward recurrent_op device_context)
+
+cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto backward glog)
+
+cc_library(prune SRCS prune.cc DEPS framework_proto)
+cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
+
+cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor)
+cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place)
+
+cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
+        proto_desc)
+cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
+cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
diff --git a/paddle/framework/attribute.cc b/paddle/framework/attribute.cc
index d6a2975aaa419406aef7b228e78381dbce78890d..29fe352ca450740e55ee87b63392e3aabac8aa40 100644
--- a/paddle/framework/attribute.cc
+++ b/paddle/framework/attribute.cc
@@ -19,19 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-static ProgramDesc* g_program_desc = nullptr;
-
-ProgramDesc& GetProgramDesc() {
-  if (g_program_desc == nullptr) {
-    g_program_desc = new ProgramDesc();
-    auto root_block = g_program_desc->mutable_blocks()->Add();
-    root_block->set_idx(0);
-    root_block->set_parent_idx(-1);
-  }
-  return *g_program_desc;
-}
-
-Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
+Attribute GetAttrValue(const OpDesc::Attr& attr_desc, ProgramDesc* program) {
   switch (attr_desc.type()) {
     case framework::AttrType::BOOLEAN: {
       return attr_desc.b();
@@ -74,7 +62,9 @@ Attribute GetAttrValue(const OpDesc::Attr& attr_desc) {
       return val;
     }
     case framework::AttrType::BLOCK: {
-      return GetProgramDesc().mutable_blocks(attr_desc.block_idx());
+      PADDLE_ENFORCE(program != nullptr,
+                     "Need to specify ProgramDesc when get a block attr");
+      return program->mutable_blocks(attr_desc.block_idx());
     }
   }
   PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index c7559cefb6415ee141f32e4357459653564cd2ac..9744662b8f7229b0b17e910ae5cd997fa7d31e06 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -21,29 +21,18 @@ limitations under the License. */
 #include <vector>
 
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/platform/enforce.h"
-#include "paddle/platform/variant.h"
 
 namespace paddle {
 namespace framework {
-
-// The order should be as same as framework.proto
-typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
-                       std::vector<float>, std::vector<std::string>, bool,
-                       std::vector<bool>, BlockDesc*>
-    Attribute;
-
-typedef std::unordered_map<std::string, Attribute> AttributeMap;
-
-ProgramDesc& GetProgramDesc();
-
 template <typename T>
 inline AttrType AttrTypeID() {
   Attribute tmp = T();
   return static_cast<AttrType>(tmp.which() - 1);
 }
 
-Attribute GetAttrValue(const OpDesc::Attr& attr_desc);
+Attribute GetAttrValue(const OpDesc::Attr& attr_desc, ProgramDesc* desc);
 
 class AttrReader {
  public:
@@ -128,6 +117,57 @@ class EnumInContainer {
   std::unordered_set<T> container_;
 };
 
+// Fetches a pointer to the T held by an Attribute; throws an error naming the
+// attribute on a type mismatch. The name is copied, so temporaries are safe.
+template <typename T>
+struct ExtractAttribute {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  T* operator()(Attribute& attr) const {
+    T* attr_value = boost::get<T>(&attr);
+    if (attr_value == nullptr) {
+      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
+                   attr_name_, typeid(T).name(), attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string attr_name_;
+};
+
+// Special handling for bool.
+// FIXME(yuyang18): Currently the Python binding casts bool into int, and it is
+// hard to change the logic there. Meanwhile, we should still handle it
+// correctly when the user sets `some_flag=1`.
+//
+// FIX ME anytime if there is a better solution.
+//
+// An int or float value is converted to bool in place before extraction.
+template <>
+struct ExtractAttribute<bool> {
+  explicit ExtractAttribute(const std::string& attr_name)
+      : attr_name_(attr_name) {}
+
+  bool* operator()(Attribute& attr) const {
+    if (attr.type() == typeid(int)) {  // NOLINT
+      int val = boost::get<int>(attr);
+      attr = static_cast<bool>(val);
+    } else if (attr.type() == typeid(float)) {  // NOLINT
+      float val = boost::get<float>(attr);
+      attr = static_cast<bool>(val);
+    }
+    bool* attr_value = boost::get<bool>(&attr);
+    if (attr_value == nullptr) {
+      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
+                   attr_name_, attr.type().name());
+    }
+    return attr_value;
+  }
+
+  const std::string attr_name_;  // copied so temporaries cannot dangle
+};
+
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
@@ -179,9 +219,10 @@ class TypedAttrChecker {
       attr_map[attr_name_] = val;
     }
     Attribute& attr = attr_map.at(attr_name_);
-    T& attr_value = boost::get<T>(attr);
+    ExtractAttribute<T> extract_attr(attr_name_);
+    T* attr_value = extract_attr(attr);
     for (const auto& checker : value_checkers_) {
-      checker(attr_value);
+      checker(*attr_value);
     }
   }
 
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index 0ec18de5b8a0e7cebdb91c30d2b45596b02dfa51..1ae7fb60f01e4925ceb310f661171eb231eb6c96 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -13,17 +13,51 @@
    limitations under the License. */
 
 #include "paddle/framework/backward.h"
+#include "paddle/operators/net_op.h"
 
+#include <deque>
 #include <list>
 #include <memory>
 
+#include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 
 namespace paddle {
 namespace framework {
 
+// Builds the gradient operator(s) of `op` through its registered GradOpMaker.
+// A single gradient op is returned directly; several are wrapped in a NetOp.
+static inline std::unique_ptr<OperatorBase> CreateGradOp(
+    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
+    std::unordered_map<std::string, std::string>* grad_to_var) {
+  OpDescBind op_desc;
+  op_desc.SetInputMap(op.Inputs());
+  op_desc.SetOutputMap(op.Outputs());
+  op_desc.SetType(op.Type());
+  op_desc.SetAttrMap(op.Attrs());
+  auto& info = OpInfoMap::Instance().Get(op.Type());
+  auto grad_descs = info.GradOpMaker()(op_desc, no_grad_set, grad_to_var);
+  std::vector<std::unique_ptr<OperatorBase>> grad_ops;
+  grad_ops.reserve(grad_descs.size());
+  for (const auto& grad_desc : grad_descs) {
+    grad_ops.emplace_back(OpRegistry::CreateOp(*grad_desc));
+  }
+  PADDLE_ENFORCE(!grad_ops.empty());
+  if (grad_ops.size() == 1) {
+    return std::move(grad_ops[0]);
+  }
+  // Owned by a smart pointer from the start so a throw below cannot leak it.
+  std::unique_ptr<operators::NetOp> net_op(new operators::NetOp());
+  for (auto& grad_op : grad_ops) {
+    net_op->AppendOp(std::move(grad_op));
+  }
+  net_op->CompleteAddOp();
+  return std::unique_ptr<OperatorBase>(std::move(net_op));
+}
+
 template <typename Map, typename T>
 static void ForEachVarName(const Map& names, T callback) {
   for (auto& name : names) {
@@ -66,7 +100,9 @@ static std::unique_ptr<OperatorBase> NOP() {
 //  See Backward.h for details
 static std::unique_ptr<OperatorBase> BackwardRecursive(
     const OperatorBase& forwardOp,
-    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
+    std::unordered_set<std::string>& no_grad_names,
+    std::unordered_map<std::string, std::string>* grad_to_var,
+    size_t& uniq_id) {
   //  If all input gradients of forwarding operator do not need to calculate,
   //  just return an NOP. Not return null ptr because NOP does not take
   //  too much time for calculation, but it is useful for simplifying logic.
@@ -104,7 +140,7 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
     for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
          ++it, ++local_op_id) {
       auto& fwd = *it;
-      auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
+      auto bwd = BackwardRecursive(*fwd, no_grad_names, grad_to_var, uniq_id);
       ForEachVarName(bwd->Outputs(),
                      [&dup_output_ops, local_op_id](const std::string& out) {
                        dup_output_ops[out].emplace_back(local_op_id);
@@ -140,13 +176,14 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
                               std::to_string(i));
         net->ops_[op_offset]->Rename(name, dup_outputs.back());
       }
-      // collect all the offset to append `add` op for each alias
+      // collect all the offset for each alias,
+      // insert a sum operator to add all aliases to output
       insert_position.push_back(
-          {dup_op.back(), OpRegistry::CreateOp("add", {{"X", {dup_outputs}}},
+          {dup_op.back(), OpRegistry::CreateOp("sum", {{"X", dup_outputs}},
                                                {{"Out", {name}}}, {})});
     }
 
-    // make sure the inserted `add` ops follow the BFS order.
+    // make sure the inserted `sum` ops follow the BFS order.
     insert_position.sort(
         [](const Pos& l, const Pos& r) { return l.first > r.first; });
 
@@ -154,7 +191,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
       net->InsertOp(pos.first + 1, std::move(pos.second));
     }
   } else {
-    std::unique_ptr<OperatorBase> grad_op(OpRegistry::CreateGradOp(forwardOp));
+    std::unique_ptr<OperatorBase> grad_op(
+        CreateGradOp(forwardOp, no_grad_names, grad_to_var));
 
     ForEachVarName(grad_op->Inputs(), [&no_grad_names, &net, &grad_op](
                                           const std::string& grad_input) {
@@ -182,8 +220,8 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
 
     // process recurrent gradient op as a special operator.
     if (forwardOp.Type() == "recurrent") {
-      // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), or
-      // this will result in infinite loop.
+      // NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
+      // or this will result in infinite loop.
       const auto& rnnop =
           *static_cast<const operators::RecurrentOp*>(&forwardOp);
       auto rnn_grad_op =
@@ -192,7 +230,19 @@ static std::unique_ptr<OperatorBase> BackwardRecursive(
           *static_cast<const OperatorBase*>(&rnnop.stepnet());
       // create stepnet's gradient op
       rnn_grad_op->set_stepnet(
-          BackwardRecursive(stepnet_op, no_grad_names, uniq_id));
+          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
+    } else if (forwardOp.Type() == "dynamic_recurrent") {
+      // NOTE clean up cycle call somewhere (RNN's stepnet contains itself),
+      // or this will result in infinite loop.
+      const auto& rnnop =
+          *static_cast<const operators::DynamicRecurrentOp*>(&forwardOp);
+      auto rnn_grad_op =
+          static_cast<operators::DynamicRecurrentGradientOp*>(grad_op.get());
+      const auto& stepnet_op =
+          *static_cast<const OperatorBase*>(&rnnop.rnn.GetStepUnit());
+      // create stepnet's gradient op
+      rnn_grad_op->rnn.SetStepUnit(
+          BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id));
     }
 
     if (net->ops_.empty()) {  // Current no aux op is added to network
@@ -219,7 +269,228 @@ std::unique_ptr<OperatorBase> Backward(
     no_grad_names.insert(name + kGradVarSuffix);
   }
   size_t uid = 0;
-  return BackwardRecursive(forwardOp, no_grad_names, uid);
+  std::unordered_map<std::string, std::string> grad_to_var;
+  return BackwardRecursive(forwardOp, no_grad_names, &grad_to_var, uid);
+}
+
+// ====================================  //
+
+static bool AllGradInSet(const std::vector<std::string>& names,
+                         const std::unordered_set<std::string>& set) {
+  for (const std::string& name : names) {
+    if (!set.count(GradVarName(name))) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static void CreateGradVarInBlock(
+    size_t grad_op_start_index,
+    const std::unordered_map<std::string, std::string>& param_name_map,
+    BlockDescBind* block_desc,
+    std::unordered_map<std::string, GradVarInfo>* grad_var_record) {
+  auto ops = block_desc->AllOps();
+  for (size_t op_index = grad_op_start_index; op_index < ops.size();
+       ++op_index) {
+    bool need_infer_shape = false;
+    ForEachVarName(ops[op_index]->Outputs(),
+                   [&](const std::string& grad_var_name) {
+                     if (block_desc->HasVar(grad_var_name)) {
+                       return false;
+                     }
+                     need_infer_shape = true;
+                     auto var = block_desc->Var(grad_var_name);
+                     // FIXME(qiao) infer the datatype
+                     var->SetDataType(framework::DataType::FP32);
+                     auto it = param_name_map.find(grad_var_name);
+                     if (it == param_name_map.end()) {
+                       return false;
+                     }
+                     auto param_var_name = it->second;
+                     auto& grad_record = (*grad_var_record)[param_var_name];
+                     grad_record.name_ = grad_var_name;
+                     grad_record.block_idx_ = block_desc->ID();
+                     grad_record.op_idx_ = static_cast<int>(op_index);
+                     return false; /* not break */
+                   });
+    if (need_infer_shape) {
+      ops[op_index]->InferShape(*block_desc);
+    }
+  }
+}
+
+std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
+    const OpDescBind* op_desc, std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var) {
+  std::vector<std::unique_ptr<OpDescBind>> grad_op_descs;
+  // None of the forward operator's input gradients need to be calculated.
+  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
+  if (AllGradInSet(inputs, *no_grad_vars)) {
+    return grad_op_descs;  // empty vector
+  }
+  // None of the forward operator's output gradients need to be calculated.
+  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
+  if (AllGradInSet(outputs, *no_grad_vars)) {
+    for (const std::string& name : inputs) {
+      no_grad_vars->insert(GradVarName(name));
+    }
+    return grad_op_descs;  // empty vector
+  }
+
+  grad_op_descs = OpInfoMap::Instance()
+                      .Get(op_desc->Type())
+                      .GradOpMaker()(*op_desc, *no_grad_vars, grad_to_var);
+
+  std::list<std::unique_ptr<OpDescBind>> pending_fill_zeros_ops;
+  for (auto& desc : grad_op_descs) {
+    for (const std::string& in_name : desc->InputArgumentNames()) {
+      if (no_grad_vars->count(in_name)) {
+        std::string prefix = in_name.substr(
+            0, in_name.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
+        std::string new_name = prefix + kZeroVarSuffix;
+        desc->Rename(in_name, new_name);
+        std::unique_ptr<OpDescBind> fill_zeros_op(new OpDescBind(
+            "fill_zeros_like", {{"X", {prefix}}}, {{"Y", {new_name}}}, {}));
+        pending_fill_zeros_ops.push_back(std::move(fill_zeros_op));
+      }
+    }
+  }
+
+  for (auto& p : pending_fill_zeros_ops) {
+    grad_op_descs.insert(grad_op_descs.begin(), std::move(p));
+  }
+  return grad_op_descs;
+}
+
+std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
+    ProgramDescBind& program_desc, int block_idx,
+    std::unordered_set<std::string>* no_grad_vars,
+    std::unordered_map<std::string, std::string>* grad_to_var) {
+  BlockDescBind* cur_block = program_desc.Block(block_idx);
+  std::vector<OpDescBind*> op_descs = cur_block->AllOps();
+  std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
+  size_t grad_desc_idx = 0;
+  std::vector<std::unique_ptr<OpDescBind>> backward_descs;
+  // Visit the forward ops last-to-first, appending each op's gradient ops.
+  for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    std::vector<std::unique_ptr<OpDescBind>> op_grads =
+        MakeOpGrad(*it, no_grad_vars, grad_to_var);
+    // A recurrent op also needs a backward block built from its step block.
+    if ((*it)->Type() == "recurrent") {
+      PADDLE_ENFORCE_EQ(
+          op_grads.size(), static_cast<size_t>(1),
+          "rnn_op's gradient process should contain only one op.");
+      int step_block_idx = (*it)->GetBlockAttr("step_block");
+      auto backward_block_op_descs = MakeBlockBackward(
+          program_desc, step_block_idx, no_grad_vars, grad_to_var);
+      BlockDescBind* backward_block = program_desc.AppendBlock(*cur_block);
+      for (auto& ptr : backward_block_op_descs) {
+        backward_block->AppendAllocatedOp(std::move(ptr));
+      }
+      op_grads[0]->SetBlockAttr("step_block", *backward_block);
+    }
+    // Remember which generated op (by index) writes each output name.
+    for (const auto& desc : op_grads) {
+      for (const std::string& out_name : desc->OutputArgumentNames()) {
+        dup_out_ops[out_name].emplace_back(grad_desc_idx);
+      }
+      ++grad_desc_idx;
+    }
+    std::transform(
+        op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
+        [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
+  }
+  // Check whether some variables are written more than once
+  std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
+  for (const auto& dup : dup_out_ops) {
+    const std::string& out_name = dup.first;
+    const std::vector<size_t>& dup_op = dup.second;
+    if (out_name != kEmptyVarName && dup_op.size() > 1) {
+      std::vector<std::string> sum_op_inputs;
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        std::string new_name = out_name + "@RENAME@" + std::to_string(i);
+        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        sum_op_inputs.emplace_back(new_name);
+      }
+      std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
+          "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
+      pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
+    }
+  }
+  pending_sum_ops.sort(
+      [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
+         const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
+        return a.first > b.first;
+      });  // descending, so each insert leaves smaller positions unshifted
+  for (auto& p : pending_sum_ops) {
+    backward_descs.insert(backward_descs.begin() + p.first + 1,
+                          std::move(p.second));
+  }
+
+  return backward_descs;
+}
+
+ParamGradInfoMap AppendBackward(
+    ProgramDescBind& program_desc, const VarDescBind& target,
+    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_var_names;
+  no_grad_var_names.reserve(no_grad_vars.size() + 1);
+  no_grad_var_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);
+  for (auto& name : no_grad_vars) {
+    no_grad_var_names.insert(GradVarName(name));
+  }
+
+  const int root_block_idx = 0;
+  auto root_block = program_desc.Block(root_block_idx);
+
+  // insert fill one op for target
+  // TODO(qiao) add some check to the target.
+  std::string fill_one_op_out = GradVarName(target.Name());
+  std::vector<int64_t> target_shape_desc = target.Shape();
+  std::vector<int> target_shape;
+  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
+                 std::back_inserter(target_shape),
+                 [](int64_t dim) { return static_cast<int>(dim); });
+  std::unique_ptr<OpDescBind> fill_one_op(
+      new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
+                     {{"shape", target_shape},
+                      {"value", static_cast<float>(1.0)},
+                      {"data_type", framework::DataType::FP32}}));
+  root_block->AppendAllocatedOp(std::move(fill_one_op));
+  size_t forward_op_num = root_block->OpSize();
+  size_t forward_block_num = program_desc.Size();
+
+  // Insert backward operators
+  std::unordered_map<std::string, std::string> grad_to_var;
+  auto backward_op_descs = MakeBlockBackward(program_desc, root_block_idx,
+                                             &no_grad_var_names, &grad_to_var);
+
+  for (auto& ptr : backward_op_descs) {
+    root_block->AppendAllocatedOp(std::move(ptr));
+  }
+  // Create Variable
+
+  // Create target gradient variable
+  std::unordered_map<std::string, GradVarInfo> retv;
+
+  auto var = root_block->Var(fill_one_op_out);
+  // FIXME(qiao) infer the data type
+  var->SetDataType(framework::DataType::FP32);
+  var->SetShape(target.Shape());
+  auto& target_grad = retv[target.Name()];
+  target_grad.name_ = fill_one_op_out;
+  target_grad.block_idx_ = root_block_idx;
+  target_grad.op_idx_ = static_cast<int>(forward_op_num);
+
+  // create grad_var for all blocks in this program
+  CreateGradVarInBlock(forward_op_num, grad_to_var, root_block, &retv);
+  for (size_t block_index = forward_block_num;
+       block_index < program_desc.Size(); ++block_index) {
+    CreateGradVarInBlock(0, grad_to_var, program_desc.Block(block_index),
+                         &retv);
+  }
+  return retv;
 }
 
 }  // namespace framework
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
index 1ecf69881b3126c2904920b9f4b77bfcccc9cf86..96154fa82cb7a486aa4762ae633982ed6735220b 100644
--- a/paddle/framework/backward.h
+++ b/paddle/framework/backward.h
@@ -13,8 +13,14 @@
    limitations under the License. */
 
 #pragma once
+
+#include <string>
+#include <unordered_map>
 #include <unordered_set>
-#include "operator.h"
+
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
 namespace paddle {
 namespace framework {
 
@@ -23,5 +29,28 @@ namespace framework {
 extern std::unique_ptr<OperatorBase> Backward(
     const OperatorBase& forwardOp,
     const std::unordered_set<std::string>& no_grad_vars);
+
+struct GradVarInfo {
+  GradVarInfo() : block_idx_(-1), op_idx_(-1) {}  // -1: not yet assigned
+  GradVarInfo(const std::string& name, int block_idx, int op_idx)
+      : name_(name), block_idx_(block_idx), op_idx_(op_idx) {}
+
+  bool operator==(const GradVarInfo& b) const {
+    return name_ == b.name_ && block_idx_ == b.block_idx_ &&
+           op_idx_ == b.op_idx_;
+  }
+
+  std::string name_;
+  int block_idx_;
+  int op_idx_;
+};
+
+using ParamGradInfoMap = std::unordered_map<std::string /*fwd_var_name*/,
+                                            GradVarInfo /*grad_var_info*/>;
+
+ParamGradInfoMap AppendBackward(
+    ProgramDescBind& program_desc, const VarDescBind& target,
+    const std::unordered_set<std::string>& no_grad_vars);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
index 6932f5b989a3e21ebc44ec4fec9f5223f2547d7a..10301f7e39423c8ff0eba33277edecab14c119bf 100644
--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -15,30 +15,57 @@
 #include "paddle/framework/backward.h"
 
 #include <gtest/gtest.h>
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/var_desc.h"
 #include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace framework {
 
-using OperatorBase = framework::OperatorBase;
-using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
-using OpProto = framework::OpProto;
-using OpAttrChecker = framework::OpAttrChecker;
-using Scope = framework::Scope;
 using DeviceContext = platform::DeviceContext;
 
+class NoneOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {}
+};
+
+template <typename Place, typename T>
+class NoneKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {}
+};
+
 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
  public:
   RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "Input X of Add").NotInGradient();
-    AddInput("b", "Bias of Add").NotInGradient();
-    AddOutput("Out", "Out of Add").NotInGradient();
+    AddInput("X", "Input X of Add");
+    AddInput("b", "Bias of Add");
+    AddOutput("Out", "Out of Add");
     AddComment("Add Op");
   }
 };
 
+class RowWiseAddGradMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  std::unique_ptr<OpDescBind> Apply() const override {
+    std::unique_ptr<OpDescBind> grad_op(new OpDescBind());  // owned at once
+    grad_op->SetInput(GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(GradVarName("X"), InputGrad("X"));
+    grad_op->SetOutput(GradVarName("b"), InputGrad("b"));
+    grad_op->SetType("rowwise_add_grad");
+    return grad_op;
+  }
+};
+
 class MulOpMaker : public OpProtoAndCheckerMaker {
  public:
   MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
@@ -46,6 +73,8 @@ class MulOpMaker : public OpProtoAndCheckerMaker {
     AddInput("X", "A");
     AddInput("Y", "B");
     AddOutput("Out", "Out");
+    AddAttr<int>("x_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
+    AddAttr<int>("y_num_col_dims", "").SetDefault(1).EqualGreaterThan(1);
     AddComment("Mul");
   }
 };
@@ -133,42 +162,118 @@ class FillZeroOpMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-class AddOpMaker : public OpProtoAndCheckerMaker {
+class SumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SumOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "x").AsDuplicable();
-    AddOutput("Out", "out");
+    AddInput("X", "the input tensors of sum operator.").AsDuplicable();
+    AddOutput("Out", "the output tensor of sum operator.");
     AddComment("");
   }
 };
+
+class MultInOutOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MultInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("H", "h");
+    AddOutput("Y", "y");
+    AddOutput("Z", "z");
+    AddComment("");
+  }
+};
+
+class MinusGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+  // Emits one "scale" op per non-empty input gradient: 1.0 for X, -1.0 for Y.
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
+    std::vector<std::unique_ptr<OpDescBind>> retv;
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      std::unique_ptr<OpDescBind> op_desc(new OpDescBind());
+      op_desc->SetType("scale");
+      op_desc->SetInput("X", OutputGrad("Out"));
+      op_desc->SetOutput("Out", x_g);
+      op_desc->SetAttr("scale", 1.0f);
+      retv.emplace_back(std::move(op_desc));
+    }
+
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      std::unique_ptr<OpDescBind> op_desc(new OpDescBind());
+      op_desc->SetType("scale");
+      op_desc->SetInput("X", OutputGrad("Out"));
+      op_desc->SetOutput("Out", y_g);
+      op_desc->SetAttr("scale", -1.0f);
+      retv.emplace_back(std::move(op_desc));
+    }
+    return retv;
+  }
+};
+
+class MinusOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MinusOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "");
+    AddInput("Y", "");
+    AddOutput("Out", "");
+    AddComment("minus for unittest");
+  }
+};
 }  // namespace framework
 }  // namespace paddle
 
 namespace f = paddle::framework;
 namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
-REGISTER_OP(rowwise_add, f::NOP, f::RowWiseAddOpMaker, rowwise_add_grad,
-            f::NOP);
-REGISTER_OP(mul, f::NOP, f::MulOpMaker, mul_grad, f::NOP);
-REGISTER_OP(sigmoid, f::NOP, f::SigmoidOpMaker, sigmoid_grad, f::NOP);
-REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NOP, f::NoGradOpMaker);
-REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NOP, f::FillZeroOpMaker);
-REGISTER_OP(add, f::NOP, f::AddOpMaker, add_grad, f::NOP);
+// rowwise_add
+REGISTER_OPERATOR(rowwise_add, f::NoneOp, f::RowWiseAddOpMaker,
+                  f::RowWiseAddGradMaker);
+REGISTER_OP_CPU_KERNEL(rowwise_add,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OPERATOR(rowwise_add_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(rowwise_add_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// mul
+REGISTER_OP(mul, f::NoneOp, f::MulOpMaker, mul_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(mul, f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// sigmoid
+REGISTER_OP(sigmoid, f::NoneOp, f::SigmoidOpMaker, sigmoid_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(sigmoid,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_WITHOUT_GRADIENT(nograd, f::NoneOp, f::NoGradOpMaker);
+// fill_zeros_like
+REGISTER_OP_WITHOUT_GRADIENT(fill_zeros_like, f::NoneOp, f::FillZeroOpMaker);
+REGISTER_OP_CPU_KERNEL(fill_zeros_like,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// sum
+REGISTER_OP(sum, f::NoneOp, f::SumOpMaker, sum_grad, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(sum, f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sum_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// fc
 REGISTER_OP_WITHOUT_GRADIENT(fc, f::FcOp, f::FcOpMaker);
-REGISTER_OP(many_output_op, f::NOP, f::ManyOutputOpMaker, many_output_op_grad,
-            f::NOP);
-
-TEST(Backward, simple_op_grad) {
-  auto fwd = f::OpRegistry::CreateOp(
-      "rowwise_add", {{"X", {"x"}}, {"b", {"b"}}}, {{"Out", {"out"}}}, {});
-  ASSERT_NE(fwd, nullptr);
-  auto gop = f::OpRegistry::CreateGradOp(*fwd);
-  ASSERT_EQ(1UL, gop->Inputs().size());
-  ASSERT_EQ("rowwise_add_grad", gop->Type());
-  ASSERT_EQ(f::GradVarName("x"), gop->Output(f::GradVarName("X")));
-  ASSERT_EQ(f::GradVarName("b"), gop->Output(f::GradVarName("b")));
-}
+// many_output_op
+REGISTER_OP(many_output_op, f::NoneOp, f::ManyOutputOpMaker,
+            many_output_op_grad, f::NoneOp);
+// mult_in_out
+REGISTER_OP(mult_in_out, f::NoneOp, f::MultInOutOpMaker, mult_in_out_grad,
+            f::NoneOp);
+REGISTER_OP_CPU_KERNEL(mult_in_out,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mult_in_out_grad,
+                       f::NoneKernel<paddle::platform::CPUPlace, float>);
+// minus
+REGISTER_OPERATOR(minus, f::NoneOp, f::MinusOpMaker, f::MinusGradOpDescMaker);
+REGISTER_OP_CPU_KERNEL(minus, f::NoneKernel<paddle::platform::CPUPlace, float>);
+// scale
+REGISTER_OPERATOR(scale, f::NoneOp);
+REGISTER_OP_CPU_KERNEL(scale, f::NoneKernel<paddle::platform::CPUPlace, float>);
 
 TEST(Backward, simple_op_not_need_grad) {
   auto fwd = f::OpRegistry::CreateOp(
@@ -283,18 +388,7 @@ TEST(Backward, net_shared_weight) {
   ASSERT_TRUE(bwd->IsNetOp());
   auto bwd_net = static_cast<ops::NetOp *>(bwd.get());
   ASSERT_EQ(3UL, bwd_net->ops_.size());
-  ASSERT_EQ("add", bwd_net->ops_[2]->Type());
-}
-
-TEST(Backward, op_register_grad_not_for_network) {
-  auto fwd =
-      f::OpRegistry::CreateOp("fc", {{"X", {"x"}}, {"W", {"w"}}, {"b", {"b"}}},
-                              {{"mul_result", {"mul_out"}},
-                               {"add_result", {"add_out"}},
-                               {"Out", {"out1"}}},
-                              {{"temporary_index", std::vector<int>{0, 1}}});
-
-  ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
+  ASSERT_EQ("sum", bwd_net->ops_[2]->Type());
 }
 
 TEST(Backward, op_all_input_are_not_need) {
@@ -388,14 +482,418 @@ TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
             2UL       /* external input number */
                 + 1UL /* external output number*/
                 + 1UL /* number of gradient of external output*/
-                + 2U /* internal variable number*/);
+                + 2UL /* internal variable number*/
+            );
   EXPECT_EQ(grad_fc.Outputs(all).size(),
             2UL       /* input number of mul*/
-                + 2UL /* input number of rowwise_add
-                       */
-                + 1UL /* input number of sigmod */);
+                + 2UL /* input number of rowwise_add*/
+                + 1UL /* input number of sigmoid */
+                - 1UL /* out2 is not needed*/);
   EXPECT_EQ(bwd_net->ops_[1]->Inputs(all).size(), 0UL);
   EXPECT_EQ(bwd_net->ops_[1]->Outputs(all).size(), 0UL);
   EXPECT_EQ(bwd_net->ops_[2]->Inputs(all).size(), 0UL);
   EXPECT_EQ(bwd_net->ops_[2]->Outputs(all).size(), 0UL);
 }
+
+TEST(Backward, simple_single_op) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+
+  f::OpDescBind *op = block->AppendOp();
+  op->SetType("rowwise_add");
+  op->SetInput("X", {"x"});
+  op->SetInput("b", {"b"});
+  op->SetOutput("Out", {"out"});
+
+  auto target = f::VarDescBind("out");
+  auto var_to_grad = AppendBackward(program, target, {});
+
+  ASSERT_EQ(block->AllOps().size(), 3UL);
+  f::OpDescBind *fill_op = block->AllOps()[1];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDescBind *grad_op = block->AllOps()[2];
+  EXPECT_EQ(grad_op->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x")}));
+  EXPECT_EQ(grad_op->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b")}));
+
+  EXPECT_EQ(var_to_grad.size(), 3UL);
+  EXPECT_EQ(var_to_grad.at("b"), f::GradVarInfo(f::GradVarName("b"), 0, 2));
+  EXPECT_EQ(var_to_grad.at("x"), f::GradVarInfo(f::GradVarName("x"), 0, 2));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x")));
+}
+
+TEST(Backward, default_attribute) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op = block->AppendOp();
+  op->SetType("mul");
+  op->SetInput("X", {"x"});
+  op->SetInput("Y", {"y"});
+  op->SetOutput("Out", {"out"});
+  op->CheckAttrs();
+
+  auto target = f::VarDescBind("out");
+  AppendBackward(program, target, {});
+
+  ASSERT_EQ(block->AllOps().size(), 3UL);
+  EXPECT_EQ(boost::get<int>(op->GetAttr("x_num_col_dims")), 1);
+  EXPECT_EQ(boost::get<int>(op->GetAttr("y_num_col_dims")), 1);
+
+  f::OpDescBind *fill_op = block->AllOps()[1];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDescBind *grad_op = block->AllOps()[2];
+  ASSERT_EQ(grad_op->Type(), "mul_grad");
+  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("x_num_col_dims")), 1);
+  EXPECT_EQ(boost::get<int>(grad_op->GetAttr("y_num_col_dims")), 1);
+}
+
+TEST(Backward, simple_mult_op) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  auto target = f::VarDescBind("out3");
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {});
+
+  ASSERT_EQ(block->AllOps().size(), 6UL + 1);
+  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  f::OpDescBind *grad_op2 = block->AllOps()[5];
+  EXPECT_EQ(grad_op2->Type(), "mul_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+
+  f::OpDescBind *grad_op3 = block->AllOps()[4];
+  EXPECT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+
+  EXPECT_EQ(var_to_grad.size(), 7UL);
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("out1"),
+            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("out2"),
+            f::GradVarInfo(f::GradVarName("out2"), 0, 4));
+  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out2")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
+}
+
+TEST(Backward, intermedia_var_no_grad) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"x2"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out2"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+
+  f::OpDescBind *op4 = block->AppendOp();
+  op4->SetType("mul");
+  op4->SetInput("X", {"out1"});
+  op4->SetInput("Y", {"out3"});
+  op4->SetOutput("Out", {"out4"});
+
+  auto target = f::VarDescBind("out4");
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {"out3"});
+
+  ASSERT_EQ(block->AllOps().size(), 7UL);
+  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+
+  f::OpDescBind *grad_op1 = block->AllOps()[6];
+  EXPECT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+
+  f::OpDescBind *grad_op4 = block->AllOps()[5];
+  EXPECT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"out3"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out4"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out4")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")), std::vector<std::string>());
+
+  EXPECT_EQ(var_to_grad.size(), 4UL);
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("out1"),
+            f::GradVarInfo(f::GradVarName("out1"), 0, 5));
+
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
+}
+// AppendBackward on two chained mult_in_out ops with {"z1"} as third argument.
+TEST(Backward, var_no_grad) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("mult_in_out");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("H", {"h1"});
+  op1->SetOutput("Y", {"y1"});
+  op1->SetOutput("Z", {"z1"});
+  // op2 consumes both outputs of op1: y1 as X and z1 as H.
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mult_in_out");
+  op2->SetInput("X", {"y1"});
+  op2->SetInput("H", {"z1"});
+  op2->SetOutput("Y", {"y2"});
+  op2->SetOutput("Z", {"z2"});
+  // z2 is the target; forward_len records the op count before the call.
+  auto target = f::VarDescBind("z2");
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {"z1"});
+  // Layout: fwd1, fwd2, fill_constant, grad(op2), fill_zeros_like, grad(op1).
+  ASSERT_EQ(block->AllOps().size(), 6UL);
+  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+  // grad(op2) is at index 3 and emits no gradient for its H input (z1).
+  f::OpDescBind *grad_op2 = block->AllOps()[3];
+  ASSERT_EQ(grad_op2->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op2->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op2->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op2->Input("X"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op2->Input("H"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op2->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op2->Input("Z"), std::vector<std::string>({"z2"}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  EXPECT_EQ(grad_op2->Input(f::GradVarName("Z")),
+            std::vector<std::string>({f::GradVarName("z2")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  EXPECT_EQ(grad_op2->Output(f::GradVarName("H")), std::vector<std::string>());
+  // A fill_zeros_like op maps z1 to a variable named z1 + kZeroVarSuffix.
+  f::OpDescBind *fill_zero_op = block->AllOps()[4];
+  ASSERT_EQ(fill_zero_op->Type(), "fill_zeros_like");
+  ASSERT_EQ(fill_zero_op->InputNames().size(), 1UL);
+  ASSERT_EQ(fill_zero_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(fill_zero_op->Input("X"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(fill_zero_op->Output("Y"),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+  // grad(op1) reads that zero-suffixed variable as the gradient of its Z.
+  f::OpDescBind *grad_op1 = block->AllOps()[5];
+  ASSERT_EQ(grad_op1->Type(), "mult_in_out_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 6UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input("X"), std::vector<std::string>({"x1"}));
+  EXPECT_EQ(grad_op1->Input("H"), std::vector<std::string>({"h1"}));
+  EXPECT_EQ(grad_op1->Input("Y"), std::vector<std::string>({"y1"}));
+  EXPECT_EQ(grad_op1->Input("Z"), std::vector<std::string>({"z1"}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y1")}));
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Z")),
+            std::vector<std::string>({std::string("z1") + f::kZeroVarSuffix}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("H")),
+            std::vector<std::string>({f::GradVarName("h1")}));
+  // The last GradVarInfo argument matches the index of the producing grad op.
+  EXPECT_EQ(var_to_grad.size(), 4UL);
+  EXPECT_EQ(var_to_grad.at("y1"), f::GradVarInfo(f::GradVarName("y1"), 0, 3));
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("h1"), f::GradVarInfo(f::GradVarName("h1"), 0, 5));
+  // The gradient variables must also exist in the block.
+  EXPECT_TRUE(block->HasVar(f::GradVarName("y1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("h1")));
+}
+// AppendBackward when one forward variable (out1) feeds two later ops.
+TEST(Backward, shared_var) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  f::OpDescBind *op1 = block->AppendOp();
+  op1->SetType("rowwise_add");
+  op1->SetInput("X", {"x1"});
+  op1->SetInput("b", {"b1"});
+  op1->SetOutput("Out", {"out1"});
+  // First consumer of out1: a mul op.
+  f::OpDescBind *op2 = block->AppendOp();
+  op2->SetType("mul");
+  op2->SetInput("X", {"out1"});
+  op2->SetInput("Y", {"y2"});
+  op2->SetOutput("Out", {"out2"});
+  // Second consumer of out1: another rowwise_add op.
+  f::OpDescBind *op3 = block->AppendOp();
+  op3->SetType("rowwise_add");
+  op3->SetInput("X", {"out1"});
+  op3->SetInput("b", {"b3"});
+  op3->SetOutput("Out", {"out3"});
+  // out3 is the target; AppendBackward's third argument is empty here.
+  auto target = f::VarDescBind("out3");
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {});
+  // Layout: 3 forward ops, fill_constant, grad(op3), grad(op2), sum, grad(op1).
+  ASSERT_EQ(block->AllOps().size(), 8UL);
+  f::OpDescBind *fill_op = block->AllOps()[forward_len];
+  EXPECT_EQ(fill_op->Type(), "fill_constant");
+  // grad(op3) writes out1's gradient to a renamed variable (@RENAME@0).
+  f::OpDescBind *grad_op3 = block->AllOps()[4];
+  ASSERT_EQ(grad_op3->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op3->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op3->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op3->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out3")}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0"}));
+  EXPECT_EQ(grad_op3->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b3")}));
+  // mul_grad (named grad_op4 here) writes the second renamed copy (@RENAME@1).
+  f::OpDescBind *grad_op4 = block->AllOps()[5];
+  ASSERT_EQ(grad_op4->Type(), "mul_grad");
+  ASSERT_EQ(grad_op4->InputNames().size(), 4UL);
+  ASSERT_EQ(grad_op4->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op4->Input("X"), std::vector<std::string>({"out1"}));
+  EXPECT_EQ(grad_op4->Input("Y"), std::vector<std::string>({"y2"}));
+  EXPECT_EQ(grad_op4->Input("Out"), std::vector<std::string>({"out2"}));
+  EXPECT_EQ(grad_op4->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out2")}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(grad_op4->Output(f::GradVarName("Y")),
+            std::vector<std::string>({f::GradVarName("y2")}));
+  // A sum op takes both renamed copies and outputs out1's gradient variable.
+  f::OpDescBind *sum_op = block->AllOps()[6];
+  ASSERT_EQ(sum_op->Type(), "sum");
+  ASSERT_EQ(sum_op->InputNames().size(), 1UL);
+  ASSERT_EQ(sum_op->OutputNames().size(), 1UL);
+  EXPECT_EQ(sum_op->Input("X"),
+            std::vector<std::string>({f::GradVarName("out1") + "@RENAME@0",
+                                      f::GradVarName("out1") + "@RENAME@1"}));
+  EXPECT_EQ(sum_op->Output("Out"),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  // grad(op1) then reads the summed gradient of out1.
+  f::OpDescBind *grad_op1 = block->AllOps()[7];
+  ASSERT_EQ(grad_op1->Type(), "rowwise_add_grad");
+  ASSERT_EQ(grad_op1->InputNames().size(), 1UL);
+  ASSERT_EQ(grad_op1->OutputNames().size(), 2UL);
+  EXPECT_EQ(grad_op1->Input(f::GradVarName("Out")),
+            std::vector<std::string>({f::GradVarName("out1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("X")),
+            std::vector<std::string>({f::GradVarName("x1")}));
+  EXPECT_EQ(grad_op1->Output(f::GradVarName("b")),
+            std::vector<std::string>({f::GradVarName("b1")}));
+  // The last GradVarInfo argument matches the index of the producing op.
+  EXPECT_EQ(var_to_grad.size(), 6UL);
+  EXPECT_EQ(var_to_grad.at("b3"), f::GradVarInfo(f::GradVarName("b3"), 0, 4));
+  EXPECT_EQ(var_to_grad.at("y2"), f::GradVarInfo(f::GradVarName("y2"), 0, 5));
+  EXPECT_EQ(var_to_grad.at("out1"),
+            f::GradVarInfo(f::GradVarName("out1"), 0, 6));
+  EXPECT_EQ(var_to_grad.at("x1"), f::GradVarInfo(f::GradVarName("x1"), 0, 7));
+  EXPECT_EQ(var_to_grad.at("b1"), f::GradVarInfo(f::GradVarName("b1"), 0, 7));
+  // The gradient variables must also exist in the block.
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b3")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("y2")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("out1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("x1")));
+  EXPECT_TRUE(block->HasVar(f::GradVarName("b1")));
+}
+// AppendBackward on a single minus op with {"b"} passed as third argument.
+TEST(Backward, half_backward) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+  auto *op1 = block->AppendOp();
+  op1->SetType("minus");
+  op1->SetInput("X", {"a"});
+  op1->SetInput("Y", {"b"});
+  op1->SetOutput("Out", {"out"});
+
+  auto target = f::VarDescBind("out");
+  size_t forward_len = block->AllOps().size();
+  auto var_to_grad = AppendBackward(program, target, {"b"});
+  // Assert the op count first so the indexing below cannot run past the end.
+  auto ops = block->AllOps();
+  ASSERT_EQ(3UL, ops.size());
+  EXPECT_EQ(ops[forward_len]->Type(), "fill_constant");
+
+  EXPECT_EQ(var_to_grad.size(), 2UL);
+  EXPECT_EQ(var_to_grad.at("a"),
+            f::GradVarInfo(f::GradVarName("a"), 0, forward_len + 1));
+}
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 9570aedfdda332b797a8f348e0f6cf81bb2aee2f..251e340e6ddcc17ba16bdcab63f2a8c907122eab 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -18,22 +18,42 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-VarDescBind *BlockDescBind::NewVar(const std::string &name) {
-  need_update_ = true;
+VarDescBind *BlockDescBind::Var(const std::string &name) {
   auto it = vars_.find(name);
-  PADDLE_ENFORCE(it == vars_.end(), "Duplicated variable %s", name);
-  auto var = new VarDescBind(name);
+  if (it != vars_.end()) {
+    return it->second.get();
+  }
+  need_update_ = true;
+  auto *var = new VarDescBind(name);
   vars_[name].reset(var);
   return var;
 }
 
-VarDescBind *BlockDescBind::Var(const std::string &name) const {
+VarDescBind *BlockDescBind::FindVar(const std::string &name) const {
+  auto it = vars_.find(name);
+  if (it == vars_.end()) {
+    return nullptr;
+  }
+  return it->second.get();
+}
+
+bool BlockDescBind::HasVar(const std::string &name) const {
+  return vars_.find(name) != vars_.end();
+}
+
+VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
   auto it = vars_.find(name);
-  PADDLE_ENFORCE(it != vars_.end(),
-                 "Can not find variable %s in current block.", name);
+  if (it == vars_.end()) {
+    return Parent() == kNoneBlockIndex ? nullptr
+                                       : ParentBlock()->FindVarRecursive(name);
+  }
   return it->second.get();
 }
 
+bool BlockDescBind::HasVarRecursive(const std::string &name) const {
+  return FindVarRecursive(name) != nullptr;
+}
+
 std::vector<VarDescBind *> BlockDescBind::AllVars() const {
   std::vector<VarDescBind *> res;
   for (const auto &p : vars_) {
@@ -48,6 +68,11 @@ OpDescBind *BlockDescBind::AppendOp() {
   return ops_.back().get();
 }
 
+void BlockDescBind::AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc) {
+  need_update_ = true;
+  ops_.emplace_back(std::move(op_desc));
+}
+
 OpDescBind *BlockDescBind::PrependOp() {
   need_update_ = true;
   ops_.emplace_front(new OpDescBind());
@@ -62,28 +87,68 @@ std::vector<OpDescBind *> BlockDescBind::AllOps() const {
   return res;
 }
 
-void BlockDescBind::Sync() {
+void BlockDescBind::Flush() {
+  for (auto &op_desc : ops_) {
+    op_desc->Flush();
+  }
+
   if (need_update_) {
     auto &op_field = *this->desc_->mutable_ops();
-    op_field.Clear();
+    this->ClearPBOps();
     op_field.Reserve(static_cast<int>(ops_.size()));
     for (auto &op_desc : ops_) {
       op_field.AddAllocated(op_desc->Proto());
     }
+    auto &var_field = *this->desc_->mutable_vars();
+    this->ClearPBVars();
+    var_field.Reserve(static_cast<int>(vars_.size()));
+    for (auto &var_desc : vars_) {
+      var_field.AddAllocated(var_desc.second->Proto());
+    }
     need_update_ = false;
   }
 }
 
 BlockDescBind *BlockDescBind::ParentBlock() const {
-  if (this->desc_->parent_idx() == -1) {
+  if (this->desc_->parent_idx() == kNoneBlockIndex) {
     return nullptr;
   }
   return prog_->Block(static_cast<size_t>(this->desc_->parent_idx()));
 }
 
-void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
-  BlockDesc *desc = block.RawPtr();
-  this->attrs_[name] = desc;
+BlockDesc *BlockDescBind::Proto() {
+  Flush();
+  return desc_;
+}
+BlockDescBind::BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
+                             ProgramDescBind *prog)
+    : prog_(prog), desc_(desc) {
+  need_update_ = true;
+  for (auto &op : other.ops_) {
+    ops_.emplace_back(new OpDescBind(*op));
+  }
+
+  for (auto &it : other.vars_) {
+    auto *var = new VarDescBind(*it.second);
+    vars_[it.first].reset(var);
+  }
+}
+
+void BlockDescBind::ClearPBOps() {
+  auto ops = this->desc_->mutable_ops();
+  while (!ops->empty()) {
+    // we do not own the OpDesc, so release the ownership.
+    ops->ReleaseLast();
+  }
+}
+
+void BlockDescBind::ClearPBVars() {
+  auto vars = this->desc_->mutable_vars();
+  while (!vars->empty()) {
+    // we do not own the VarDesc, so release the ownership.
+    vars->ReleaseLast();
+  }
 }
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 1a1135bab44cd27bb7d784c3b486188aa40635e4..c685050850dc25f346df49b5ce1d897974870460 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -15,10 +15,15 @@ limitations under the License. */
 #pragma once
 
 #include <deque>
+#include <memory>
+#include <set>
 #include <unordered_map>
 #include <vector>
+
 #include "paddle/framework/op_desc.h"
+#include "paddle/framework/proto_desc.h"
 #include "paddle/framework/var_desc.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -34,16 +39,35 @@ class BlockDescBind {
   BlockDescBind(ProgramDescBind *prog, BlockDesc *desc)
       : prog_(prog), desc_(desc), need_update_(false) {}
 
-  BlockDescBind(const BlockDescBind &o) = delete;
-  BlockDescBind &operator=(const BlockDescBind &o) = delete;
+  BlockDescBind(const BlockDescBind &other, BlockDesc *desc,
+                ProgramDescBind *prog);
+
+  ~BlockDescBind() {
+    this->ClearPBVars();
+    this->ClearPBOps();
+  }
 
   int32_t ID() const { return desc_->idx(); }
 
   int32_t Parent() const { return desc_->parent_idx(); }
 
-  VarDescBind *NewVar(const std::string &name_bytes);
+  VarDescBind *Var(const std::string &name_bytes);
+
+  VarDescBind *FindVar(const std::string &name_bytes) const;
+
+  bool HasVar(const std::string &var_name) const;
+
+  VarDescBind *FindVarRecursive(const std::string &name_bytes) const;
+
+  bool HasVarRecursive(const std::string &var_name) const;
 
-  VarDescBind *Var(const std::string &name_bytes) const;
+  std::set<std::string> LocalVarNames() const {
+    std::set<std::string> var_names;
+    for (auto &var : vars_) {
+      var_names.insert(var.first);
+    }
+    return var_names;
+  }
 
   std::vector<VarDescBind *> AllVars() const;
 
@@ -51,13 +75,23 @@ class BlockDescBind {
 
   OpDescBind *AppendOp();
 
+  void AppendAllocatedOp(std::unique_ptr<OpDescBind> &&op_desc);
+
   OpDescBind *PrependOp();
 
   std::vector<OpDescBind *> AllOps() const;
 
-  void Sync();
+  size_t OpSize() const { return ops_.size(); }
+
+  OpDescBind *Op(int idx) { return ops_.at(idx).get(); }
+
+  void Flush();
 
-  BlockDesc *RawPtr() { return desc_; }
+  BlockDesc *Proto();
+
+ private:
+  void ClearPBOps();
+  void ClearPBVars();
 
  private:
   ProgramDescBind *prog_;  // not_own
@@ -66,6 +100,8 @@ class BlockDescBind {
 
   std::deque<std::unique_ptr<OpDescBind>> ops_;
   std::unordered_map<std::string, std::unique_ptr<VarDescBind>> vars_;
+
+  DISABLE_COPY_AND_ASSIGN(BlockDescBind);
 };
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/data_type.h b/paddle/framework/data_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..c25a62c2b11ead614d93a4be8d63d40d0cc0165a
--- /dev/null
+++ b/paddle/framework/data_type.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <typeindex>
+#include "paddle/framework/framework.pb.h"
+
+namespace paddle {
+namespace framework {
+
+// Maps the C++ scalar type identified by `type` to its DataType enum value.
+// Supported types are float, double, int and int64_t; any other type is
+// rejected through PADDLE_THROW.
+inline DataType ToDataType(std::type_index type) {
+  // Compare type identity instead of hash_code(): the C++ standard allows
+  // distinct types to share a hash, so equal hashes do not prove equal types.
+  if (type == std::type_index(typeid(float))) return DataType::FP32;
+  if (type == std::type_index(typeid(double))) return DataType::FP64;
+  if (type == std::type_index(typeid(int))) return DataType::INT32;
+  if (type == std::type_index(typeid(int64_t))) return DataType::INT64;
+  // No supported type matched.
+  PADDLE_THROW("Not supported");
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/details/op_registry.h b/paddle/framework/details/op_registry.h
new file mode 100644
index 0000000000000000000000000000000000000000..357ad21f39f3b1f6dbdb98063f8fb24ec6800ec6
--- /dev/null
+++ b/paddle/framework/details/op_registry.h
@@ -0,0 +1,127 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_proto_maker.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/var_type_inference.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+// Selects which OpInfo field a registration argument T is used to fill.
+enum OpInfoFillType {
+  kOperator = 0,                // T derives from OperatorBase
+  kOpProtoAndCheckerMaker = 1,  // T derives from OpProtoAndCheckerMaker
+  kGradOpDescMaker = 2,         // T derives from GradOpDescMakerBase
+  kVarTypeInference = 3         // T derives from VarTypeInference
+};
+// ID() classifies T by base class; yields OpInfoFillType(-1) if none match.
+template <typename T>
+struct OpInfoFillTypeID {
+  static constexpr OpInfoFillType ID() {
+    return std::is_base_of<OperatorBase, T>::value
+               ? kOperator
+               : (std::is_base_of<OpProtoAndCheckerMaker, T>::value
+                      ? kOpProtoAndCheckerMaker
+                      : (std::is_base_of<GradOpDescMakerBase, T>::value
+                             ? kGradOpDescMaker
+                             : (std::is_base_of<VarTypeInference, T>::value
+                                    ? kVarTypeInference
+                                    : static_cast<OpInfoFillType>(-1))));
+  }
+};
+// Primary template; the specializations below handle each OpInfoFillType.
+template <typename T, OpInfoFillType = OpInfoFillTypeID<T>::ID()>
+struct OpInfoFiller;
+// Applies OpInfoFiller to each type in ARGS..., one index I per instantiation.
+template <size_t I, bool at_end, typename... ARGS>
+class OperatorRegistrarRecursive;
+// Step case: fill `info` from the I-th type, then recurse with I + 1.
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, false, ARGS...> {
+ public:
+  using T = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {
+    OpInfoFiller<T> fill;
+    fill(op_type, info);
+    constexpr auto size = sizeof...(ARGS);
+    OperatorRegistrarRecursive<I + 1, I + 1 == size, ARGS...> reg(op_type,
+                                                                  info);
+    (void)(reg);
+  }
+};
+// Terminating case (at_end == true): nothing left to fill.
+template <size_t I, typename... ARGS>
+class OperatorRegistrarRecursive<I, true, ARGS...> {
+ public:
+  OperatorRegistrarRecursive(const char* op_type, OpInfo* info) {}
+};
+// Operator classes: installs a creator_ that heap-allocates a new T.
+template <typename T>
+struct OpInfoFiller<T, kOperator> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->creator_ = [](const std::string& type, const VariableNameMap& inputs,
+                        const VariableNameMap& outputs,
+                        const AttributeMap& attrs) {
+      return new T(type, inputs, outputs, attrs);
+    };
+  }
+};
+// Proto makers: allocates proto_ and checker_, runs T on them, then checks.
+template <typename T>
+struct OpInfoFiller<T, kOpProtoAndCheckerMaker> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->proto_ = new OpProto;
+    info->checker_ = new OpAttrChecker();
+    auto maker = T(info->proto_, info->checker_);
+    maker.Validate();
+    info->proto_->set_type(op_type);
+    PADDLE_ENFORCE(
+        info->proto_->IsInitialized(),
+        "Fail to initialize %s's OpProto, because %s is not initialized",
+        op_type, info->proto_->InitializationErrorString());
+  }
+};
+// Grad-op makers: installs a grad_op_maker_ that builds T and invokes it.
+template <typename T>
+struct OpInfoFiller<T, kGradOpDescMaker> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->grad_op_maker_ = [](
+        const OpDescBind& fwd_op,
+        const std::unordered_set<std::string>& no_grad_set,
+        std::unordered_map<std::string, std::string>* grad_to_var) {
+      T maker(fwd_op, no_grad_set, grad_to_var);
+      return maker();
+    };
+  }
+};
+// Var-type inference: installs infer_var_type_ that runs a fresh T on the op.
+template <typename T>
+struct OpInfoFiller<T, kVarTypeInference> {
+  void operator()(const char* op_type, OpInfo* info) const {
+    info->infer_var_type_ = [](const OpDescBind& fwd_op, BlockDescBind* block) {
+      T inference;
+      inference(fwd_op, block);
+    };
+  }
+};
+
+}  // namespace details
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1f1e4edda823d62b169422672c855d96a2bd2ede
--- /dev/null
+++ b/paddle/framework/executor.cc
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/executor.h"
+
+#include <algorithm>
+#include <iostream>
+#include <memory>
+#include <set>
+#include <vector>
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/scope.h"
+
+namespace paddle {
+namespace framework {
+
+const std::string kFeedOpType = "feed";    // not referenced in this file
+const std::string kFetchOpType = "fetch";  // not referenced in this file
+// Creates one device context per entry of `places`, which must be non-empty.
+Executor::Executor(const std::vector<platform::Place>& places) {
+  PADDLE_ENFORCE_GT(places.size(), 0);
+  device_contexts_.resize(places.size());
+  for (size_t i = 0; i < places.size(); i++) {
+    if (platform::is_cpu_place(places[i])) {
+      device_contexts_[i] = new platform::CPUDeviceContext(
+          boost::get<platform::CPUPlace>(places[i]));
+    } else if (platform::is_gpu_place(places[i])) {
+#ifdef PADDLE_WITH_CUDA
+      device_contexts_[i] = new platform::CUDADeviceContext(
+          boost::get<platform::GPUPlace>(places[i]));
+#else
+      PADDLE_THROW(
+          "'GPUPlace' is not supported, Please re-compile with WITH_GPU "
+          "option");
+#endif
+    }  // a place that is neither CPU nor GPU leaves its slot as nullptr
+  }
+}
+// Deletes every device context stored in device_contexts_.
+Executor::~Executor() {
+  for (auto& device_context : device_contexts_) {
+    delete device_context;
+  }
+}
+// Executes block `block_id` of `pdesc` against `scope` on the first device.
+void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id) {
+  // TODO(tonyyang-svail):
+  //    - only runs on the first device (i.e. no interdevice communication)
+  //    - will change to use multiple blocks for RNN op and Cond Op
+  // Reject negative ids too: the upper-bound check alone lets them through.
+  PADDLE_ENFORCE(block_id >= 0, "block_id must not be negative");
+  PADDLE_ENFORCE_GT(pdesc.blocks_size(), block_id);
+  auto& block = pdesc.blocks(block_id);
+  auto& device = device_contexts_[0];
+  // Persistable variables go to `scope`; all others to a temporary child scope.
+  Scope& local_scope = scope->NewScope();
+  for (auto& var : block.vars()) {
+    if (var.persistable()) {
+      auto* ptr = scope->Var(var.name());
+      VLOG(3) << "Create Variable " << var.name()
+              << " global, which pointer is " << ptr;
+    } else {
+      auto* ptr = local_scope.Var(var.name());
+      VLOG(3) << "Create Variable " << var.name()
+              << " locally, which pointer is " << ptr;
+    }
+  }
+
+  for (auto& op_desc : block.ops()) {
+    auto op = paddle::framework::OpRegistry::CreateOp(
+        op_desc, const_cast<ProgramDesc*>(&pdesc));
+    op->Run(local_scope, *device);
+  }
+  scope->DeleteScope(&local_scope);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/executor.h b/paddle/framework/executor.h
new file mode 100644
index 0000000000000000000000000000000000000000..793ee954e25f7da6c9d04ea6acc2ad78812e8329
--- /dev/null
+++ b/paddle/framework/executor.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/framework.pb.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+class Executor {
+ public:
+  explicit Executor(const std::vector<platform::Place>& places);
+  ~Executor();
+
+  /* @Brief
+   * Runtime evaluation of one block of the given ProgramDesc under a Scope.
+   * @param
+   *  ProgramDesc  program containing the block
+   *  Scope        parent scope for the run
+   *  int          index of the block to run
+   */
+  void Run(const ProgramDesc&, Scope*, int);
+
+ private:
+  std::vector<platform::DeviceContext*> device_contexts_;  // raw pointers
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/feed_fetch_method.h b/paddle/framework/feed_fetch_method.h
new file mode 100644
index 0000000000000000000000000000000000000000..7feacb1e24708411e7fbb610f9909447cba9e291
--- /dev/null
+++ b/paddle/framework/feed_fetch_method.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/variable.h"
+
+namespace paddle {
+namespace framework {
+
+// Stores `input` at slot `index` of the feed list held by variable `var_name`,
+// growing the list when `index` is past its end. Declared inline because the
+// definition lives in a header that more than one source file may include.
+inline void SetFeedVariable(Scope* scope, const LoDTensor& input,
+                            const std::string& var_name, size_t index) {
+  VLOG(3) << "SetFeedVariable name=" << var_name << " index=" << index;
+  // If var_name is not found in the scope, a new variable will be created.
+  Variable* g_feed_value = scope->Var(var_name);
+  auto& feed_inputs = *(g_feed_value->GetMutable<FeedFetchList>());
+  if (index >= feed_inputs.size()) {
+    feed_inputs.resize(index + 1);
+  }
+  // Share data with the input tensor, then take over its LoD.
+  feed_inputs[index].ShareDataWith(input);
+  feed_inputs[index].set_lod(input.lod());
+}
+// Returns element `index` of the FeedFetchList held by variable `var_name`.
+inline LoDTensor& GetFetchVariable(const Scope& scope,
+                                   const std::string& var_name, size_t index) {
+  // The variable must already exist; validate it and `index` before use.
+  Variable* g_fetch_value = scope.FindVar(var_name);
+  PADDLE_ENFORCE(g_fetch_value != nullptr, "Variable %s not found", var_name);
+  PADDLE_ENFORCE(g_fetch_value->IsType<FeedFetchList>(),
+                 "Only %s can be invoked by GetFetchVariable",
+                 typeid(FeedFetchList).name());
+  auto& fetch_outputs = *g_fetch_value->GetMutable<FeedFetchList>();
+  PADDLE_ENFORCE_LT(index, fetch_outputs.size());
+  auto& tensor = fetch_outputs[index];
+  VLOG(3) << "Fetch " << var_name << " with index " << index
+          << " shape= " << tensor.dims();
+  return tensor;
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/feed_fetch_type.h b/paddle/framework/feed_fetch_type.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc4ae440fc708f696c18bb9d5ab3ba7dd59e21ab
--- /dev/null
+++ b/paddle/framework/feed_fetch_type.h
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+using FeedFetchType = LoDTensor;                   // element type of the list
+using FeedFetchList = std::vector<FeedFetchType>;  // list indexed by slot
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 951c7afbc14e2d9119169c1351d38ff0b67bdc5b..8f2df3dc0e29f96b3aea58b6761d1ccb4cd7c624 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
 package paddle.framework;
 
 enum AttrType {
@@ -54,6 +55,7 @@ message OpDesc {
   repeated Var inputs = 1;
   repeated Var outputs = 2;
   repeated Attr attrs = 4;
+  optional bool is_target = 5 [ default = false ];
 };
 
 // OpProto describes a C++ framework::OperatorBase derived class.
@@ -66,7 +68,7 @@ message OpProto {
 
     optional bool duplicable = 3 [ default = false ];
     optional bool intermediate = 4 [ default = false ];
-    optional bool not_in_gradient = 5 [ default = false ];
+    optional bool dispensable = 5 [ default = false ];
   }
 
   // AttrProto describes the C++ type Attribute.
@@ -97,15 +99,29 @@ enum DataType {
   FP64 = 6;
 }
 
-message LoDTensorDesc {
+message TensorDesc {
   required DataType data_type = 1;
   repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
-  optional int32 lod_level = 3 [ default = 0 ];
+}
+
+message LoDTensorDesc {
+  required TensorDesc tensor = 1;
+  optional int32 lod_level = 2 [ default = 0 ];
 }
 
 message VarDesc {
+  enum VarType {
+    LOD_TENSOR = 1;
+    SELECTED_ROWS = 2;
+    FEED_MINIBATCH = 3;
+    FETCH_LIST = 4;
+    STEP_SCOPES = 5;
+  }
   required string name = 1;
-  optional LoDTensorDesc lod_tensor = 2;
+  required VarType type = 2;
+  optional LoDTensorDesc lod_tensor = 3;
+  optional TensorDesc selected_rows = 4;
+  optional bool persistable = 5 [ default = false ];
 }
 
 message BlockDesc {
@@ -115,4 +131,7 @@ message BlockDesc {
   repeated OpDesc ops = 4;
 }
 
+// Please refer to
+// https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/program.md
+// for more details.
 message ProgramDesc { repeated BlockDesc blocks = 1; }
diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc
deleted file mode 100644
index b02a599a800668b22e7fe39a10fa6dc132e305bd..0000000000000000000000000000000000000000
--- a/paddle/framework/grad_op_builder.cc
+++ /dev/null
@@ -1,58 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOpArgType::OUT WARRANTIES OR CONDITIONS OF ANY KOpArgType::IND, either
-express or implied. See the License for the specific language governing
-permissions and limitations under the License. */
-
-#include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace framework {
-enum class OpArgType { IN, OUT };
-
-static void TransOpArg(const OperatorBase* src_op, const OpArgType& src_type,
-                       bool is_grad, VariableNameMap* vars) {
-  const auto& src_inout =
-      src_type == OpArgType::IN ? src_op->Inputs() : src_op->Outputs();
-  auto& dst_inout = *vars;
-  auto& proto = OpInfoMap::Instance().Get(src_op->Type()).Proto();
-  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
-  for (const auto& arg : src_arg_list) {
-    if (arg.not_in_gradient() && !is_grad) continue;
-    const std::string src_name = arg.name();
-    std::string dst_name = is_grad ? GradVarName(src_name) : src_name;
-    dst_inout[dst_name].reserve(src_inout.at(src_name).size());
-    for (auto& var_name : src_inout.at(src_name)) {
-      std::string s = is_grad ? GradVarName(var_name) : var_name;
-      dst_inout[dst_name].emplace_back(s);
-    }
-  }
-}
-
-OperatorBase* BuildGradOp(const OperatorBase* op) {
-  auto& info = OpInfoMap::Instance().Get(op->Type());
-  PADDLE_ENFORCE(info.HasGradientOp());
-
-  VariableNameMap inputs;
-  VariableNameMap outputs;
-  TransOpArg(op, OpArgType::IN, false, &inputs);   // I
-  TransOpArg(op, OpArgType::OUT, false, &inputs);  // O
-  TransOpArg(op, OpArgType::OUT, true, &inputs);   // OG
-  TransOpArg(op, OpArgType::IN, true, &outputs);   // IG
-
-  auto& grad_info = OpInfoMap::Instance().Get(info.grad_op_type_);
-  return grad_info.Creator()(info.grad_op_type_, inputs, outputs, op->Attrs());
-}
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
deleted file mode 100644
index 9e3ca563c6765637f8471d142d32cec447f0b977..0000000000000000000000000000000000000000
--- a/paddle/framework/grad_op_builder_test.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-#include "paddle/framework/grad_op_builder.h"
-#include <gtest/gtest.h>
-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-
-USE_OP(add);
-
-namespace paddle {
-namespace framework {
-
-class MutiInOutOpMaker : public OpProtoAndCheckerMaker {
- public:
-  MutiInOutOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable();
-    AddInput("In3", "another single input");
-    AddOutput("Out1", "a single output");
-    AddOutput("Out2_mult", "a multiple output").AsDuplicable();
-    AddComment("test op with multiple inputs and outputs");
-  }
-};
-
-class IOIgnoredOpMaker : public OpProtoAndCheckerMaker {
- public:
-  IOIgnoredOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("In1", "a single input");
-    AddInput("In2_mult", "a multiple input").AsDuplicable().NotInGradient();
-    AddInput("In3_mult", "another multiple input").AsDuplicable();
-    AddOutput("Out1_mult", "a multiple output").AsDuplicable();
-    AddOutput("Out2", "a single output").NotInGradient();
-    AddComment("op with inputs and outputs ignored in gradient calculating");
-  }
-};
-
-}  // namespace framework
-}  // namespace paddle
-
-namespace f = paddle::framework;
-
-TEST(GradOpBuilder, AddTwo) {
-  std::shared_ptr<f::OperatorBase> add_op(f::OpRegistry::CreateOp(
-      "add", {{"X", {"x"}}, {"Y", {"y"}}}, {{"Out", {"out"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_add_op =
-      f::OpRegistry::CreateGradOp(*add_op);
-  EXPECT_EQ(grad_add_op->Inputs().size(), 4UL);
-  EXPECT_EQ(grad_add_op->Outputs().size(), 2UL);
-  EXPECT_EQ(grad_add_op->Input("X"), "x");
-  EXPECT_EQ(grad_add_op->Input("Y"), "y");
-  EXPECT_EQ(grad_add_op->Input("Out"), "out");
-  EXPECT_EQ(grad_add_op->Input(f::GradVarName("Out")), f::GradVarName("out"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("X")), f::GradVarName("x"));
-  EXPECT_EQ(grad_add_op->Output(f::GradVarName("Y")), f::GradVarName("y"));
-}
-
-REGISTER_OP(mult_io, f::NOP, f::MutiInOutOpMaker, mult_io_grad, f::NOP);
-REGISTER_OP(io_ignored, f::NOP, f::IOIgnoredOpMaker, io_ignored_grad, f::NOP);
-
-TEST(GradOpBuilder, MutiInOut) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "mult_io", {{"In1", {"in1"}},
-                  {"In2_mult", {"in2_1", "in2_2", "in2_3"}},
-                  {"In3", {"in3"}}},
-      {{"Out1", {"out1"}}, {"Out2_mult", {"out2_1", "out2_2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-
-  ASSERT_EQ(grad_test_op->Inputs().size(), 3UL + 2UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In2_mult"),
-            std::vector<std::string>({"in2_1", "in2_2", "in2_3"}));
-  EXPECT_EQ(grad_test_op->Input("In3"), "in3");
-  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
-            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
-            f::GradVarName("out1"));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));
-
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>({f::GradVarName("in2_1"),
-                                      f::GradVarName("in2_2"),
-                                      f::GradVarName("in2_3")}));
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
-}
-
-TEST(GradOpBuilder, IOIgnoredInGradient) {
-  std::shared_ptr<f::OperatorBase> test_op(f::OpRegistry::CreateOp(
-      "io_ignored", {{"In1", {"in1"}},
-                     {"In2_mult", {"in2_1", "in2_2"}},
-                     {"In3_mult", {"in3_1", "in3_2"}}},
-      {{"Out1_mult", {"out1_1", "out1_2"}}, {"Out2", {"out2"}}}, {}));
-  std::shared_ptr<f::OperatorBase> grad_test_op =
-      f::OpRegistry::CreateGradOp(*test_op);
-
-  // 'In2' and 'Out2' are ignored in gradient calculating
-  ASSERT_EQ(grad_test_op->Inputs().size(), 2UL + 1UL + 2UL);
-  EXPECT_EQ(grad_test_op->Input("In1"), "in1");
-  EXPECT_EQ(grad_test_op->Inputs("In3_mult"),
-            std::vector<std::string>({"in3_1", "in3_2"}));
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
-            std::vector<std::string>({"out1_1", "out1_2"}));
-  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
-  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
-            f::GradVarName("out2"));
-
-  ASSERT_EQ(grad_test_op->Outputs().size(), 3UL);
-  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
-  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
-            std::vector<std::string>(
-                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
-}
diff --git a/paddle/framework/grad_op_desc_maker.h b/paddle/framework/grad_op_desc_maker.h
new file mode 100644
index 0000000000000000000000000000000000000000..94944c79b64d38e799df436de874cabc3661e30a
--- /dev/null
+++ b/paddle/framework/grad_op_desc_maker.h
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <string>
+#include <unordered_set>
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+class GradOpDescMakerBase {
+ public:
+  explicit GradOpDescMakerBase(
+      const OpDescBind& fwd_op,
+      const std::unordered_set<std::string>& no_grad_set,
+      std::unordered_map<std::string, std::string>* grad_to_var)
+      : fwd_op_(fwd_op), no_grad_set_(no_grad_set), grad_to_var_(grad_to_var) {}
+
+  virtual ~GradOpDescMakerBase() = default;
+  virtual std::vector<std::unique_ptr<OpDescBind>> operator()() const = 0;
+
+ protected:
+  std::vector<std::string> InputGrad(const std::string& name,
+                                     bool drop_empty_grad = true) const {
+    std::vector<std::string> ret_val;
+    auto var_names = this->Input(name);
+    ret_val.reserve(var_names.size());
+    std::transform(var_names.begin(), var_names.end(),
+                   std::back_inserter(ret_val),
+                   [this](const std::string& fwd_var_name) -> std::string {
+                     auto g_name = GradVarName(fwd_var_name);
+                     if (no_grad_set_.count(g_name)) {
+                       return kEmptyVarName;
+                     } else {
+                       (*this->grad_to_var_)[g_name] = fwd_var_name;
+                       return g_name;
+                     }
+                   });
+    if (!drop_empty_grad) {
+      return ret_val;
+    }
+    std::vector<std::string> dropped_ret_val;
+    dropped_ret_val.reserve(ret_val.size());
+    std::copy_if(ret_val.begin(), ret_val.end(),
+                 std::back_inserter(dropped_ret_val),
+                 [](const std::string& str) { return str != kEmptyVarName; });
+    return dropped_ret_val;
+  }
+
+  std::vector<std::string> OutputGrad(const std::string& name) const {
+    std::vector<std::string> ret_val;
+    auto onames = this->Output(name);
+    ret_val.reserve(onames.size());
+    std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
+                   GradVarName);
+    return ret_val;
+  }
+
+  std::vector<std::string> InputNames() const {
+    return this->fwd_op_.InputNames();
+  }
+
+  std::vector<std::string> OutputNames() const {
+    return this->fwd_op_.OutputNames();
+  }
+
+  std::vector<std::string> Input(const std::string& name) const {
+    return fwd_op_.Input(name);
+  }
+
+  std::vector<std::string> Output(const std::string& name) const {
+    return fwd_op_.Output(name);
+  }
+
+  const std::unordered_map<std::string, Attribute>& Attrs() const {
+    return fwd_op_.GetAttrMap();
+  }
+
+  const Attribute& GetAttr(const std::string& name) const {
+    auto& map = fwd_op_.GetAttrMap();
+    auto it = map.find(name);
+    PADDLE_ENFORCE(it != map.end(), "Cannot find attribute %s", name);
+    return it->second;
+  }
+
+  std::string ForwardOpType() const { return this->fwd_op_.Type(); }
+
+ private:
+  const OpDescBind& fwd_op_;
+  const std::unordered_set<std::string>& no_grad_set_;
+  std::unordered_map<std::string, std::string>* grad_to_var_;
+};
+
+class SingleGradOpDescMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const {
+    std::vector<std::unique_ptr<OpDescBind>> retv;
+    retv.emplace_back(this->Apply());
+    return retv;
+  }
+
+ protected:
+  virtual std::unique_ptr<OpDescBind> Apply() const = 0;
+};
+
+template <bool DropEmptyIG = true>
+class DefaultGradOpDescMaker : public SingleGradOpDescMaker {
+ public:
+  using SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+ protected:
+  virtual std::unique_ptr<OpDescBind> Apply() const {
+    auto* grad = new OpDescBind();
+    grad->SetType(this->GradOpType());
+
+    for (auto& input_param : this->InputNames()) {
+      grad->SetInput(input_param, this->Input(input_param));
+      grad->SetOutput(GradVarName(input_param),
+                      this->InputGrad(input_param, DropEmptyIG));
+    }
+
+    for (auto& output_param : this->OutputNames()) {
+      grad->SetInput(output_param, this->Output(output_param));
+      grad->SetInput(GradVarName(output_param), this->OutputGrad(output_param));
+    }
+
+    grad->SetAttrMap(this->Attrs());
+
+    return std::unique_ptr<OpDescBind>(grad);
+  }
+
+  virtual std::string GradOpType() const {
+    return this->ForwardOpType() + "_grad";
+  }
+};
+
+class EmptyGradOpMaker : public GradOpDescMakerBase {
+ public:
+  using GradOpDescMakerBase::GradOpDescMakerBase;
+  std::vector<std::unique_ptr<OpDescBind>> operator()() const override {
+    return {};
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 5b7badf89c1714331bae9fc8cf94c8da2c66dbad..f53dd1c1858b45d39692eb683bc1dd9ee75b88fb 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,15 @@
    limitations under the License. */
 
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/saver.pb.h"
+
+#include "paddle/memory/memcpy.h"
+#include "paddle/memory/memory.h"
+
+#include <stdint.h>
+#include <string.h>
+#include <algorithm>
+#include <iterator>
 
 #include <glog/logging.h>
 
@@ -25,31 +34,50 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
   for (size_t i = level_begin; i < level_end; i++) {
     new_lod.emplace_back(in.at(i));
   }
+  // transform the lowest level to absolute offset.
+  LoD abs_offset_lod = ToAbsOffset(in);
+  new_lod.back() = abs_offset_lod[level_end - 1];
   return new_lod;
 }
 
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end) {
-  // slice the lod.
-  LoD new_lod;
-  new_lod.reserve(in.size() - level);
-  auto start = in.at(level)[elem_begin];
-  auto end = in.at(level)[elem_end];
-
-  for (auto it = in.begin() + level; it != in.end(); it++) {
-    auto it_begin = std::find(it->begin(), it->end(), start);
-    auto it_end = std::find(it_begin, it->end(), end);
-    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
-    PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
-    new_lod.emplace_back(it_begin, it_end + 1);
-    // reset offset if tensor is copyed and sliced.
-    std::transform(new_lod.back().begin(), new_lod.back().end(),
-                   new_lod.back().begin(),
-                   [start](int v) { return v - start; });
-    PADDLE_ENFORCE_EQ(new_lod.back().front(), 0, "error in slice LoD");
+  PADDLE_ENFORCE_LT(level, in.size());
+  PADDLE_ENFORCE_LT(elem_end, in[level].size());
+
+  LoD res;
+  res.resize(in.size() - level);
+  // copy the first level
+  res[0].assign(in[level].begin() + elem_begin,
+                in[level].begin() + elem_end + 1);
+  for (size_t lvl = 1; lvl < res.size(); lvl++) {
+    const auto& in_level = in[level + lvl];
+    const auto& above_level = res[lvl - 1];
+    auto& out_level = res[lvl];
+    out_level.assign(in_level.begin() + above_level.front(),
+                     in_level.begin() + above_level.back() + 1);
   }
-  PADDLE_ENFORCE_LE(new_lod.size(), in.size());
-  return new_lod;
+  for (size_t lvl = 0; lvl < res.size(); lvl++) {
+    // shift the level so that its first offset equals 0 by subtracting the
+    // first element from every element
+    size_t front = res[lvl].front();
+    for (auto& ele : res[lvl]) {
+      ele -= front;
+    }
+  }
+  return res;
+}
+
+LoD ToAbsOffset(const LoD& in) {
+  // the lowest level already stores absolute offsets
+  if (in.empty() || in.size() == 1) return in;
+  LoD result = in;
+  for (int level = result.size() - 2; level >= 0; level--) {
+    for (auto& ele : result[level]) {
+      ele = result[level + 1][ele];
+    }
+  }
+  return result;
 }
 
 bool operator==(const LoD& a, const LoD& b) {
@@ -75,17 +103,7 @@ bool operator==(const LoD& a, const LoD& b) {
 size_t LoDTensor::NumElements(size_t level, size_t idx) const {
   PADDLE_ENFORCE_LT(level, NumLevels());
   PADDLE_ENFORCE_LT(idx, NumElements(level));
-  // the last level of LoD, just return number of records in Tensor
-  if (level == NumLevels() - 1) {
-    return lod_[level][idx + 1] - lod_[level][idx];
-  }
-  // high level of LoD, and there is another lower level, return number of
-  // lower-level elements
-  auto tmp = SliceInLevel(lod_, level, idx, idx + 1);
-  PADDLE_ENFORCE_GE(tmp.size(), 2);
-  // there is a 0 as a placeholder stored in LoD, so the number of elements
-  // equals lod.size() - 1
-  return tmp[1].size() - 1;
+  return lod_[level][idx + 1] - lod_[level][idx];
 }
 
 void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
@@ -103,5 +121,140 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
   lod_ = new_lod;
 }
 
+std::string LoDTensor::SerializeToString() const {
+  LoDTensorProto desc;
+
+  // set data_type
+  if (this->type() == typeid(int8_t)) desc.set_data_type(DataType::BOOL);
+  if (this->type() == typeid(int16_t)) desc.set_data_type(DataType::INT16);
+  if (this->type() == typeid(int32_t)) desc.set_data_type(DataType::INT32);
+  if (this->type() == typeid(int64_t)) desc.set_data_type(DataType::INT64);
+  // FIXME(dzh): there is no fp16 in standard c++
+
+  if (this->type() == typeid(float))  // NOLINT
+    desc.set_data_type(DataType::FP32);
+  if (this->type() == typeid(double))  // NOLINT
+    desc.set_data_type(DataType::FP64);
+
+  for (int i = 0; i < dims().size(); ++i) {
+    desc.add_dims(dims()[i]);
+  }
+
+  // set lod information
+  desc.set_lod_level(this->NumLevels());
+  for (size_t i = 0; i < this->NumLevels(); ++i) {
+    LoDInfo* lod = desc.add_levels();
+    for (size_t j = 0; j < lod_[i].size(); ++j) {
+      lod->add_level(lod_[i][j]);
+    }
+  }
+
+  desc.set_version(0);
+
+  std::string desc_bytes = desc.SerializeAsString();
+
+  // FIXME(dzh) : implement fix chunk size buffer.
+  size_t DESC_SIZE = desc_bytes.size();
+  size_t DATA_SIZE = holder_->size() - offset_;
+
+  const size_t BUFFER_SIZE = DESC_SIZE + DATA_SIZE + 2 * sizeof(size_t);
+  char* buffer =
+      static_cast<char*>(memory::Alloc(platform::CPUPlace(), BUFFER_SIZE));
+
+  // format: desc_size data_size, desc_bytes, data_bytes.
+  platform::CPUPlace src_place;
+  platform::CPUPlace dst_place;
+
+  memory::Copy(dst_place, buffer, src_place, &BUFFER_SIZE, sizeof(size_t));
+  memory::Copy(dst_place, buffer + sizeof(size_t), src_place, &DESC_SIZE,
+               sizeof(size_t));
+  memory::Copy(dst_place, buffer + sizeof(size_t) * 2, src_place,
+               desc_bytes.c_str(), desc_bytes.size());
+
+  PADDLE_ENFORCE(this->numel() != 0, "Serialize a empty Tensor!");
+
+  platform::Place place = holder_->place();
+  int element_width = holder_->size() / this->numel();
+
+  if (platform::is_cpu_place(place)) {
+    memory::Copy(dst_place, buffer + sizeof(size_t) * 2 + desc_bytes.size(),
+                 boost::get<platform::CPUPlace>(place),
+                 static_cast<char*>(holder_->ptr()) + offset_ / element_width,
+                 DATA_SIZE);
+  }
+#ifdef PADDLE_WITH_GPU
+  if (platform::is_gpu_place(place)) {
+    memory::Copy(dst_place, buffer + sizeof(size_t) * 2 + desc_bytes.size(),
+                 boost::get<platform::GPUPlace>(place),
+                 static_cast<char*>(holder_->ptr()) + offset_ / element_width,
+                 DATA_SIZE);
+  }
+#endif
+
+  std::string ret(buffer, BUFFER_SIZE);
+  memory::Free(platform::CPUPlace(), buffer);
+  return ret;
+}
+
+void LoDTensor::DeserializeFromString(const std::string& s,
+                                      const platform::Place& dst_place) {
+  size_t DESC_SIZE, BUFFER_SIZE;
+  platform::CPUPlace src_place;
+
+  memory::Copy(src_place, &BUFFER_SIZE, src_place, s.c_str(), sizeof(size_t));
+  memory::Copy(src_place, &DESC_SIZE, src_place, s.c_str() + sizeof(size_t),
+               sizeof(size_t));
+
+  const size_t DATA_SIZE = BUFFER_SIZE - DESC_SIZE - sizeof(size_t) * 2;
+
+  // parse LoDTensorDesc
+  LoDTensorProto desc;
+  desc.ParseFromArray(s.c_str() + sizeof(size_t) * 2, DESC_SIZE);
+
+  std::vector<int64_t> dims;
+  std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+  this->Resize(make_ddim(dims));
+
+  // parse data type
+  void* ptr = nullptr;
+  if (desc.data_type() == DataType::BOOL)
+    ptr = this->mutable_data<bool>(dst_place);
+  if (desc.data_type() == DataType::INT16)
+    ptr = this->mutable_data<int16_t>(dst_place);
+  if (desc.data_type() == DataType::INT32)
+    ptr = this->mutable_data<int32_t>(dst_place);
+  if (desc.data_type() == DataType::INT64)
+    ptr = this->mutable_data<int64_t>(dst_place);
+  // FIXME(dzh): there is no fp16 in standard c++
+
+  if (desc.data_type() == DataType::FP32)
+    ptr = this->mutable_data<float>(dst_place);
+  if (desc.data_type() == DataType::FP64)
+    ptr = this->mutable_data<double>(dst_place);
+
+  LoD lod;
+  std::vector<size_t> levels;
+  for (int i = 0; i < desc.levels().size(); ++i) {
+    auto current_level = desc.levels()[i].level();
+    std::copy(current_level.begin(), current_level.end(),
+              std::back_inserter(levels));
+    lod.emplace_back(levels);
+    levels.clear();
+  }
+
+  this->set_lod(lod);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), ptr, src_place,
+                 s.c_str() + sizeof(size_t) * 2 + DESC_SIZE, DATA_SIZE);
+  }
+#ifdef PADDLE_WITH_GPU
+  if (platform::is_gpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), ptr, src_place,
+                 s.c_str() + sizeof(size_t) * 2 + DESC_SIZE, DATA_SIZE);
+  }
+#endif
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 49786a4a6635f1b39356dbf9633c4e7da443f04e..f78a751c53621aa103026b5d8a251966685822bb 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -15,7 +15,7 @@
 #pragma once
 
 #include <memory>
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/system/cuda/experimental/pinned_allocator.h>
@@ -25,11 +25,12 @@
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
 
 namespace paddle {
 namespace framework {
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 template <typename T>
 using Vector = std::vector<T>;
 #else
@@ -39,23 +40,36 @@ using Vector = thrust::host_vector<
 #endif
 
 /*
- * 3-level LoD stores
+ * LoD is short for Level of Details.
  *
- * 0 10 20
- * 0 5 10 15 20
- * 0 2 5 7 10 12 15 20
- *
- * - in a level, each element indicates offset in the underlying Tensor
+ * - in a level, each element indicates relative offset of the lower level
  * - the first element should be 0 and that indicates that this sequence start
  * from 0
  * - each sequence's begin and end(no-inclusive) is level[id, id+1]
+ *
+ * For example:
+ *    3-level LoD stores
+ *
+ *    0 2 3
+ *    0 2 4 7
+ *    0 2 5 7 10 12 15 20
  */
 using LoD = std::vector<Vector<size_t>>;
 
+/*
+ * Slice levels from a LoD.
+ * NOTE the lowest level should always be the absolute offsets of the underlying
+ * tensor instances. So if higher layers are sliced without the lowest level,
+ * the lower level of the sliced LoD will be transformed to the absolute offset.
+ */
 LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
 
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end);
+/*
+ * Transform an LoD from relative offsets to absolute offsets.
+ */
+LoD ToAbsOffset(const LoD& in);
 
 bool operator==(const LoD& a, const LoD& b);
 
@@ -74,12 +88,12 @@ class LoDTensor : public Tensor {
   LoD lod() const { return lod_; }
 
   /*
-   * Get a element from LoD.
+   * Get the start offset and end offset of an element from LoD.
    */
-  size_t lod_element(size_t level, size_t elem) const {
+  std::pair<size_t, size_t> lod_element(size_t level, size_t elem) const {
     PADDLE_ENFORCE_LT(level, NumLevels());
     PADDLE_ENFORCE_LT(elem, NumElements(level));
-    return (lod_)[level][elem];
+    return std::make_pair((lod_)[level][elem], (lod_)[level][elem + 1]);
   }
 
   /*
@@ -119,6 +133,27 @@ class LoDTensor : public Tensor {
    */
   void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
 
+  /**
+   *  @brief Serialize tensor to char bytes.
+   *  Please check model_format.md for the format detail.
+   *  NOTE: GPUTensor will copy data to cpu implicitly.
+   *  @return return string
+   */
+
+  // FIXME(dzh) : Currently, this interface should only be used in
+  // save/restore model and checkpoint. ParameterServer do not use shape
+  // information to do the optimization, as a result, when we serialize
+  // parameter/gradient to string, we should serialize the tensor
+  // to string in the ps trainer instead of LoDTensor.
+  std::string SerializeToString() const;
+
+  /**
+   *  @brief Deserialize char bytes to tensor.
+   *  @return nothing; this tensor's dims, LoD and data are overwritten.
+   */
+  void DeserializeFromString(const std::string& s,
+                             const platform::Place& dst_place);
+
  private:
   LoD lod_;
 };
diff --git a/paddle/framework/lod_tensor.md b/paddle/framework/lod_tensor.md
index 07bbdf9416c432052b3222757a61ac4bfd70fe14..d147f1c4257eec14664301edab8d1fe2f128d2b0 100644
--- a/paddle/framework/lod_tensor.md
+++ b/paddle/framework/lod_tensor.md
@@ -1,147 +1,175 @@
 # Design Doc: LoD (Level-of-Detail) Tensor
 
-PaddlePaddle's RNN doesn't require that all instances have the same length.  To do so, we introduce an extension to Tensor, namely, LoD Tensor.
+Like other deep learning systems, PaddlePaddle supports training models from sequence data.  Also, like other systems, PaddlePaddle represents a mini-batch of sequences as a Tensor.  What is different is that PaddlePaddle doesn't require all sequences in a mini-batch to be of the same length, so there is no need to pad sequences with zeros.
 
-## Challenge of Variable-length Inputs
+|                       | TensorFlow | PaddlePaddle |
+|-----------------------|------------|--------------|
+| RNN                   | Support    | Support      |
+| recursive RNN         | Support    | Support      |
+| padding zeros         | Must       | No need      |
+| blob data type        | Tensor     | LoDTensor    |
 
-People usually represent a mini-batch by a Tensor. For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  So a transformation, T, of all images can be a matrix multiplication of the 10xOx32-dimensional tensor T and the 10x32x32 Tensor.
+PaddlePaddle achieves this flexibility by passing a new data type between operators: the *LoD Tensor*, which is a Tensor with an attached segmentation index known as *LoD*.  The LoD index not only segments a tensor into sequences, but also recursively segments those sequences into sub-sequences.  This document presents the design of LoD and LoDTensor.
 
-Another example is that each mini-batch contains 32 sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by a 32xLxD tensor.  However, in most cases, sentences have variable lengths, and we will need an index data structure to record these variable lengths.
 
-## LoD as a Solution
+## The Challenge: Variable-length Sequences
 
-### Mini-Batch of variable-length sentences
+Most deep learning systems represent a mini-batch as a Tensor.  For example, a mini-batch of 10 images, each of size 32x32, is a 10x32x32 Tensor.  Another example is a mini-batch of N sentences, where each word is a D-dimensional one-hot vector.  If all sentences have the same length L, we can represent this mini-batch by an NxLxD tensor.
 
-Let's imagine a mini-batch of 3 variable lengths sentences, containing 3, 1, and 2 words respectively.  We can represent it by a (3+1+2)xD tensor plus some index information:
+Both examples show that the elements of sequences are usually of the same size.  In the first example, all images are 32x32, and in the second one, all words are D-dimensional vectors.  It doesn't make sense to allow variable-sized images, as that would require transformations like convolution to handle variable-sized Tensors.
+
+The real challenge is that in most cases, sentences have variable lengths, and we will need an index data structure to segment the tensor into sequences.  Also, sequences might consist of sub-sequences.
+
+
+## A Solution: The LoD Index
+
+To understand our solution, it is best to look at some examples.
+
+### A Mini-Batch of Sentences
+
+Let's imagine a mini-batch of 3 variable-length sentences composed of 3, 1, and 2 words, respectively.  We can represent the mini-batch by a (3+1+2)xD tensor plus some index information:
 
 ```
-   3
 3   1 2
 ||| | ||
 ```
 
-Each `|` represents a D-dimensional word vectors.  The number 3 on top indicate 3 sentences, and numbers 3, 1, and 2 on the second level represent the number of words in each sentence.
+where each `|` represents a D-dimensional word vector.  The numbers, 3, 1, and 2, form a 1-level LoD.
+
+### Recursive Sequences
+
+Let us check another example, a 2-level LoD Tensor.  Consider a mini-batch of three articles with 3, 1, and 2 sentences, where each sentence consists of a variable number of words:
+
+```
+3           1  2
+3   2  4    1  2  3
+||| || |||| |  || |||
+```
 
-### Mini-Batch of variable-length videos
+### A Mini-Batch of Videos
 
-This approach generalizes to the case where elements are not words, but higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  If a mini-batch contains 3 videos of 3, 1, and 2 frames respectively.  The underlying tensor is of size (3+1+2)x640x480.  The index information illustrates as:
+LoD tensors generalize to the case where elements are higher dimensional objects, like images.  Suppose that a mini-batch contains videos of the same frame size 640x480.  Here is a mini-batch of 3 videos with 3, 1, and 2 frames, respectively.
 
 ```
-     3
 3     1  2
 口口口 口 口口
 ```
 
-where each `口` represents an image.
+The underlying tensor is of size (3+1+2)x640x480, and each `口` represents a 640x480 image.
 
-### Mini-Batch of fixed-size images
+### A Mini-Batch of Images
 
-Let's get back to a typical example, image classification, where each mini-batch has M fixed-sized images.  The LoD Tensor representation is
+In traditional cases, such as a mini-batch of N fixed-size images, the LoD Tensor representation is
 
 ```
-     M
 1 1 1 1     1
 口口口口 ... 口
 ```
 
-The many 1's on the second level seem duplicated.  For this particular case of 2 levels and the second level always have length 1, we can ignore the LoD index.
-
-### Design and summarization
+In this case, we don't lose any information by ignoring the many 1's in the index and simply considering this LoD Tensor as a usual Tensor:
 
-In summary, as long as that the essential elements (words  or images) have the same size, we can represent mini-batches by a LoD Tensor:
+```
+口口口口 ... 口
+```
 
-- The underlying tensor has size LxD1xD2x..., where D1xD2... is the size of the essential elements, and
-- The first dimension size L has an additonal property -- a LoD index as a nested vector:
+### Model Parameters
 
-  ```c++
-  typedef std::vector<std::<vector>> LoD;
-  ```
+A model parameter is just a usual Tensor, which, just like the above example, is a **0-level LoD Tensor**.
 
-- The LoD index is not necessary when there are only two levels and all elements of the second level have length 1.
 
-## Slicing of LoD Tensor
+## The LoD Tensor
 
-Consider that we have a network with three levels of RNN: the top level one handles articles, the second level one handles sentences, and the basic level one handles words.  This network requires that mini-batches represented by 3 level LoD Tensor, for example,
+Let us revisit the above example of the 2-level LoD Tensor
 
 ```
-         3
 3           1  2
 3   2  4    1  2  3
 ||| || |||| |  || |||
 ```
 
-To allow each level of RNN to handle its input, we define **the slicing of a LoD Tensor is defined as getting the j-th sequence on level i, or the <i,j>-slice**
+It is indeed a tree, where leaves are elementary sequences identified by **branches**.
+
+For example, the third sentence in the above example is identified by branch <0,2>, where 0 indicates the first article, which has 3 sentences, and 2 indicates the third sentence in this article, which has 4 words.
+
+### The LoD Index
 
-For example, the <2,1>-slice of above slice is
+We can save the LoD index in the above example
 
 ```
-2
-||
+3           1  2
+3   2  4    1  2  3
 ```
 
-and the <1,2>-slice of above example is
+in a not-full 2D matrix:
 
+```c++
+typedef std::vector<std::vector<int> > LoD;
 ```
-2
-2  3
-|| |||
-```
 
-Let's go on slicing this slice.  Its <1,1>-slice is
+where
+
+- `LoD.size()` is the number of levels, or the maximum length of branches,
+- `LoD[i][j]` is the length of the j-th segment at the i-th level.
+
+## The Offset Representation
+
+To quickly access elementary sequences, we adopt an offset representation -- instead of saving the lengths, we save the beginning and ending elements of sequences.
+
+In the above example, we accumulate the length of elementary sequences:
 
 ```
-1
-1
-|
+3 2 4 1 2 3
 ```
 
-### The Slicing Algorithm
+into offsets
 
-The algorithm, with over-simplified data structure, is defined as
+```
+0  3  5   9   10  12   15
+   =  =   =   =   =    =
+   3  2+3 4+5 1+9 2+10 3+12
+```
 
-```c++
-typedef std::vector<std::vector<int>> LoD;
+so we know that the first sentence is from word 0 to word 3, and the second sentence is from word 3 to word 5.
 
-struct LoDTensor {
-  LoD lod_;
-  float* tensor_;
-};
+Similarly, the lengths in the top level LoD
 
-LoDTensor Slice(const LoDTensor& lodt, int level, int sequence);
+```
+3 1 2
 ```
 
-Let us revisit the example above
+are transformed into offsets of elements/words as follows:
 
 ```
-         3
-3           1  2
-3   2  4    1  2  3
-||| || |||| |  || |||
+0 9     10  15
+  =     =   =
+  3+2+4 1+9 2+3+10
 ```
 
-Suppose that we want to retrieve the <1,2>-slice
+so we can tell that the first article is from word 0 to word 9, and the second article is from word 9 to word 10.
+
+The complete offset representation is as follows:
 
 ```
-2
-2  3
-|| |||
+0           9 10       15
+0   3  5    9 10  12   15
+ ||| || |||| |  ||  |||
 ```
 
-we will need to find out the starting position of this slice by summing over all leaf nodes in `LoD` to the left of the slice, i.e., 3 + 2 + 4 + 1 = 10.
+## Slicing of LoD Tensors
+
+When we use the above 2-level LoD Tensor as the input to a nested-RNN, we need to retrieve certain sequences.  Here we define the sequence identified by branch <i,j,...> as the **<i,j,...>-slice**.
 
-To avoid the traversal of the LoD tree at slicing time,  we can do it at the construction time -- instead of saving the lengths of the next level in the LoD tree, we can save the starting offset of the next level.  For example, above LoD Tensor can be transformed into
+For example, the <2>-slice of the above example is
 
 ```
-        0
-0           9  10
-0   3  5    9  10 12
-||| || |||| |  || |||
+10      15
+10  12  15
+  || |||
 ```
 
-We don't really need the 0 on top, so the LoD Tensor could be
+and the <2,0>-slice of the above slice is
 
 ```
-0           9  10
-0   3  5    9  10 12
-||| || |||| |  || |||
+10  12
+  ||
 ```
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 44f09f584fb752d7003baa804979f3bb5cd9d651..b984d620717453456fb15620b4d10c4268be8a94 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -17,10 +17,13 @@
 #include <gtest/gtest.h>
 #include <algorithm>
 #include <memory>
+#include <vector>
 
 namespace paddle {
 namespace framework {
 
+const int kLodTensorSize = 20 * 128;
+
 class LoDTensorTester : public ::testing::Test {
  public:
   virtual void SetUp() override {
@@ -30,15 +33,18 @@ class LoDTensorTester : public ::testing::Test {
     // 0 5 10 15 20
     // 0 2 5 7 10 12 15 20
     LoD lod;
-    lod.push_back(std::vector<size_t>{0, 10, 20});
-    lod.push_back(std::vector<size_t>{0, 5, 10, 15, 20});
+    lod.push_back(std::vector<size_t>{0, 2, 3});
+    lod.push_back(std::vector<size_t>{0, 2, 5, 8});
     lod.push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
 
     ASSERT_EQ(lod.size(), 3UL);
 
     lod_tensor_.Resize({20 /*batch size*/, 128 /*dim*/});
     // malloc memory
-    lod_tensor_.mutable_data<float>(place);
+    float* dst_ptr = lod_tensor_.mutable_data<float>(place);
+    for (int i = 0; i < kLodTensorSize; ++i) {
+      dst_ptr[i] = i;
+    }
 
     lod_tensor_.set_lod(lod);
   }
@@ -52,14 +58,14 @@ TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
 
 TEST_F(LoDTensorTester, NumElements) {
   ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
   ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
 }
 
 TEST_F(LoDTensorTester, NumElements2) {
   ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 2UL);
-  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 2UL);
+  ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
+  ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
 }
 
 TEST_F(LoDTensorTester, ShrinkLevels) {
@@ -68,17 +74,16 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
     LoDTensor new_lod_tensor = lod_tensor_;
     new_lod_tensor.ShrinkLevels(level, level + 1);
     ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
   }
   // shrink 2 level
   for (size_t level = 0; level < 2UL; ++level) {
     LoDTensor new_lod_tensor = lod_tensor_;
     new_lod_tensor.ShrinkLevels(level, level + 2);
+    // the lowest level's last element should be the tensor's batch_size.
+    ASSERT_EQ(new_lod_tensor.lod().back().back(),
+              lod_tensor_.lod().back().back());
     ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor_.NumElements(level));
-    ASSERT_EQ(new_lod_tensor.NumElements(1),
-              lod_tensor_.NumElements(level + 1));
     ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
   }
 }
@@ -86,21 +91,37 @@ TEST_F(LoDTensorTester, ShrinkLevels) {
 TEST_F(LoDTensorTester, ShrinkInLevel) {
   size_t level = 0;
   LoDTensor new_lod_tensor = lod_tensor_;
-  new_lod_tensor.ShrinkInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 0, 1);
   EXPECT_EQ(new_lod_tensor.NumLevels(), 3UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(1), 4UL);
-  EXPECT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(0), 1UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(1), 2UL);
+  EXPECT_EQ(new_lod_tensor.NumElements(2), 5UL);
   ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 
   level = 1;
   new_lod_tensor = lod_tensor_;
-  new_lod_tensor.ShrinkInLevel(level, 0, 2);
+  new_lod_tensor.ShrinkInLevel(level, 1, 2);
   ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
-  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
   ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
 }
 
+TEST_F(LoDTensorTester, SerializeDeserialize) {
+  // Round-trip the fixture tensor through its string form on the CPU.
+  float* src_ptr = lod_tensor_.data<float>();
+  std::string s = lod_tensor_.SerializeToString();
+  LoDTensor dst;
+  dst.DeserializeFromString(s, platform::CPUPlace());
+  float* dst_ptr = dst.data<float>();
+  for (int i = 0; i < kLodTensorSize; ++i) {
+    EXPECT_EQ(dst_ptr[i], src_ptr[i]);
+  }
+  // The LoD levels set in SetUp() must survive the round trip as well.
+  ASSERT_EQ(dst.NumElements(0), 2UL);
+  ASSERT_EQ(dst.NumElements(1), 3UL);
+  ASSERT_EQ(dst.NumElements(2), 8UL);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cu b/paddle/framework/lod_tensor_test.cu
index 647d07536dd070bc37137fc01f683ec07ba7d6f4..11659be02ac340728150cf0a6438db8626c8e611 100644
--- a/paddle/framework/lod_tensor_test.cu
+++ b/paddle/framework/lod_tensor_test.cu
@@ -36,8 +36,8 @@ TEST(LoDTensor, LoDInGPU) {
   lod_tensor.mutable_data<float>(place);
 
   lod_tensor.set_lod(src_lod);
-  CHECK_EQ(lod_tensor.lod_element(0, 2), 4UL);
-  CHECK_EQ(lod_tensor.lod_element(0, 4), 8UL);
+  CHECK_EQ(lod_tensor.lod_element(0, 2).first, 4UL);
+  CHECK_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
 
   auto lod = lod_tensor.lod();
 
@@ -48,3 +48,30 @@ TEST(LoDTensor, LoDInGPU) {
     CHECK_EQ(lod[0].data()[i], src_lod[0].data()[i] * 2);
   }
 }
+
+TEST(LoDTensor, SerializeDeserialize) {
+  paddle::framework::LoDTensor lod_tensor;
+  paddle::platform::GPUPlace place(0);
+
+  paddle::framework::LoD src_lod;
+  src_lod.push_back(std::vector<size_t>{0, 2, 4, 6, 8, 10, 12, 14});
+
+  lod_tensor.Resize({14, 16});
+  lod_tensor.mutable_data<float>(place);
+
+  lod_tensor.set_lod(src_lod);
+  CHECK_EQ(lod_tensor.lod_element(0, 2).first, 4UL);
+  CHECK_EQ(lod_tensor.lod_element(0, 4).first, 8UL);
+
+  test<<<1, 8>>>(src_lod[0].data(), src_lod[0].size());
+  cudaDeviceSynchronize();
+
+  std::string s = lod_tensor.SerializeToString();
+  paddle::framework::LoDTensor dst;
+  dst.DeserializeFromString(s, place);
+  paddle::framework::LoD dst_lod = dst.lod();
+
+  for (size_t i = 0; i < dst_lod[0].size(); ++i) {
+    CHECK_EQ(src_lod[0].data()[i], dst_lod[0].data()[i] * 2);
+  }
+}
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 99b5a9c37700adce56f9a83af3792ef113a873ff..18fabe481dac9c1b70e7c30cb83ec5ee8ac47026 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -13,13 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/op_desc.h"
+#include <functional>
+#include <unordered_map>
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace framework {
 
+OpDescBind::OpDescBind(const std::string &type, const VariableNameMap &inputs,
+                       const VariableNameMap &outputs,
+                       const AttributeMap &attrs) {
+  op_desc_.set_type(type);
+  inputs_ = inputs;
+  outputs_ = outputs;
+  attrs_ = attrs;
+  need_update_ = true;
+}
+
 OpDesc *OpDescBind::Proto() {
-  Sync();
+  Flush();
   return &op_desc_;
 }
 
@@ -31,11 +44,10 @@ const std::vector<std::string> &OpDescBind::Input(
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::InputNames() const {
+std::vector<std::string> OpDescBind::InputArgumentNames() const {
   std::vector<std::string> retv;
-  retv.reserve(this->inputs_.size());
   for (auto &ipt : this->inputs_) {
-    retv.push_back(ipt.first);
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
   }
   return retv;
 }
@@ -54,11 +66,10 @@ const std::vector<std::string> &OpDescBind::Output(
   return it->second;
 }
 
-std::vector<std::string> OpDescBind::OutputNames() const {
+std::vector<std::string> OpDescBind::OutputArgumentNames() const {
   std::vector<std::string> retv;
-  retv.reserve(this->outputs_.size());
   for (auto &ipt : this->outputs_) {
-    retv.push_back(ipt.first);
+    retv.insert(retv.end(), ipt.second.begin(), ipt.second.end());
   }
   return retv;
 }
@@ -89,6 +100,18 @@ void OpDescBind::SetAttr(const std::string &name, const Attribute &v) {
   need_update_ = true;
 }
 
+void OpDescBind::SetBlockAttr(const std::string &name, BlockDescBind &block) {
+  BlockDesc *desc = block.Proto();
+  this->attrs_[name] = desc;
+  need_update_ = true;
+}
+
+void OpDescBind::SetAttrMap(
+    const std::unordered_map<std::string, Attribute> &attr_map) {
+  attrs_ = attr_map;
+  need_update_ = true;
+}
+
 Attribute OpDescBind::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
   PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
@@ -101,7 +124,48 @@ int OpDescBind::GetBlockAttr(const std::string &name) const {
   return boost::get<BlockDesc *>(it->second)->idx();
 }
 
-void OpDescBind::Sync() {
+const std::unordered_map<std::string, Attribute> &OpDescBind::GetAttrMap()
+    const {
+  return attrs_;
+}
+
+void OpDescBind::Rename(const std::string &old_name,
+                        const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+struct SetAttrDescVisitor : public boost::static_visitor<void> {
+  explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
+  mutable OpDesc::Attr *attr_;
+  void operator()(int v) const { attr_->set_i(v); }
+  void operator()(float v) const { attr_->set_f(v); }
+  void operator()(const std::string &v) const { attr_->set_s(v); }
+  void operator()(bool b) const { attr_->set_b(b); }
+
+  void operator()(const std::vector<int> &v) const {
+    VectorToRepeated(v, attr_->mutable_ints());
+  }
+  void operator()(const std::vector<float> &v) const {
+    VectorToRepeated(v, attr_->mutable_floats());
+  }
+  void operator()(const std::vector<std::string> &v) const {
+    VectorToRepeated(v, attr_->mutable_strings());
+  }
+  void operator()(const std::vector<bool> &v) const {
+    VectorToRepeated(v, attr_->mutable_bools());
+  }
+  void operator()(BlockDesc *desc) const { attr_->set_block_idx(desc->idx()); }
+  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+};
+
+void OpDescBind::Flush() {
   if (need_update_) {
     this->op_desc_.mutable_inputs()->Clear();
     for (auto &ipt : inputs_) {
@@ -123,11 +187,71 @@ void OpDescBind::Sync() {
       attr_desc->set_name(attr.first);
       attr_desc->set_type(
           static_cast<framework::AttrType>(attr.second.which() - 1));
-      boost::apply_visitor(SetAttrDescVisitor(attr_desc), attr.second);
+      SetAttrDescVisitor visitor(attr_desc);
+      boost::apply_visitor(visitor, attr.second);
     }
 
     need_update_ = false;
   }
 }
+
+using InferShapeFuncMap =
+    std::unordered_map<std::string /*op_type*/,
+                       std::function<void(InferShapeContext *)>>;
+
+static InferShapeFuncMap &InferShapeFuncs() {
+  static InferShapeFuncMap *g_map = nullptr;
+  if (g_map == nullptr) {
+    g_map = new InferShapeFuncMap();
+    auto &info_map = OpInfoMap::Instance();
+    // all registered kernels
+    for (auto &pair : OperatorWithKernel::AllOpKernels()) {
+      auto &info = info_map.Get(pair.first);
+      // use empty type here to avoid runtime checks.
+      auto op =
+          static_cast<OperatorWithKernel *>(info.Creator()("", {}, {}, {}));
+      g_map->insert(
+          {pair.first, [op](InferShapeContext *ctx) { op->InferShape(ctx); }});
+    }
+  }
+  return *g_map;
+}
+
+void OpDescBind::CheckAttrs() {
+  PADDLE_ENFORCE(!Type().empty(),
+                 "CheckAttrs() can not be called before type is set.");
+  auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
+  if (checker == nullptr) {
+    // No checker is configured for this op type; such an operator could be
+    // generated by Paddle rather than by users, so the check is skipped.
+    return;
+  }
+  checker->Check(attrs_);
+}
+
+void OpDescBind::InferShape(const BlockDescBind &block) const {
+  auto &funcs = InferShapeFuncs();
+  auto it = funcs.find(this->Type());
+  if (it == funcs.end()) {
+    PADDLE_THROW("Operator %s has not been registered", this->Type());
+  }
+  CompileTimeInferShapeContext ctx(*this, block);
+  it->second(&ctx);
+}
+
+void OpDescBind::InferVarType(BlockDescBind *block) const {
+  auto &info = OpInfoMap::Instance().Get(this->Type());
+  if (info.infer_var_type_) {
+    info.infer_var_type_(*this, block);
+  } else {
+    // all output type is LoDTensor by default
+    for (auto &out_pair : this->outputs_) {
+      for (auto &out_var_name : out_pair.second) {
+        block->Var(out_var_name)->SetType(VarDesc::LOD_TENSOR);
+      }
+    }
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_desc.h b/paddle/framework/op_desc.h
index ffc8ac61abfb74e4716f10c457d0fbc18b2e2ab8..313bf538ac7c947c5e77ca0ead6bb53e6a156478 100644
--- a/paddle/framework/op_desc.h
+++ b/paddle/framework/op_desc.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/type_defs.h"
 #include "paddle/framework/var_desc.h"
 
 namespace paddle {
@@ -26,6 +27,11 @@ class BlockDescBind;
 
 class OpDescBind {
  public:
+  OpDescBind() {}
+
+  OpDescBind(const std::string &type, const VariableNameMap &inputs,
+             const VariableNameMap &outputs, const AttributeMap &attrs);
+
   OpDesc *Proto();
 
   std::string Type() const { return op_desc_.type(); }
@@ -34,20 +40,18 @@ class OpDescBind {
 
   const std::vector<std::string> &Input(const std::string &name) const;
 
-  std::vector<std::string> InputNames() const;
+  std::vector<std::string> InputArgumentNames() const;
 
   void SetInput(const std::string &param_name,
                 const std::vector<std::string> &args);
 
   const std::vector<std::string> &Output(const std::string &name) const;
 
-  std::vector<std::string> OutputNames() const;
+  std::vector<std::string> OutputArgumentNames() const;
 
   void SetOutput(const std::string &param_name,
                  const std::vector<std::string> &args);
 
-  std::string DebugString() { return this->Proto()->DebugString(); }
-
   bool HasAttr(const std::string &name) const {
     return attrs_.find(name) != attrs_.end();
   }
@@ -64,39 +68,59 @@ class OpDescBind {
 
   int GetBlockAttr(const std::string &name) const;
 
+  void Rename(const std::string &old_name, const std::string &new_name);
+
+  // Only to be used in C++
+  const AttributeMap &GetAttrMap() const;
+
+  // Only to be used in C++
+  void SetAttrMap(const AttributeMap &attr_map);
+
+  std::vector<std::string> InputNames() const { return MapKeys(inputs_); }
+  std::vector<std::string> OutputNames() const { return MapKeys(outputs_); }
+
+  void SetInputMap(const VariableNameMap &input) {
+    this->inputs_ = input;
+    this->need_update_ = true;
+  }
+
+  void SetOutputMap(const VariableNameMap &output) {
+    this->outputs_ = output;
+    this->need_update_ = true;
+  }
+
+  const VariableNameMap &Inputs() const { return inputs_; }
+
+  const VariableNameMap &Outputs() const { return outputs_; }
+
+  AttributeMap *MutableAttrMap() {
+    this->need_update_ = true;
+    return &this->attrs_;
+  }
+
+  void CheckAttrs();
+
+  void InferShape(const BlockDescBind &block) const;
+
+  void InferVarType(BlockDescBind *block) const;
+
+  void Flush();
+
  private:
-  struct SetAttrDescVisitor : public boost::static_visitor<void> {
-    explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
-    mutable OpDesc::Attr *attr_;
-    void operator()(int v) const { attr_->set_i(v); }
-    void operator()(float v) const { attr_->set_f(v); }
-    void operator()(const std::string &v) const { attr_->set_s(v); }
-    void operator()(bool b) const { attr_->set_b(b); }
-
-    void operator()(const std::vector<int> &v) const {
-      VectorToRepeated(v, attr_->mutable_ints());
-    }
-    void operator()(const std::vector<float> &v) const {
-      VectorToRepeated(v, attr_->mutable_floats());
-    }
-    void operator()(const std::vector<std::string> &v) const {
-      VectorToRepeated(v, attr_->mutable_strings());
-    }
-    void operator()(const std::vector<bool> &v) const {
-      VectorToRepeated(v, attr_->mutable_bools());
-    }
-    void operator()(BlockDesc *desc) const {
-      attr_->set_block_idx(desc->idx());
-    }
-    void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
-  };
-
-  void Sync();
+  template <typename MapType>
+  static std::vector<typename MapType::key_type> MapKeys(const MapType &map) {
+    std::vector<typename MapType::key_type> ret_val;
+    ret_val.reserve(map.size());
+    std::transform(
+        map.begin(), map.end(), std::back_inserter(ret_val),
+        [](const typename MapType::value_type &pair) { return pair.first; });
+    return ret_val;
+  }
 
   OpDesc op_desc_;
-  std::unordered_map<std::string, std::vector<std::string>> inputs_;
-  std::unordered_map<std::string, std::vector<std::string>> outputs_;
-  std::unordered_map<std::string, Attribute> attrs_;
+  VariableNameMap inputs_;
+  VariableNameMap outputs_;
+  AttributeMap attrs_;
 
   // need_update_ indicate there some local changes not be synchronized. If
   // local changes should be synchronized, need_update_ should be set to true.
diff --git a/paddle/framework/op_info.h b/paddle/framework/op_info.h
index b98d8f23a14cf6fbe787953ad16b5c9ab99222ad..59a64d71371b546f76eabdeed7e7514e8fb0f84a 100644
--- a/paddle/framework/op_info.h
+++ b/paddle/framework/op_info.h
@@ -19,21 +19,18 @@
 #include <unordered_map>
 
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/type_defs.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
-class OperatorBase;
-using VariableNameMap = std::map<std::string, std::vector<std::string>>;
-
-using OpCreator = std::function<OperatorBase*(
-    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
-    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
 
 struct OpInfo {
   OpCreator creator_;
-  std::string grad_op_type_;
-  OpProto* proto_;
-  OpAttrChecker* checker_;
+  GradOpMakerFN grad_op_maker_;
+  OpProto* proto_{nullptr};
+  OpAttrChecker* checker_{nullptr};
+  InferVarTypeFN infer_var_type_;
 
   bool HasOpProtoAndChecker() const {
     return proto_ != nullptr && checker_ != nullptr;
@@ -46,30 +43,25 @@ struct OpInfo {
     return *proto_;
   }
 
-  const OpAttrChecker& Checker() const {
-    PADDLE_ENFORCE_NOT_NULL(checker_,
-                            "Operator Checker has not been registered");
-    return *checker_;
-  }
-
   const OpCreator& Creator() const {
     PADDLE_ENFORCE_NOT_NULL(creator_,
                             "Operator Creator has not been registered");
     return creator_;
   }
 
-  bool HasGradientOp() const { return !grad_op_type_.empty(); }
+  const GradOpMakerFN& GradOpMaker() const {
+    PADDLE_ENFORCE_NOT_NULL(grad_op_maker_,
+                            "Operator GradOpMaker has not been registered.");
+    return grad_op_maker_;
+  }
+
+  const OpAttrChecker* Checker() const { return checker_; }
 };
 
 class OpInfoMap {
  public:
   static OpInfoMap& Instance();
 
-  OpInfoMap(const OpInfoMap& o) = delete;
-  OpInfoMap(OpInfoMap&& o) = delete;
-  OpInfoMap& operator=(const OpInfoMap& o) = delete;
-  OpInfoMap& operator=(OpInfoMap&& o) = delete;
-
   bool Has(const std::string& op_type) const {
     return map_.find(op_type) != map_.end();
   }
@@ -95,16 +87,15 @@ class OpInfoMap {
     }
   }
 
-  template <typename Callback>
-  void IterAllInfo(Callback callback) {
-    for (auto& it : map_) {
-      callback(it.first, it.second);
-    }
+  const std::unordered_map<std::string, const OpInfo>& map() const {
+    return map_;
   }
 
  private:
   OpInfoMap() = default;
   std::unordered_map<std::string, const OpInfo> map_;
+
+  DISABLE_COPY_AND_ASSIGN(OpInfoMap);
 };
 
 }  // namespace framework
diff --git a/paddle/framework/op_proto_maker.h b/paddle/framework/op_proto_maker.h
index 4d55a37db9f0a3deac7b3489c8bc288ea41f4799..44e8ab16895cc604f85bb83e240eab55739f8ba0 100644
--- a/paddle/framework/op_proto_maker.h
+++ b/paddle/framework/op_proto_maker.h
@@ -45,8 +45,8 @@ class OpProtoAndCheckerMaker {
       return *this;
     }
 
-    VariableBuilder& NotInGradient() {
-      var_->set_not_in_gradient(true);
+    VariableBuilder& AsDispensable() {
+      var_->set_dispensable(true);
       return *this;
     }
   };
diff --git a/paddle/framework/op_proto_maker_test.cc b/paddle/framework/op_proto_maker_test.cc
index b01e30f75371ca4aa63dae86ddfb966b1d4c7830..988a14cf4de8fdf052ca7e8c41bff0c05ba2daaa 100644
--- a/paddle/framework/op_proto_maker_test.cc
+++ b/paddle/framework/op_proto_maker_test.cc
@@ -48,4 +48,4 @@ TEST(ProtoMaker, DuplicatedInOut) {
   paddle::framework::OpAttrChecker op_checker;
   auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
   ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
-}
\ No newline at end of file
+}
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
index b0e85dd49f97da4a7f889fde0b5f060954947be8..c2f2438edf6daadf26cbc6db37f6668739ab1726 100644
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
@@ -23,7 +23,9 @@ std::unique_ptr<OperatorBase> OpRegistry::CreateOp(
     const std::string& type, const VariableNameMap& inputs,
     const VariableNameMap& outputs, AttributeMap attrs) {
   auto& info = OpInfoMap::Instance().Get(type);
-  info.Checker().Check(attrs);
+  if (info.Checker() != nullptr) {
+    info.Checker()->Check(attrs);
+  }
   auto op = info.Creator()(type, inputs, outputs, attrs);
   return std::unique_ptr<OperatorBase>(op);
 }
@@ -41,20 +43,21 @@ static VariableNameMap ConvertOpDescVarsToVarNameMap(
   return ret_val;
 }
 
-std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc) {
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDesc& op_desc,
+                                                   ProgramDesc* program) {
   VariableNameMap inputs = ConvertOpDescVarsToVarNameMap(op_desc.inputs());
   VariableNameMap outputs = ConvertOpDescVarsToVarNameMap(op_desc.outputs());
   AttributeMap attrs;
   for (auto& attr : op_desc.attrs()) {
-    attrs[attr.name()] = GetAttrValue(attr);
+    attrs[attr.name()] = GetAttrValue(attr, program);
   }
 
   return CreateOp(op_desc.type(), inputs, outputs, attrs);
 }
 
-std::unique_ptr<OperatorBase> OpRegistry::CreateGradOp(const OperatorBase& op) {
-  PADDLE_ENFORCE(!op.IsNetOp(), "Use framework::Backward to get backward ops");
-  return std::unique_ptr<OperatorBase>(BuildGradOp(&op));
+std::unique_ptr<OperatorBase> OpRegistry::CreateOp(const OpDescBind& op_desc) {
+  return CreateOp(op_desc.Type(), op_desc.Inputs(), op_desc.Outputs(),
+                  op_desc.GetAttrMap());
 }
 
 }  // namespace framework
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 90077d0192421f3678a049a723972fcb1e8d67af..ed85c386ec2632604bf5faf0ff9b1a087eb9c276 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -20,50 +20,54 @@ limitations under the License. */
 #include <typeinfo>
 #include <unordered_map>
 #include <unordered_set>
+
+#include "glog/logging.h"  // For VLOG()
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/details/op_registry.h"
 #include "paddle/framework/framework.pb.h"
-#include "paddle/framework/grad_op_builder.h"
-#include "paddle/framework/op_info.h"
-#include "paddle/framework/op_proto_maker.h"
+#include "paddle/framework/grad_op_desc_maker.h"
+#include "paddle/framework/op_desc.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/scope.h"
 
 namespace paddle {
 namespace framework {
+class Registrar {
+ public:
+  // In our design, various kinds of classes, e.g., operators and kernels,
+  // have their corresponding registry and registrar. The action of
+  // registration is in the constructor of a global registrar variable, which,
+  // however, are not used in the code that calls package framework, and would
+  // be removed from the generated binary file by the linker. To avoid such
+  // removal, we add Touch to all registrar classes and make USE_OP macros to
+  // call this method. So, as long as the callee code calls USE_OP, the global
+  // registrar variable won't be removed by the linker.
+  void Touch() {}
+};
+
+template <typename... ARGS>
+struct OperatorRegistrar : public Registrar {
+  explicit OperatorRegistrar(const char* op_type) {
+    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
+                   "'%s' is registered more than once.", op_type);
+    static_assert(sizeof...(ARGS) != 0,
+                  "OperatorRegistrar should be invoked at least by OpClass");
+    OpInfo info;
+    details::OperatorRegistrarRecursive<0, false, ARGS...>(op_type, &info);
+    OpInfoMap::Instance().Insert(op_type, info);
+  }
+};
 
 class OpRegistry {
  public:
   template <typename OpType, typename ProtoMakerType, typename GradOpType>
   static void RegisterOp(const std::string& op_type,
                          const std::string& grad_op_type) {
-    PADDLE_ENFORCE(!OpInfoMap::Instance().Has(op_type),
-                   "'%s' is registered more than once.", op_type);
-    OpInfo op_info;
-    op_info.creator_ = [](
-        const std::string& type, const VariableNameMap& inputs,
-        const VariableNameMap& outputs, const AttributeMap& attrs) {
-      return new OpType(type, inputs, outputs, attrs);
-    };
-    op_info.grad_op_type_ = grad_op_type;
-    if (std::type_index(typeid(ProtoMakerType)) !=
-        std::type_index(typeid(NOPMaker))) {
-      op_info.proto_ = new OpProto;
-      op_info.checker_ = new OpAttrChecker;
-      auto maker = ProtoMakerType(op_info.proto_, op_info.checker_);
-      maker.Validate();
-      op_info.proto_->set_type(op_type);
-      PADDLE_ENFORCE(
-          op_info.proto_->IsInitialized(),
-          "Fail to initialize %s's OpProto, because %s is not initialized",
-          op_type, op_info.proto_->InitializationErrorString());
-    } else {
-      op_info.proto_ = nullptr;
-      op_info.checker_ = nullptr;
-    }
-    OpInfoMap::Instance().Insert(op_type, op_info);
+    OperatorRegistrar<OpType, ProtoMakerType> reg(op_type.c_str());
+    // OpInfo no longer stores a grad op type; the grad op is registered below.
     // register gradient op
     if (!grad_op_type.empty()) {
-      RegisterOp<GradOpType, NOPMaker, NOP>(grad_op_type, "");
+      OperatorRegistrar<GradOpType> grad_reg(grad_op_type.c_str());
     }
   }
 
@@ -72,41 +76,45 @@ class OpRegistry {
                                                 const VariableNameMap& outputs,
                                                 AttributeMap attrs);
 
-  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc);
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDesc& op_desc,
+                                                ProgramDesc* program);
 
-  static std::unique_ptr<OperatorBase> CreateGradOp(const OperatorBase& op);
+  static std::unique_ptr<OperatorBase> CreateOp(const OpDescBind& op_desc);
 };
 
-class Registrar {
- public:
-  // In our design, various kinds of classes, e.g., operators and kernels,
-  // have their corresponding registry and registrar. The action of
-  // registration is in the constructor of a global registrar variable, which,
-  // however, are not used in the code that calls package framework, and would
-  // be removed from the generated binary file by the linker. To avoid such
-  // removal, we add Touch to all registrar classes and make USE_OP macros to
-  // call this method. So, as long as the callee code calls USE_OP, the global
-  // registrar variable won't be removed by the linker.
-  void Touch() {}
-};
+template <typename PlaceType, bool at_end, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor;
 
-template <typename OpType, typename ProtoMakerType, typename GradOpType>
-class OpRegistrar : public Registrar {
- public:
-  explicit OpRegistrar(const char* op_type) { OpRegistrar(op_type, ""); }
-  OpRegistrar(const char* op_type, const char* grad_op_type) {
-    OpRegistry::RegisterOp<OpType, ProtoMakerType, GradOpType>(op_type,
-                                                               grad_op_type);
+template <typename PlaceType, size_t I, typename... KernelTypes>
+struct OpKernelRegistrarFunctor<PlaceType, false, I, KernelTypes...> {
+  using KERNEL_TYPE =
+      typename std::tuple_element<I, std::tuple<KernelTypes...>>::type;
+
+  void operator()(const char* op_type) const {
+    using T = typename KERNEL_TYPE::ELEMENT_TYPE;
+    OperatorWithKernel::OpKernelKey key(ToDataType(std::type_index(typeid(T))),
+                                        PlaceType());
+    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KERNEL_TYPE);
+
+    constexpr auto size = std::tuple_size<std::tuple<KernelTypes...>>::value;
+    OpKernelRegistrarFunctor<PlaceType, I + 1 == size, I + 1, KernelTypes...>
+        func;
+    func(op_type);
   }
 };
 
-template <typename PlaceType, typename KernelType>
+template <typename PlaceType, size_t I, typename... KernelType>
+struct OpKernelRegistrarFunctor<PlaceType, true, I, KernelType...> {
+  void operator()(const char* op_type) const {}
+};
+
+// Users can register many kernels in one place. The data types may differ.
+template <typename PlaceType, typename... KernelType>
 class OpKernelRegistrar : public Registrar {
  public:
   explicit OpKernelRegistrar(const char* op_type) {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = PlaceType();
-    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType);
+    OpKernelRegistrarFunctor<PlaceType, false, 0, KernelType...> func;
+    func(op_type);
   }
 };
 
@@ -119,33 +127,42 @@ class OpKernelRegistrar : public Registrar {
                              __test_global_namespace_##uniq_name##__>::value, \
                 msg)
 
+#define REGISTER_OPERATOR(op_type, op_class, ...)                      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                      \
+      __reg_op__##op_type,                                             \
+      "REGISTER_OPERATOR must be called in global namespace");         \
+  class _OpClass_##op_type##_ : public op_class {                      \
+   public:                                                             \
+    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                     \
+    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);            \
+  };                                                                   \
+  static ::paddle::framework::OperatorRegistrar<_OpClass_##op_type##_, \
+                                                ##__VA_ARGS__>         \
+      __op_registrar_##op_type##__(#op_type);                          \
+  int TouchOpRegistrar_##op_type() {                                   \
+    __op_registrar_##op_type##__.Touch();                              \
+    return 0;                                                          \
+  }
+
 /**
  * Macro to register Operator.
  */
-#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,          \
-                    grad_op_class)                                            \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                             \
-      __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
-  class _OpClass_##op_type##_ : public op_class {                             \
-   public:                                                                    \
-    DEFINE_OP_CLONE_METHOD(_OpClass_##op_type##_);                            \
-    DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_, op_class);                   \
-  };                                                                          \
-  class _OpGradClass_##op_type##_ : public grad_op_class {                    \
-   public:                                                                    \
-    DEFINE_OP_CLONE_METHOD(_OpGradClass_##op_type##_);                        \
-    DEFINE_OP_CONSTRUCTOR(_OpGradClass_##op_type##_, grad_op_class);          \
-  };                                                                          \
-  static ::paddle::framework::OpRegistrar<                                    \
-      _OpClass_##op_type##_, op_maker_class, _OpGradClass_##op_type##_>       \
-      __op_registrar_##op_type##__(#op_type, #grad_op_type);                  \
-  int TouchOpRegistrar_##op_type() {                                          \
-    __op_registrar_##op_type##__.Touch();                                     \
-    return 0;                                                                 \
-  }
+#define REGISTER_OP(op_type, op_class, op_maker_class, grad_op_type,       \
+                    grad_op_class)                                         \
+  REGISTER_OPERATOR(grad_op_type, grad_op_class);                          \
+  class _GradOpDescMaker_##grad_op_type##_                                 \
+      : public ::paddle::framework::DefaultGradOpDescMaker<true> {         \
+    using ::paddle::framework::DefaultGradOpDescMaker<                     \
+        true>::DefaultGradOpDescMaker;                                     \
+                                                                           \
+   protected:                                                              \
+    virtual std::string GradOpType() const { return #grad_op_type; }       \
+  };                                                                       \
+  REGISTER_OPERATOR(op_type, op_class, _GradOpDescMaker_##grad_op_type##_, \
+                    op_maker_class);
 
 #define REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class) \
-  REGISTER_OP(op_type, op_class, op_maker_class, , ::paddle::framework::NOP)
+  REGISTER_OPERATOR(op_type, op_class, op_maker_class)
 
 /**
  * Macro to register OperatorKernel.
@@ -192,7 +209,7 @@ class OpKernelRegistrar : public Registrar {
 // TODO(fengjiayi): The following macros
 // seems ugly, do we have better method?
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 #define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
 #define USE_OP_KERNEL(op_type)        \
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index b6fc0409d5cb22b13352df41b8e911c79bc4825a..6289125d7c782e542e5c55e1d4403836351b7e05 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -74,7 +74,7 @@ TEST(OpRegistry, CreateOp) {
   attr->set_type(paddle::framework::AttrType::FLOAT);
   attr->set_f(scale);
 
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
@@ -95,7 +95,7 @@ TEST(OpRegistry, IllegalAttr) {
 
   bool caught = false;
   try {
-    paddle::framework::OpRegistry::CreateOp(op_desc);
+    paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "larger_than check fail";
@@ -115,7 +115,7 @@ TEST(OpRegistry, DefaultValue) {
 
   ASSERT_TRUE(op_desc.IsInitialized());
 
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   paddle::framework::Scope scope;
   paddle::platform::CPUDeviceContext dev_ctx;
   op->Run(scope, dev_ctx);
@@ -131,7 +131,7 @@ TEST(OpRegistry, CustomChecker) {
   // attr 'test_attr' is not set
   bool caught = false;
   try {
-    paddle::framework::OpRegistry::CreateOp(op_desc);
+    paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "Attribute 'test_attr' is required!";
@@ -149,7 +149,7 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_i(3);
   caught = false;
   try {
-    paddle::framework::OpRegistry::CreateOp(op_desc);
+    paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "'test_attr' must be even!";
@@ -166,10 +166,21 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_name("test_attr");
   attr->set_type(paddle::framework::AttrType::INT);
   attr->set_i(4);
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   paddle::platform::CPUDeviceContext dev_ctx;
   paddle::framework::Scope scope;
   op->Run(scope, dev_ctx);
   int test_attr = op->Attr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
 }
+
+class CosineOpComplete : public paddle::framework::CosineOp {
+ public:
+  DEFINE_OP_CONSTRUCTOR(CosineOpComplete, paddle::framework::CosineOp);
+  DEFINE_OP_CLONE_METHOD(CosineOpComplete);
+};
+
+TEST(OperatorRegistrar, Test) {
+  using namespace paddle::framework;
+  OperatorRegistrar<CosineOpComplete, CosineOpProtoAndCheckerMaker> reg("cos");
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index d7beff5bc1df1def6bf35381e103cf87eeb68fd0..a67625fa88fd2fbe4db43241ee824519ceac7017 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -22,14 +22,14 @@ namespace framework {
 template <>
 Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
     platform::CPUPlace, Eigen::DefaultDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
+  return *device_context_.GetEigenDevice<platform::CPUPlace>();
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 Eigen::GpuDevice&
 ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
-  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
+  return *device_context_.GetEigenDevice<platform::GPUPlace>();
 }
 #endif
 
@@ -205,13 +205,13 @@ void OperatorBase::GenerateTemporaryNames() {
 }
 
 template <>
-const Tensor* InferShapeContext::Input<Tensor>(const std::string& name) const {
+const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const {
   auto* var = InputVar(name);
   return var == nullptr ? nullptr : GetTensorFromVar(var);
 }
 
 template <>
-const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
+const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const {
   auto names = op().Inputs(name);
   std::vector<const Tensor*> res;
@@ -225,13 +225,13 @@ const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
 }
 
 template <>
-Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const {
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const {
   auto var = OutputVar(name);
   return var == nullptr ? nullptr : var->GetMutable<LoDTensor>();
 }
 
 template <>
-std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const {
   auto names = op().Outputs(name);
   std::vector<Tensor*> res;
@@ -245,5 +245,27 @@ std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
   return res;
 }
 
+std::ostream& operator<<(std::ostream& os,
+                         const OperatorWithKernel::OpKernelKey& kernel_key) {
+  os << "place[" << kernel_key.place_ << "]:data_type[" << kernel_key.data_type_
+     << "]";
+  return os;
+}
+
+bool OpSupportGPU(const std::string& op_type) {
+  auto& all_kernels = OperatorWithKernel::AllOpKernels();
+  auto it = all_kernels.find(op_type);
+  if (it == all_kernels.end()) {
+    // All control operators must support GPU
+    return true;
+  }
+  for (auto& kern_pair : it->second) {
+    if (platform::is_gpu_place(kern_pair.first.place_)) {
+      return true;
+    }
+  }
+  return false;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 79bda2e2f9173ab632307bc52167d7d8c17d4418..0d0304ac9e13089ef533b0a47f0ec989c8fd7078 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -20,10 +20,13 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
-#include "op_info.h"
+#include "glog/logging.h"  // For VLOG
 #include "paddle/framework/attribute.h"
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/data_type.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_info.h"
 #include "paddle/framework/scope.h"
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/tensor.h"
@@ -55,7 +58,6 @@ inline std::string GradVarName(const std::string& var_name) {
 }
 
 class OperatorBase;
-class InferShapeContext;
 class ExecutionContext;
 
 extern const Tensor* GetTensorFromVar(const Variable* var);
@@ -141,9 +143,9 @@ class OperatorBase {
 // Macro for define a clone method.
 // If you are writing an kernel operator, `Clone` will be defined when you
 // register it. i.e. `Clone` method is not needed to define by yourself.
-#define DEFINE_OP_CLONE_METHOD(cls)                       \
-  std::unique_ptr<OperatorBase> Clone() const final {     \
-    return std::unique_ptr<OperatorBase>(new cls(*this)); \
+#define DEFINE_OP_CLONE_METHOD(cls)                                            \
+  std::unique_ptr<::paddle::framework::OperatorBase> Clone() const final {     \
+    return std::unique_ptr<::paddle::framework::OperatorBase>(new cls(*this)); \
   }
 
 // Macro for define a default constructor for Operator.
@@ -167,10 +169,11 @@ class NOP : public OperatorBase {
   }
 };
 
-class InferShapeContext {
+class ExecutionContext {
  public:
-  InferShapeContext(const OperatorBase& op, const Scope& scope)
-      : op_(op), scope_(scope) {}
+  ExecutionContext(const OperatorBase& op, const Scope& scope,
+                   const platform::DeviceContext& device_context)
+      : op_(op), scope_(scope), device_context_(device_context) {}
 
   const OperatorBase& op() const { return op_; }
 
@@ -276,101 +279,234 @@ class InferShapeContext {
     out_tensor->set_lod(in_tensor.lod());
   }
 
+  template <typename PlaceType,
+            typename DeviceType = typename platform::EigenDeviceConverter<
+                PlaceType>::EigenDeviceType>
+  DeviceType& GetEigenDevice() const;
+
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+
+  const platform::DeviceContext& device_context() const {
+    return device_context_;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  const platform::CUDADeviceContext& cuda_device_context() const {
+    PADDLE_ENFORCE(platform::is_gpu_place(device_context_.GetPlace()));
+    auto cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext*>(&device_context_);
+    return *cuda_ctx;
+  }
+#endif
+
  private:
   const OperatorBase& op_;
   const Scope& scope_;
+  const platform::DeviceContext& device_context_;
 };
 
 template <>
-const Tensor* InferShapeContext::Input<Tensor>(const std::string& name) const;
+const Tensor* ExecutionContext::Input<Tensor>(const std::string& name) const;
 
 template <>
-const std::vector<const Tensor*> InferShapeContext::MultiInput<Tensor>(
+const std::vector<const Tensor*> ExecutionContext::MultiInput<Tensor>(
     const std::string& name) const;
 
 template <>
-Tensor* InferShapeContext::Output<Tensor>(const std::string& name) const;
+Tensor* ExecutionContext::Output<Tensor>(const std::string& name) const;
 
 template <>
-std::vector<Tensor*> InferShapeContext::MultiOutput<Tensor>(
+std::vector<Tensor*> ExecutionContext::MultiOutput<Tensor>(
     const std::string& name) const;
 
-template <typename T>
-struct EigenDeviceConverter;
+class CompileTimeInferShapeContext : public InferShapeContext {
+ public:
+  CompileTimeInferShapeContext(const OpDescBind& op, const BlockDescBind& block)
+      : op_(op), block_(block) {}
+
+  bool HasInput(const std::string& name) const override {
+    const std::vector<std::string>& input_names = op_.Input(name);
+    auto length = input_names.size();
+    if (length == 0) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input(%s) should have only one value, "
+                      "but it have %d now",
+                      name, length);
+    return block_.HasVarRecursive(input_names[0]);
+  }
 
-template <>
-struct EigenDeviceConverter<platform::CPUPlace> {
-  using EigenDeviceType = Eigen::DefaultDevice;
-};
+  bool HasOutput(const std::string& name) const override {
+    const std::vector<std::string>& output_names = op_.Output(name);
+    auto length = output_names.size();
+    if (length == 0) {
+      return false;
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output(%s) should have only one value, "
+                      "but it have %d now",
+                      name, length);
+    return block_.HasVarRecursive(output_names[0]);
+  }
 
-#ifndef PADDLE_ONLY_CPU
-template <>
-struct EigenDeviceConverter<platform::GPUPlace> {
-  using EigenDeviceType = Eigen::GpuDevice;
-};
-#endif
+  bool HasInputs(const std::string& name) const override {
+    const std::vector<std::string>& input_names = op_.Input(name);
+    if (input_names.empty()) {
+      return false;
+    }
+    for (auto& input : input_names) {
+      if (!block_.HasVarRecursive(input)) return false;
+    }
+    return true;
+  }
 
-class ExecutionContext : public InferShapeContext {
- public:
-  ExecutionContext(const OperatorBase& op, const Scope& scope,
-                   const platform::DeviceContext& device_context)
-      : InferShapeContext(op, scope), device_context_(device_context) {}
+  bool HasOutputs(const std::string& name) const override {
+    const std::vector<std::string>& output_names = op_.Output(name);
+    if (output_names.empty()) {
+      return false;
+    }
+    for (auto& output : output_names) {
+      if (!block_.HasVarRecursive(output)) return false;
+    }
+    return true;
+  }
 
-  template <typename PlaceType,
-            typename DeviceType =
-                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
-  DeviceType& GetEigenDevice() const;
+  DDim GetInputDim(const std::string& name) const override {
+    std::vector<DDim> ddims = GetInputsDim(name);
+    auto length = ddims.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input(%s) should have 1 value, "
+                      "but it has %d now",
+                      name, length);
+    return ddims[0];
+  }
 
-  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+  void SetInputDim(const std::string& name, const DDim& dim) override {
+    SetInputsDim(name, {dim});
+  }
 
-  const platform::DeviceContext& device_context() const {
-    return device_context_;
+  DDim GetOutputDim(const std::string& name) const override {
+    std::vector<DDim> ddims = GetOutputsDim(name);
+    auto length = ddims.size();
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output(%s) should have 1 value, "
+                      "but it has %d now",
+                      name, length);
+    return ddims[0];
+  }
+
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
+    SetOutputsDim(name, {dim});
+  }
+
+  AttrReader Attrs() const override { return AttrReader(op_.GetAttrMap()); }
+
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
+    return op_.Input(name);
+  }
+
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
+    return op_.Output(name);
   }
 
  private:
-  const platform::DeviceContext& device_context_;
+  DDim GetDim(const std::string& name) const override {
+    return framework::make_ddim(block_.FindVarRecursive(name)->Shape());
+  }
+
+  void SetDim(const std::string& name, const DDim& dim) override {
+    block_.FindVarRecursive(name)->SetShape(framework::vectorize(dim));
+  }
+
+  const OpDescBind& op_;
+  const BlockDescBind& block_;
 };
 
-class RuntimeInferShapeContext : public InferShapeContextBase {
+class RuntimeInferShapeContext : public InferShapeContext {
  public:
   RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope)
       : op_(op), scope_(scope) {}
 
-  bool HasInput(const std::string& name) const {
-    auto ipt = op_.Input(name);
+  bool HasInput(const std::string& name) const override {
+    auto& ins = Inputs(name);
+    size_t length = ins.size();
+    if (length == 0) {
+      return false;  // no variable name is bound to this slot
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Input %s should not have more than one input", name);
+    auto ipt = ins[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
   }
 
-  bool HasOutput(const std::string& name) const {
-    auto ipt = op_.Output(name);
+  bool HasOutput(const std::string& name) const override {
+    auto& outs = Outputs(name);
+    size_t length = outs.size();
+    if (length == 0) {
+      return false;  // no variable name is bound to this slot
+    }
+    PADDLE_ENFORCE_EQ(length, 1UL,
+                      "Output %s should not have more than one output", name);
+    auto ipt = outs[0];
     auto* var = ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt);
     return var != nullptr;
   }
 
-  DDim GetInputDim(const std::string& name) const {
+  bool HasInputs(const std::string& name) const override {
+    auto inputs = op_.Inputs(name);
+    if (inputs.empty()) {
+      return false;
+    }
+    for (auto& input : inputs) {
+      if (scope_.FindVar(input) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  bool HasOutputs(const std::string& name) const override {
+    auto outputs = op_.Outputs(name);
+    if (outputs.empty()) {
+      return false;
+    }
+    for (auto& output : outputs) {
+      if (scope_.FindVar(output) == nullptr) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  DDim GetInputDim(const std::string& name) const override {
     return GetDim(op_.Input(name));
   }
 
-  void SetInputDim(const std::string& name, const DDim& dim) {
+  void SetInputDim(const std::string& name, const DDim& dim) override {
     SetDim(op_.Input(name), dim);
   }
 
-  DDim GetOutputDim(const std::string& name) const {
+  DDim GetOutputDim(const std::string& name) const override {
     return GetDim(op_.Output(name));
   }
 
-  void SetOutputDim(const std::string& name, const DDim& dim) {
+  void SetOutputDim(const std::string& name, const DDim& dim) override {
     SetDim(op_.Output(name), dim);
   }
 
-  AttrReader Attrs() const { return AttrReader(op_.Attrs()); }
+  AttrReader Attrs() const override { return AttrReader(op_.Attrs()); }
 
-  const std::vector<std::string>& Inputs(const std::string& name) const {
+  const std::vector<std::string>& Inputs(
+      const std::string& name) const override {
     return op_.Inputs(name);
   }
 
-  const std::vector<std::string>& Outputs(const std::string& name) const {
+  const std::vector<std::string>& Outputs(
+      const std::string& name) const override {
     return op_.Outputs(name);
   }
 
@@ -391,11 +527,11 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
     return t;
   }
 
-  DDim GetDim(const std::string& name) const {
+  DDim GetDim(const std::string& name) const override {
     return GetTensor<false>(name)->dims();
   }
 
-  void SetDim(const std::string& name, const DDim& dim) {
+  void SetDim(const std::string& name, const DDim& dim) override {
     GetTensor<true>(name)->Resize(dim);
   }
 
@@ -403,7 +539,7 @@ class RuntimeInferShapeContext : public InferShapeContextBase {
   const Scope& scope_;
 };
 
-class OpKernel {
+class OpKernelBase {
  public:
   /**
    * ExecutionContext is the only parameter of Kernel Run function.
@@ -414,33 +550,47 @@ class OpKernel {
 
   virtual void Compute(const ExecutionContext& context) const = 0;
 
-  virtual ~OpKernel() {}
+  virtual ~OpKernelBase() = default;
+};
+
+template <typename T>
+class OpKernel : public OpKernelBase {
+ public:
+  using ELEMENT_TYPE = T;
 };
 
 class OperatorWithKernel : public OperatorBase {
  public:
   struct OpKernelKey {
     platform::Place place_;
+    DataType data_type_;
 
-    OpKernelKey() = default;
-    explicit OpKernelKey(const platform::DeviceContext& dev_ctx) {
-      place_ = dev_ctx.GetPlace();
-    }
+    OpKernelKey(DataType data_type, platform::Place place)
+        : place_(place), data_type_(data_type) {}
+
+    OpKernelKey(DataType data_type, const platform::DeviceContext& dev_ctx)
+        : place_(dev_ctx.GetPlace()), data_type_(data_type) {}
 
     bool operator==(const OpKernelKey& o) const {
-      return platform::places_are_same_class(place_, o.place_);
+      return platform::places_are_same_class(place_, o.place_) &&
+             data_type_ == o.data_type_;
     }
   };
 
   struct OpKernelHash {
-    std::hash<bool> hash_;
+    std::hash<int> hash_;
     size_t operator()(const OpKernelKey& key) const {
-      return hash_(platform::is_gpu_place(key.place_));
+      int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_);
+      int pre_hash = data_type << NUM_PLACE_TYPE_LIMIT_IN_BIT |
+                     (place & ((1 << NUM_PLACE_TYPE_LIMIT_IN_BIT) - 1));
+      return hash_(pre_hash);
     }
   };
 
   using OpKernelMap =
-      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernelBase>,
+                         OpKernelHash>;
 
   OperatorWithKernel(const std::string& type, const VariableNameMap& inputs,
                      const VariableNameMap& outputs, const AttributeMap& attrs)
@@ -448,11 +598,30 @@ class OperatorWithKernel : public OperatorBase {
 
   void Run(const Scope& scope,
            const platform::DeviceContext& dev_ctx) const final {
+    VLOG(3) << "Running operator " << this->Type();
     RuntimeInferShapeContext infer_shape_ctx(*this, scope);
     this->InferShape(&infer_shape_ctx);
 
-    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(*this, scope, dev_ctx));
+    ExecutionContext ctx(*this, scope, dev_ctx);
+
+    // check if op[type] has kernel registered.
+    auto& all_op_kernels = AllOpKernels();
+    auto kernels_iter = all_op_kernels.find(type_);
+    if (kernels_iter == all_op_kernels.end()) {
+      PADDLE_THROW("op[%s] has no kernel", type_);
+    }
+
+    // check if op[type] has a kernel registered for kernel_key
+    OpKernelMap& kernels = kernels_iter->second;
+    auto kernel_key = OpKernelKey(IndicateDataType(ctx), dev_ctx);
+    auto kernel_iter = kernels.find(kernel_key);
+
+    if (kernel_iter == kernels.end()) {
+      PADDLE_THROW("op[%s] has no kernel with kernel_key[%s]", type_,
+                   kernel_key);
+    }
+
+    kernel_iter->second->Compute(ctx);
   }
 
   static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -462,14 +631,49 @@ class OperatorWithKernel : public OperatorBase {
   }
 
   bool SupportGPU() const override {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = platform::GPUPlace();
-    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+    auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
+    return std::any_of(op_kernels.begin(), op_kernels.end(),
+                       [](OpKernelMap::const_reference kern_pair) {
+                         return platform::is_gpu_place(kern_pair.first.place_);
+                       });
   }
 
+  virtual void InferShape(InferShapeContext* ctx) const = 0;
+
  protected:
-  virtual void InferShape(InferShapeContextBase* ctx) const = 0;
+  // Indicate the kernel DataType from the input data. By default, all inputs
+  // must have the same data type.
+  virtual DataType IndicateDataType(const ExecutionContext& ctx) const {
+    auto& scope = ctx.scope();
+    int data_type = -1;
+    for (auto& input : this->inputs_) {
+      for (auto& ipt_name : input.second) {
+        auto* var = scope.FindVar(ipt_name);
+        if (var != nullptr) {
+          const Tensor* t = nullptr;
+          if (var->IsType<Tensor>()) {
+            t = &var->Get<Tensor>();
+          } else if (var->IsType<LoDTensor>()) {
+            t = &var->Get<LoDTensor>();
+          }
+          if (t != nullptr) {
+            int tmp = static_cast<int>(ToDataType(t->type()));
+            PADDLE_ENFORCE(tmp == data_type || data_type == -1,
+                           "DataType of Paddle Op must be same.");
+            data_type = tmp;
+          }
+        }
+      }
+    }
+    PADDLE_ENFORCE(data_type != -1, "DataType should be indicated by input");
+    return static_cast<DataType>(data_type);
+  }
 };
 
+std::ostream& operator<<(std::ostream& os,
+                         const OperatorWithKernel::OpKernelKey& kernel_key);
+
+extern bool OpSupportGPU(const std::string& op_type);
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index e1d8f040b837a6ad598351dae0427cc7c231e79f..c358f1a2b6ee3174b8c336ba1d212be7c5aa15c6 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -83,8 +83,8 @@ TEST(OperatorBase, all) {
   paddle::platform::CPUDeviceContext device_context;
   paddle::framework::Scope scope;
 
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  scope.NewVar("OUT1");
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
+  scope.Var("OUT1");
   ASSERT_EQ(paddle::framework::op_run_num, 0);
   op->Run(scope, device_context);
   ASSERT_EQ(paddle::framework::op_run_num, 1);
@@ -113,11 +113,14 @@ class OpWithKernelTest : public OperatorWithKernel {
   using OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {}
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+  DataType IndicateDataType(const ExecutionContext& ctx) const override {
+    return DataType::FP32;
+  }
 };
 
 template <typename T1, typename T2>
-class CPUKernelTest : public OpKernel {
+class CPUKernelTest : public OpKernel<float> {
  public:
   void Compute(const ExecutionContext& ctx) const {
     std::cout << "this is cpu kernel" << std::endl;
@@ -144,7 +147,7 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker
   }
 };
 
-class CPUKernalMultiInputsTest : public OpKernel {
+class CPUKernalMultiInputsTest : public OpKernel<float> {
  public:
   void Compute(const ExecutionContext& ctx) const {
     auto xs = ctx.op().Inputs("xs");
@@ -205,7 +208,7 @@ TEST(OpKernel, all) {
   paddle::platform::CPUDeviceContext cpu_device_context;
   paddle::framework::Scope scope;
 
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
   op->Run(scope, cpu_device_context);
   ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
@@ -234,14 +237,14 @@ TEST(OpKernel, multi_inputs) {
 
   paddle::platform::CPUDeviceContext cpu_device_context;
   paddle::framework::Scope scope;
-  scope.NewVar("x0")->GetMutable<Tensor>();
-  scope.NewVar("x1")->GetMutable<Tensor>();
-  scope.NewVar("x2")->GetMutable<Tensor>();
-  scope.NewVar("k0")->GetMutable<Tensor>();
-  scope.NewVar("y0")->GetMutable<Tensor>();
-  scope.NewVar("y1")->GetMutable<Tensor>();
-
-  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  scope.Var("x0")->GetMutable<Tensor>();
+  scope.Var("x1")->GetMutable<Tensor>();
+  scope.Var("x2")->GetMutable<Tensor>();
+  scope.Var("k0")->GetMutable<Tensor>();
+  scope.Var("y0")->GetMutable<Tensor>();
+  scope.Var("y1")->GetMutable<Tensor>();
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
   op->Run(scope, cpu_device_context);
 }
 
diff --git a/paddle/framework/program_desc.cc b/paddle/framework/program_desc.cc
index e89f9a46d587b6378aa3be92306c5680093e1926..8e99bba81117c9cc50227122527d6ab9a421c251 100644
--- a/paddle/framework/program_desc.cc
+++ b/paddle/framework/program_desc.cc
@@ -18,42 +18,34 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-using ProgDescMap =
-    std::unordered_map<ProgramDesc *, std::unique_ptr<ProgramDescBind>>;
-static ProgDescMap *g_bind_map = nullptr;
-
-ProgramDescBind &ProgramDescBind::Instance(ProgramDesc *prog) {
-  if (g_bind_map == nullptr) {
-    g_bind_map = new ProgDescMap();
-  }
-  auto &map = *g_bind_map;
-  auto &ptr = map[prog];
-
-  if (ptr == nullptr) {
-    ptr.reset(new ProgramDescBind(prog));
-  }
-  return *ptr;
-}
-
 BlockDescBind *ProgramDescBind::AppendBlock(const BlockDescBind &parent) {
-  auto *b = prog_->add_blocks();
+  auto *b = prog_.add_blocks();
   b->set_parent_idx(parent.ID());
-  b->set_idx(prog_->blocks_size() - 1);
+  b->set_idx(prog_.blocks_size() - 1);
   blocks_.emplace_back(new BlockDescBind(this, b));
   return blocks_.back().get();
 }
 
 ProgramDesc *ProgramDescBind::Proto() {
   for (auto &block : blocks_) {
-    block->Sync();
+    block->Flush();
   }
-  return prog_;
+  return &prog_;
+}
+
+ProgramDescBind::ProgramDescBind() {
+  auto *block = prog_.mutable_blocks()->Add();
+  block->set_idx(kRootBlockIndex);
+  block->set_parent_idx(kNoneBlockIndex);
+  blocks_.emplace_back(new BlockDescBind(this, block));
 }
 
-ProgramDescBind::ProgramDescBind(ProgramDesc *prog) {
-  prog_ = prog;
-  for (auto &block : *prog->mutable_blocks()) {
-    blocks_.emplace_back(new BlockDescBind(this, &block));
+ProgramDescBind::ProgramDescBind(const ProgramDescBind &o) {
+  prog_ = o.prog_;
+
+  for (int i = 0; i < prog_.blocks_size(); ++i) {
+    auto *block = prog_.mutable_blocks(i);
+    blocks_.emplace_back(new BlockDescBind(*o.blocks_[i], block, this));
   }
 }
 }  // namespace framework
diff --git a/paddle/framework/program_desc.h b/paddle/framework/program_desc.h
index 06ffcd4b15078f62ea8b7a3714e73de799530785..dc4cd7cc735b5e4e3466d9b82dc5eb8647c80ef9 100644
--- a/paddle/framework/program_desc.h
+++ b/paddle/framework/program_desc.h
@@ -14,8 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <vector>
 #include "paddle/framework/framework.pb.h"
+#include "paddle/framework/proto_desc.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -24,26 +27,20 @@ class BlockDescBind;
 
 class ProgramDescBind {
  public:
-  static ProgramDescBind &Instance(ProgramDesc *prog);
+  ProgramDescBind();
 
-  ProgramDescBind(const ProgramDescBind &o) = delete;
-  ProgramDescBind &operator=(const ProgramDescBind &o) = delete;
+  ProgramDescBind(const ProgramDescBind &o);
 
   BlockDescBind *AppendBlock(const BlockDescBind &parent);
 
   BlockDescBind *Block(size_t idx) { return blocks_[idx].get(); }
 
-  std::string DebugString() { return Proto()->DebugString(); }
-
   size_t Size() const { return blocks_.size(); }
 
   ProgramDesc *Proto();
 
  private:
-  explicit ProgramDescBind(ProgramDesc *prog);
-
-  // Not owned
-  ProgramDesc *prog_;
+  ProgramDesc prog_;
 
   std::vector<std::unique_ptr<BlockDescBind>> blocks_;
 };
diff --git a/paddle/framework/program_desc_test.cc b/paddle/framework/program_desc_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9709a2d3f1d9e0be2bda1e8e9e7835ca49141b1
--- /dev/null
+++ b/paddle/framework/program_desc_test.cc
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/program_desc.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/block_desc.h"
+
+namespace paddle {
+namespace framework {
+TEST(ProgramDesc, copy_ctor) {
+  ProgramDescBind program;
+  auto* global_block = program.Block(0);
+  auto* x = global_block->Var("X");
+  x->SetType(VarDesc_VarType_LOD_TENSOR);
+  x->SetLoDLevel(0);
+  x->SetDataType(FP32);
+  x->SetShape({1000, 784});
+
+  auto* y = global_block->Var("Y");
+  y->SetType(VarDesc_VarType_LOD_TENSOR);
+  y->SetLoDLevel(0);
+  y->SetDataType(FP32);
+  y->SetShape({784, 100});
+
+  auto* op = global_block->AppendOp();
+  op->SetType("mul");
+  op->SetInput("X", {x->Name()});
+  op->SetInput("Y", {y->Name()});
+
+  auto* out = global_block->Var("Out");
+  out->SetType(VarDesc_VarType_LOD_TENSOR);
+  op->SetOutput("Y", {out->Name()});
+
+  ProgramDescBind program_copy(program);
+
+  auto* global_block_copy = program_copy.Block(0);
+  ASSERT_NE(global_block, global_block_copy);
+
+  auto assert_same_var = [&](const std::string& name, VarDescBind* var_before) {
+    ASSERT_TRUE(global_block_copy->HasVar(name));
+    auto* copy = global_block_copy->Var(name);
+    ASSERT_NE(copy, var_before);
+    ASSERT_EQ(copy->Name(), var_before->Name());
+    ASSERT_EQ(copy->GetType(), var_before->GetType());
+    ASSERT_EQ(copy->Shape(), var_before->Shape());
+    ASSERT_EQ(copy->Proto()->SerializeAsString(),
+              var_before->Proto()->SerializeAsString());
+  };
+
+  ASSERT_EQ(global_block->LocalVarNames(), global_block_copy->LocalVarNames());
+  ASSERT_EQ(3UL, global_block_copy->LocalVarNames().size());
+  assert_same_var("X", x);
+  assert_same_var("Y", y);
+  assert_same_var("Out", out);
+
+  for (size_t i = 0; i < global_block->OpSize(); ++i) {
+    auto op_origin = global_block->Op(i);
+    auto op_copy = global_block_copy->Op(i);
+
+    ASSERT_EQ(op_origin->Type(), op_copy->Type());
+    ASSERT_EQ(op_origin->Inputs(), op_copy->Inputs());
+    ASSERT_EQ(op_origin->Outputs(), op_copy->Outputs());
+
+    ASSERT_EQ(op_copy->Proto()->SerializeAsString(),
+              op_origin->Proto()->SerializeAsString());
+  }
+
+  // The blocks' serialized protos are not compared, because the order of vars
+  // may legitimately differ between the original and the copy.
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/proto_desc.h b/paddle/framework/proto_desc.h
new file mode 100644
index 0000000000000000000000000000000000000000..fa01224fefce50eb3688ff407f0a7c948c5b7cfc
--- /dev/null
+++ b/paddle/framework/proto_desc.h
@@ -0,0 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+
+// The index of the first block in a program, also called the root block.
+constexpr int kRootBlockIndex = 0;
+// The parent index of the root block; no block with this index exists.
+constexpr int kNoneBlockIndex = -1;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/prune.cc b/paddle/framework/prune.cc
new file mode 100644
index 0000000000000000000000000000000000000000..95833692925af4477fe575d6bd908a2ce7653c1b
--- /dev/null
+++ b/paddle/framework/prune.cc
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/prune.h"
+
+#include <algorithm>
+#include <set>
+#include <string>
+#include <vector>
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace framework {
+
+const std::string kFeedOpType = "feed";
+const std::string kFetchOpType = "fetch";
+
+// Tells whether `op_desc` writes to at least one variable whose name is in
+// `dependent_vars`, scanning every argument of every output slot.
+bool HasDependentVar(const OpDesc& op_desc,
+                     const std::set<std::string>& dependent_vars) {
+  for (const auto& output_slot : op_desc.outputs()) {
+    for (const auto& arg_name : output_slot.arguments()) {
+      if (dependent_vars.find(arg_name) != dependent_vars.end()) return true;
+    }
+  }
+  return false;
+}
+
+// Reports whether `op_desc` is explicitly marked as a prune target; an op
+// whose `is_target` field is unset is treated as a non-target.
+bool IsTarget(const OpDesc& op_desc) {
+  const bool marked = op_desc.has_is_target() && op_desc.is_target();
+  return marked;
+}
+
+void prune_impl(const ProgramDesc& input, ProgramDesc& output, int block_id) {
+  // TODO(tonyyang-svail):
+  //    - will change to use multiple blocks for RNN op and Cond Op
+  // Prunes block `block_id`: keeps target ops and the ops they depend on.
+  auto& block = input.blocks(block_id);
+  auto& ops = block.ops();
+  // Check that no feed op appears after the first non-feed op.
+  bool expect_feed = true;
+  for (auto& op_desc : ops) {
+    PADDLE_ENFORCE(op_desc.type() != kFeedOpType || expect_feed,
+                   "All FeedOps are at the beginning of the ProgramDesc");
+    expect_feed = (op_desc.type() == kFeedOpType);
+  }
+  // Check, walking backwards, that no fetch op precedes a non-fetch op.
+  bool expect_fetch = true;
+  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
+    auto& op_desc = *op_iter;
+    PADDLE_ENFORCE(op_desc.type() != kFetchOpType || expect_fetch,
+                   "All FetchOps must at the end of the ProgramDesc");
+    expect_fetch = (op_desc.type() == kFetchOpType);
+  }
+  // Walk backwards: keep targets and ops that write a var a kept op reads.
+  std::set<std::string> dependent_vars;
+  std::vector<bool> should_run;
+  for (auto op_iter = ops.rbegin(); op_iter != ops.rend(); ++op_iter) {
+    auto& op_desc = *op_iter;
+
+    if (IsTarget(op_desc) || HasDependentVar(op_desc, dependent_vars)) {
+      // insert its input to the dependency graph
+      for (auto& var : op_desc.inputs()) {
+        for (auto& argu : var.arguments()) {
+          dependent_vars.insert(argu);
+        }
+      }
+
+      should_run.push_back(true);
+    } else {
+      should_run.push_back(false);
+    }
+  }
+
+  // since we are traversing the ProgramDesc in reverse order
+  // we reverse the should_run vector
+  std::reverse(should_run.begin(), should_run.end());
+  // Copy all, then refill this block's ops. `input` must not alias `output`.
+  output = input;
+  auto* op_field = output.mutable_blocks(block_id)->mutable_ops();
+  op_field->Clear();
+  for (size_t i = 0; i < should_run.size(); ++i) {
+    if (should_run[i]) {
+      *op_field->Add() = input.blocks(block_id).ops(i);
+    }
+  }
+}
+
+void Prune(const ProgramDesc& input, ProgramDesc& output) {
+  prune_impl(input, output, 0);  // 0: index of the block that gets pruned
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/prune.h
similarity index 83%
rename from paddle/framework/grad_op_builder.h
rename to paddle/framework/prune.h
index 998f8ebbb5f2f4fb8b7e938b5916afd0f8a7930d..9414ac64f9491c07aabb216a4c81dfe6e78e8043 100644
--- a/paddle/framework/grad_op_builder.h
+++ b/paddle/framework/prune.h
@@ -14,12 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/framework/operator.h"
+#include "paddle/framework/framework.pb.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
-OperatorBase* BuildGradOp(const OperatorBase* op);
+void Prune(const ProgramDesc& input, ProgramDesc& output);
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/prune_test.cc b/paddle/framework/prune_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3ab4b43d9256af5880083b00df446c451e3f598b
--- /dev/null
+++ b/paddle/framework/prune_test.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/prune.h"
+
+#include "paddle/framework/attribute.h"
+#include "paddle/framework/operator.h"
+#include "paddle/operators/net_op.h"
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/program_desc.h"
+
+#include <gtest/gtest.h>
+
+namespace f = paddle::framework;
+namespace ops = paddle::operators;
+
+// Test helper: appends a `type` op with the given slots/attrs to `block`.
+void AddOp(const std::string &type, const f::VariableNameMap &inputs,
+           const f::VariableNameMap &outputs, f::AttributeMap attrs,
+           paddle::framework::BlockDescBind *block) {
+  // Register every output name with the block and mark it as FP32.
+  for (const auto &slot : outputs) {
+    for (const auto &var_name : slot.second) {
+      block->Var(var_name)->SetDataType(f::DataType::FP32);
+    }
+  }
+
+  // Describe the operator itself.
+  auto *new_op = block->AppendOp();
+  new_op->SetType(type);
+  for (const auto &slot : inputs) {
+    new_op->SetInput(slot.first, slot.second);
+  }
+  for (const auto &slot : outputs) {
+    new_op->SetOutput(slot.first, slot.second);
+  }
+  new_op->SetAttrMap(attrs);
+}
+
+TEST(Prune, one_operator) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+
+  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block);
+
+  f::ProgramDesc *pdesc = program.Proto();
+  f::ProgramDesc pruned;
+  // No op is marked as a target yet, so everything is pruned away.
+  Prune(*pdesc, pruned);
+  ASSERT_EQ(pruned.blocks(0).ops_size(), 0);
+  // Once the single op is a target it must survive pruning.
+  pdesc->mutable_blocks(0)->mutable_ops(0)->set_is_target(true);
+  Prune(*pdesc, pruned);
+  ASSERT_EQ(pruned.blocks(0).ops_size(), 1);
+}
+
+TEST(Prune, forward) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+
+  AddOp("one_one", {{"input", {"a"}}}, {{"output", {"b"}}}, {}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"c"}}}, {}, block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"d"}}}, {}, block);
+  AddOp("one_one", {{"input", {"d"}}}, {{"output", {"e"}}}, {}, block);
+
+  f::ProgramDesc *pdesc = program.Proto();
+  // Ops form a chain a->b->c->d->e; once op i is a target, ops 0..i are kept.
+  for (int i = 0; i < pdesc->blocks(0).ops_size(); ++i) {
+    f::ProgramDesc pruned;
+    pdesc->mutable_blocks(0)->mutable_ops(i)->set_is_target(true);
+    Prune(*pdesc, pruned);
+    ASSERT_EQ(pruned.blocks(0).ops_size(), i + 1);
+  }
+}
+
+TEST(Prune, multi_input_op) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+
+  AddOp("one_one", {{"input", {"a0"}}}, {{"output", {"b0"}}}, {}, block);
+  AddOp("one_one", {{"input", {"a1"}}}, {{"output", {"b1"}}}, {}, block);
+  AddOp("one_one", {{"input", {"a2"}}}, {{"output", {"b2"}}}, {}, block);
+  AddOp("three_one", {{"input", {"b0", "b1", "b2"}}}, {{"output", {"c"}}}, {},
+        block);
+  // The target reads b0, b1 and b2, so all three producers are kept as well.
+  f::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(3)->set_is_target(true);
+
+  f::ProgramDesc pruned;
+  Prune(*pdesc, pruned);
+  ASSERT_EQ(pruned.blocks(0).ops_size(), 4);
+}
+
+TEST(Prune, multi_output_op) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block);
+  // Only the reader of `c` is a target: it and the producer of `c` remain.
+  f::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
+
+  f::ProgramDesc pruned;
+  Prune(*pdesc, pruned);
+  ASSERT_EQ(pruned.blocks(0).ops_size(), 2);
+}
+
+TEST(Prune, multi_target) {
+  f::ProgramDescBind program;
+  f::BlockDescBind *block = program.Block(0);
+
+  AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}}, {}, block);
+  AddOp("one_one", {{"input", {"b"}}}, {{"output", {"b1"}}}, {}, block);
+  AddOp("one_one", {{"input", {"c"}}}, {{"output", {"c1"}}}, {}, block);
+  // Both readers are targets, so they and their shared producer all remain.
+  f::ProgramDesc *pdesc = program.Proto();
+  pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
+  pdesc->mutable_blocks(0)->mutable_ops(2)->set_is_target(true);
+
+  f::ProgramDesc pruned;
+  Prune(*pdesc, pruned);
+  ASSERT_EQ(pruned.blocks(0).ops_size(), 3);
+}
diff --git a/paddle/framework/saver.proto b/paddle/framework/saver.proto
new file mode 100644
index 0000000000000000000000000000000000000000..90a191a6a79250761489b68916b1fa09116830f2
--- /dev/null
+++ b/paddle/framework/saver.proto
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax = "proto2";
+option optimize_for = LITE_RUNTIME;
+package paddle.framework;
+
+import "framework.proto";
+
+/**
+ * This file contains the information necessary for models, checkpoints,
+ * etc.
+ */
+
+message LoDInfo { repeated int64 level = 1; }
+
+/**
+ * Saves the LoDTensorDesc information through LoDTensorProto; its data memory
+ * is copied to a C buffer immediately. See model_format.md for details.
+ */
+
+message LoDTensorProto {
+  optional DataType data_type = 1;
+  repeated int64 dims = 2; // [UNK, 640, 480] is saved as [-1, 640, 480]
+  repeated LoDInfo levels = 3;
+  optional int32 lod_level = 4 [ default = 0 ];
+  optional int32 version = 5;
+}
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 080b4ac621c1b8c0d4b4e7b26f394cf2be263894..19e25fba05f2f1c959da32c950320d3a44d5109d 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/scope.h"
+
+#include <memory>  // for unique_ptr
+#include <mutex>   // for call_once
 #include "paddle/string/printf.h"
 
 namespace paddle {
@@ -28,7 +31,7 @@ Scope& Scope::NewScope() const {
   return *kids_.back();
 }
 
-Variable* Scope::NewVar(const std::string& name) {
+Variable* Scope::Var(const std::string& name) {
   auto iter = vars_.find(name);
   if (iter != vars_.end()) {
     return iter->second;
@@ -39,8 +42,8 @@ Variable* Scope::NewVar(const std::string& name) {
   return v;
 }
 
-Variable* Scope::NewVar() {
-  return NewVar(string::Sprintf("%p.%d", this, vars_.size()));
+Variable* Scope::Var() {
+  return Var(string::Sprintf("%p.%d", this, vars_.size()));
 }
 
 Variable* Scope::FindVar(const std::string& name) const {
@@ -62,5 +65,29 @@ void Scope::DropKids() {
   kids_.clear();
 }
 
+std::vector<std::string> Scope::GetAllNames(bool recursive) const {
+  // Names of this scope's vars, preceded (if `recursive`) by those of its
+  // direct kids. Reserve instead of pre-sizing: no empty placeholder names.
+  std::vector<std::string> known_vars;
+  known_vars.reserve(vars_.size());
+  if (recursive) {
+    for (auto& kid : kids_) {
+      auto kid_vars = kid->GetAllNames();
+      known_vars.insert(known_vars.end(), kid_vars.begin(), kid_vars.end());
+    }
+  }
+  for (auto& p : vars_) {
+    known_vars.emplace_back(p.first);
+  }
+  return known_vars;
+}
+
+void Scope::DeleteScope(Scope* scope) {  // erases and frees a direct kid scope
+  auto kid_pos = std::find(kids_.begin(), kids_.end(), scope);
+  PADDLE_ENFORCE(kid_pos != kids_.end(), "Cannot find %p as kid scope", scope);
+  kids_.erase(kid_pos);
+  delete scope;
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index c93b03e48130afe9568089b6a7586c4185d1d5b4..ac334da5ef0c8ad563b6be5413df33f5d0bdbcf8 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -17,8 +17,10 @@ limitations under the License. */
 #include <list>
 #include <string>
 #include <unordered_map>
+#include <vector>
 
 #include "paddle/framework/variable.h"
+#include "paddle/platform/macros.h"
 
 namespace paddle {
 namespace framework {
@@ -38,21 +40,16 @@ class Scope {
   Scope() {}
   ~Scope();
 
-  // Disable Copy, Assign, Move.
-  Scope(const Scope& other) = delete;
-  Scope& operator=(const Scope& other) = delete;
-  Scope(Scope&& other) = delete;
-
   /// Create a sub-scope. Returns a reference other than a pointer so
   /// to prevent from manual deletion.
   /// Mark it to const because that new kid scope cannot change parent scope.
   Scope& NewScope() const;
 
   /// Create a variable with given name if it doesn't exist.
-  Variable* NewVar(const std::string& name);
+  Variable* Var(const std::string& name);
 
   /// Create a variable with a scope-unique name.
-  Variable* NewVar();
+  Variable* Var();
 
   /// Find a variable in the scope or any of its ancestors.  Returns
   /// nullptr if cannot find.
@@ -63,9 +60,14 @@ class Scope {
   /// Find the scope or an ancestor scope that contains the given variable.
   const Scope* FindScope(const Variable* var) const;
 
+  void DeleteScope(Scope* scope);
+
   /// Drop all kids scopes belonged to this scope.
   void DropKids();
 
+  // Enumerate the names of all variables this scope currently contains.
+  std::vector<std::string> GetAllNames(bool recursive = false) const;
+
  private:
   // Call Scope::NewScope for a sub-scope.
   explicit Scope(Scope const* parent) : parent_(parent) {}
@@ -73,7 +75,8 @@ class Scope {
   std::unordered_map<std::string, Variable*> vars_;
   mutable std::list<Scope*> kids_;
   Scope const* parent_{nullptr};
-};
 
+  DISABLE_COPY_AND_ASSIGN(Scope);
+};
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index 9d51e355b0f6336d2f875ff2d77266b261baf5ac..f738d5ba9ecda57ea25bb5f84057d1d0106eef66 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/scope.h"
+#include "glog/logging.h"
 #include "gtest/gtest.h"
 
 using paddle::framework::Scope;
@@ -23,8 +24,8 @@ TEST(Scope, VarsShadowing) {
   Scope& ss1 = s.NewScope();
   Scope& ss2 = s.NewScope();
 
-  Variable* v0 = s.NewVar("a");
-  Variable* v1 = ss1.NewVar("a");
+  Variable* v0 = s.Var("a");
+  Variable* v1 = ss1.Var("a");
 
   EXPECT_NE(v0, v1);
 
@@ -40,7 +41,7 @@ TEST(Scope, FindVar) {
   EXPECT_EQ(nullptr, s.FindVar("a"));
   EXPECT_EQ(nullptr, ss.FindVar("a"));
 
-  ss.NewVar("a");
+  ss.Var("a");
 
   EXPECT_EQ(nullptr, s.FindVar("a"));
   EXPECT_NE(nullptr, ss.FindVar("a"));
@@ -49,8 +50,22 @@ TEST(Scope, FindVar) {
 TEST(Scope, FindScope) {
   Scope s;
   Scope& ss = s.NewScope();
-  Variable* v = s.NewVar("a");
+  Variable* v = s.Var("a");
 
   EXPECT_EQ(&s, s.FindScope(v));
   EXPECT_EQ(&s, ss.FindScope(v));
 }
+
+TEST(Scope, GetAllNames) {
+  Scope s;
+  Variable* v = s.Var("a");
+  EXPECT_EQ(&s, s.FindScope(v));
+  // Join every returned name; the scope holds just the variable "a".
+  std::vector<std::string> ans = s.GetAllNames();
+  std::string str;
+  for (auto& var : ans) {
+    str += var;
+  }
+  // NOTE(review): concatenation cannot detect empty-string entries in `ans`.
+  EXPECT_STREQ("a", str.c_str());
+}
diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c74459c9dd7006a24615b1d6df041583088fb25c
--- /dev/null
+++ b/paddle/framework/selected_rows.cc
@@ -0,0 +1,16 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/selected_rows.h"
+
+namespace paddle {
+namespace framework {}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd9078137132669c7265ce3972f2c6df996fa366
--- /dev/null
+++ b/paddle/framework/selected_rows.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {
+
+// Holds the row indices `rows_` selected out of a `height_`-row tensor,
+// together with a `value_` tensor that stores the data.
+class SelectedRows {
+ public:
+  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
+      : rows_(rows), value_(new Tensor()), height_(height) {}
+  // Builds an instance with no rows, height 0 and a default-constructed value.
+  SelectedRows() { value_.reset(new Tensor()); }
+
+  platform::Place place() const { return value_->place(); }
+
+  const Tensor& value() const { return *value_; }
+
+  Tensor* mutable_value() { return value_.get(); }
+
+  int64_t height() const { return height_; }
+
+  void set_height(int64_t height) { height_ = height; }
+
+  const Vector<int64_t>& rows() const { return rows_; }
+
+  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
+  // Dims of value() with the first dimension replaced by height().
+  DDim GetCompleteDims() const {
+    std::vector<int64_t> dims = vectorize(value_->dims());
+    dims[0] = height_;
+    return make_ddim(dims);
+  }
+
+ private:
+  // Notice: rows can be duplicated. We can have {0, 4, 7, 0, 5, 7, 9} here.
+  // SelectedRows are simply concatenated when added together. Duplicate rows
+  // are only handled when a SelectedRows is added to a Tensor.
+  Vector<int64_t> rows_;
+  std::unique_ptr<Tensor> value_{nullptr};
+  int64_t height_{0};  // zero by default so height() is never indeterminate
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ee13a65d72e44693573397bb686b355effb2227
--- /dev/null
+++ b/paddle/framework/selected_rows_test.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/selected_rows.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+
+class SelectedRowsTester : public ::testing::Test {
+ public:
+  virtual void SetUp() override {
+    std::vector<int64_t> rows{0, 4, 7};
+    int64_t height = 10;
+    int64_t row_numel = 100;
+    selected_rows_.reset(new SelectedRows(rows, height));
+    // Give the value tensor float data of dims {rows.size(), row_numel}.
+    Tensor* value = selected_rows_->mutable_value();
+    value->mutable_data<float>(
+        make_ddim({static_cast<int64_t>(rows.size()), row_numel}), place_);
+  }
+
+ protected:
+  platform::CPUPlace place_;
+  std::unique_ptr<SelectedRows> selected_rows_{nullptr};
+};
+
+TEST_F(SelectedRowsTester, height) { ASSERT_EQ(selected_rows_->height(), 10); }
+
+TEST_F(SelectedRowsTester, dims) {
+  ASSERT_EQ(selected_rows_->value().dims(), make_ddim({3, 100}));
+}
+
+TEST_F(SelectedRowsTester, complete_dims) {
+  ASSERT_EQ(selected_rows_->GetCompleteDims(), make_ddim({10, 100}));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h
index b07fc788124413f728c713027609d9d2d1c39538..b93f980cf6d279d18388b9637a2ff45d797ca78e 100644
--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -19,11 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class InferShapeContextBase {
+class InferShapeContext {
  public:
-  virtual ~InferShapeContextBase() {}
+  virtual ~InferShapeContext() {}
   virtual bool HasInput(const std::string &name) const = 0;
   virtual bool HasOutput(const std::string &name) const = 0;
+
+  virtual bool HasInputs(const std::string &name) const = 0;
+  virtual bool HasOutputs(const std::string &name) const = 0;
+
   virtual framework::DDim GetInputDim(const std::string &name) const = 0;
   std::vector<framework::DDim> GetInputsDim(const std::string &name) const {
     const std::vector<std::string> &names = Inputs(name);
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index f040c09c089ec75c9773d752685be5e232e8f4b7..e31472327dbca45dc12ea2c9e494beddd36860dc 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -29,20 +29,12 @@ limitations under the License. */
 
 namespace paddle {
 
-namespace pybind {
-namespace details {
-template <bool less, size_t i, typename... args>
-struct CastToPyBufferImpl;
-}
-}  // namespace pybind
-
 namespace framework {
 
+class LoDTensor;
+
 class Tensor {
  public:
-  template <bool less, size_t i, typename... args>
-  friend struct pybind::details::CastToPyBufferImpl;
-
   template <typename T, size_t D, int MajorType, typename IndexType>
   friend struct EigenTensor;
 
@@ -70,6 +62,10 @@ class Tensor {
   template <typename T>
   inline T* mutable_data(platform::Place place);
 
+  inline void* mutable_data(platform::Place place, std::type_index type);
+
+  inline void* mutable_data(platform::Place place);
+
   /**
    * @brief     Return a pointer to mutable memory block.
    *
@@ -91,19 +87,35 @@ class Tensor {
   inline Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
-  template <typename T>
   inline Tensor& ShareDataWith(const Tensor& src);
 
   /**
    * @brief   Copy the content of external tensor to a new place.
    *
-   * @param[in] src   The external tensor.
-   * @param[in] ctx   The device context contains place where to store.
+   * @param[in] src        The external tensor.
+   * @param[in] dst_place  The dst place.
+   * @param[in] ctx        The device context contains device resources.
    *
    * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
    */
+  // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647
+  // Remove `CopyFrom` and `CopyFromVector` from Tensor interface
+  // and make them global functions
+  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                       const platform::DeviceContext& ctx);
+
+  /**
+   * @brief   Copy the content of an external vector to a tensor.
+   *
+   * @param[in] src        The external vector.
+   * @param[in] ctx        The device context that contains device resources.
+   *
+   * @note    CopyFromVector assumes that the tensor has been resized
+   *          before it is invoked.
+   */
   template <typename T>
-  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);
+  inline void CopyFromVector(const std::vector<T>& src,
+                             const platform::DeviceContext& ctx);
 
   /**
    * @brief   Return the slice of the tensor.
@@ -111,7 +123,6 @@ class Tensor {
    * @param[in] begin_idx   The begin index of the slice.
    * @param[in] end_idx     The end index of the slice.
    */
-  template <typename T>
   inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
   platform::Place place() const {
@@ -119,30 +130,35 @@ class Tensor {
     return holder_->place();
   }
 
+  std::type_index type() const { return holder_->type(); }
+
  private:
-  template <typename T>
   inline void check_memory_size() const;
 
  private:
+  friend class LoDTensor;
+
   /**
    * @note    Placeholder hides type T, so it doesn't appear as a template
    *          parameter of Variable.
    */
   struct Placeholder {
-    virtual ~Placeholder() {}
+    virtual ~Placeholder() = default;
     virtual void* ptr() const = 0;
     virtual size_t size() const = 0;
     virtual std::type_index type() const = 0;
     virtual platform::Place place() const = 0;
+    virtual void set_type(std::type_index type) = 0;
   };
 
-  template <typename T, typename Place>
+  template <typename Place>
   struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size)
-        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               memory::PODDeleter<T, Place>(place)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
+               memory::PODDeleter<uint8_t, Place>(place)),
           place_(place),
-          size_(size) {
+          size_(size),
+          type_(type) {
       PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                               (is_cpu_place(place_) ? "CPU" : "GPU"));
     }
@@ -150,22 +166,31 @@ class Tensor {
     virtual size_t size() const { return size_; }
     virtual platform::Place place() const { return place_; }
     virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual std::type_index type() const { return std::type_index(typeid(T)); }
+    virtual std::type_index type() const { return type_; }
+    virtual void set_type(std::type_index type) { type_ = type; }
 
     /*! the pointer of memory block. */
-    std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
+    std::unique_ptr<uint8_t, memory::PODDeleter<uint8_t, Place>> ptr_;
 
     /*! the place of memory block. */
     platform::Place place_;
 
     /*! the size of memory block. */
     size_t size_;
+
+    /* the current type of memory */
+    std::type_index type_;
   };
 
   /*! holds the memory block if allocated. */
   std::shared_ptr<Placeholder> holder_;
 
-  /*! points to dimensions of memory block. */
+  /**
+   * @brief points to elements dimensions.
+   *
+   * @note dims_ do not indicate the memory block size.
+   */
+
   DDim dims_;
 
   /**
diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4c82c3638351c41df26503e2a26b5a4bb5822a67
--- /dev/null
+++ b/paddle/framework/tensor_array.cc
@@ -0,0 +1,292 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+
+
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_array.h"
+
+#include <glog/logging.h>
+#include <algorithm>
+#include <limits>
+
+namespace paddle {
+namespace framework {
+
+namespace detail {
+
+/*
+ * Offer an iterator over the length-sorted lod-tensor's top level. The top
+ * level of a lod-tensor stores batch-size of sequences, each top-level sequence
+ * may contains several lower-level sequences, sort top-level lod by the numbers
+ * of lower-level sequences in descending order, so that during RNN's running,
+ * the batch-size will keep decreasing, the short sentences will end at the tail
+ * of each batch.
+ *
+ * Let's take a simple lod-tensor for example
+ *
+ *   |(0)       |(1)        top-level has two instances
+ *   |||        |||||    lower-level
+ *
+ * sort by lower-level's length
+ *
+ *   |(1)       |(0)
+ *   |||||      |||
+ *
+ * when RNN runs, it gets 5 batches (equal to the number of elements the
+ * longest sequence has)
+ *
+ * |||||
+ * |||
+ *
+ * the first three batches have two elements each, the last two batches have
+ * one element each.
+ */
+struct DynamicBatchUnpacker {
+  using value_type = float;
+  // Builds the length-sorted `meta` for `level` of `source` on construction.
+  DynamicBatchUnpacker(const LoDTensor& source, size_t level,
+                       bool descend = true)
+      : source(&source), level(level) {
+    BuildLengthSortedMeta(descend);
+  }
+  // Assemble the batch for step `index` from the records selected by `meta`.
+  LoDTensor GetBatch(size_t index);
+  // One entry per sequence found at `level`, sorted by sequence length.
+  std::vector<DySeqMeta> meta;
+  // Address of the tensor passed to the constructor; only read through here.
+  LoDTensor const* source;
+  size_t level;
+
+ protected:
+  void BuildLengthSortedMeta(bool descend);
+};
+
+LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
+                           const std::vector<DySeqMeta>& meta, const LoD& lod,
+                           size_t level);
+
+std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) {
+  // Collect the source row each unfinished sequence contributes at this step.
+  // Finished sequences are skipped, not treated as the end of the scan, so
+  // meta sorted in ascending length order is handled as well.
+  std::vector<size_t> indice;
+  for (const auto& seq : meta) {
+    size_t id = seq.begin + batch_id;
+    if (id < seq.end) indice.push_back(id);
+  }
+  return indice;
+}
+
+}  // namespace detail
+
+const LoDTensor& TensorArray::Read(size_t index) const {
+  // A valid index is < MAX_SIZE; accepting MAX_SIZE would grow past the cap.
+  PADDLE_ENFORCE_LT(index, MAX_SIZE, "index[%d] too large", index);
+  // Reading past the end grows the (mutable) storage with empty tensors.
+  if (index >= size()) values_.resize(index + 1);
+  return values_[index];
+}
+
+void TensorArray::Write(size_t index, const LoDTensor& value) {
+  // A valid index is < MAX_SIZE; accepting MAX_SIZE would grow past the cap.
+  PADDLE_ENFORCE_LT(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) {
+    values_.resize(index + 1);
+  }
+  // Deep copy: shape and allocate the slot on the CPU, then copy the payload.
+  values_[index].Resize(value.dims());
+  values_[index].mutable_data<value_type>(platform::CPUPlace());
+  values_[index].CopyFrom(value, platform::CPUPlace(),
+                          platform::CPUDeviceContext());
+}
+
+void TensorArray::WriteShared(size_t index, const LoDTensor& value) {
+  // A valid index is < MAX_SIZE; accepting MAX_SIZE would grow past the cap.
+  PADDLE_ENFORCE_LT(index, MAX_SIZE, "index[%d] too large", index);
+  if (index >= size()) {
+    values_.resize(index + 1);
+  }
+  values_[index].ShareDataWith(value);  // alias value's memory, no copy
+}
+
+LoDTensor TensorArray::Pack(size_t level, const DySeqMetaBatch& meta,
+                            const LoD& lod) const {
+  return detail::PackDynamicBatch(values_, meta, lod, level);
+}
+
+DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level,
+                                   bool length_desend) {
+  detail::DynamicBatchUnpacker unpacker(source, level, length_desend);
+
+  // The longest sequence decides how many time-step batches are produced.
+  size_t step_count = 0;
+  for (const auto& seq : unpacker.meta) {
+    const size_t seq_length = seq.end - seq.begin;
+    if (seq_length > step_count) step_count = seq_length;
+  }
+
+  // Slot `step` of this array receives the batch assembled for that step.
+  for (size_t step = 0; step != step_count; ++step) {
+    Write(step, unpacker.GetBatch(step));
+  }
+
+  PADDLE_ENFORCE(!unpacker.meta.empty());
+  return unpacker.meta;
+}
+
+LoDTensor TensorArray::Stack() const {
+  LoDTensor stacked;
+  const size_t count = size();
+  if (count == 0) return stacked;
+
+  // Every value must share the shape of the first one.
+  // TODO(superjom) check the same dtypes
+  const auto& first_dims = values_.front().dims();
+  for (size_t pos = 1; pos != count; ++pos) {
+    PADDLE_ENFORCE_EQ(first_dims, values_[pos].dims());
+  }
+
+  // The result gains a leading dimension equal to the number of values.
+  auto stacked_dims = vectorize(first_dims);
+  stacked_dims.insert(stacked_dims.begin(), count);
+  stacked.Resize(make_ddim(stacked_dims));
+  stacked.mutable_data<value_type>(platform::CPUPlace());
+
+  for (size_t pos = 0; pos != count; ++pos) {
+    auto row = stacked.Slice(pos, pos + 1);
+    row.CopyFrom(Read(pos), platform::CPUPlace(),
+                 platform::CPUDeviceContext());
+  }
+  return stacked;
+}
+
+void TensorArray::Unstack(const LoDTensor& source) const {
+  Unstack(source, /*data_shared=*/false);
+}
+
+void TensorArray::UnstackShared(const LoDTensor& source) const {
+  Unstack(source, /*data_shared=*/true);
+}
+
+void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const {
+  // Row `i` of `source` becomes value `i`, either aliased or deep-copied.
+  const size_t count = source.dims()[0];
+  const DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size());
+  PADDLE_ENFORCE_GT(count, 0,
+                    "source should have some data to be unstacked");
+  values_.resize(count);
+
+  for (size_t i = 0; i != count; ++i) {
+    const auto row = source.Slice(i, i + 1);
+    auto& slot = values_[i];
+    if (!data_shared) {
+      // deep copy of the row into the slot
+      slot.Resize(value_dims);
+      slot.CopyFrom(row, platform::CPUPlace(),
+                    platform::CPUDeviceContext());
+      continue;
+    }
+    // alias the row's memory instead of copying it
+    slot.ShareDataWith(row);
+  }
+}
+
+size_t TensorArray::size() const { return values_.size(); }
+
+namespace detail {
+
+void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) {
+  PADDLE_ENFORCE(meta.empty(), "duplicate build meta");
+  // Collect [begin, end) and the original position of every sequence at
+  // `level`; `seq_id + 1 < size()` cannot underflow on an empty offset table.
+  auto lod = SliceLevels(source->lod(), level, level + 1)[0];
+  for (size_t seq_id = 0; seq_id + 1 < lod.size(); seq_id++) {
+    meta.emplace_back(lod[seq_id], lod[seq_id + 1], seq_id);
+  }
+  PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty");
+
+  // Sort by length. The comparator must be a strict weak ordering, so the
+  // ascending case uses `<`; negating `>` gives `<=`, which reports equal
+  // lengths as "less" in both directions and is undefined for std::sort.
+  std::sort(meta.begin(), meta.end(),
+            [descend](const DySeqMeta& a, const DySeqMeta& b) {
+              const size_t a_len = a.end - a.begin, b_len = b.end - b.begin;
+              return descend ? a_len > b_len : a_len < b_len;
+            });
+}
+
+LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) {
+  PADDLE_ENFORCE(!meta.empty(), "should build meta first");
+
+  // Rows of `source` that make up the batch for step `index`.
+  const auto rows = detail::GenDyBatchIndice(meta, index);
+  PADDLE_ENFORCE(!rows.empty(), "invalid batch at %d", index);
+
+  // The batch keeps the per-record shape and has one row per selected record.
+  auto batch_dims =
+      vectorize(slice_ddim(source->dims(), 1, source->dims().size()));
+  batch_dims.insert(batch_dims.begin(), rows.size());
+  LoDTensor batch;
+  batch.Resize(make_ddim(batch_dims));
+  batch.mutable_data<value_type>(platform::CPUPlace());
+
+  for (size_t dst = 0; dst != rows.size(); ++dst) {
+    const size_t src = rows[dst];
+    auto target = batch.Slice(dst, dst + 1);
+    target.CopyFrom(source->Slice(src, src + 1), platform::CPUPlace(),
+                    platform::CPUDeviceContext());
+  }
+
+  return batch;
+}
+
+// TODO(supejom) to cache lod if reasonable
+LoDTensor PackDynamicBatch(const std::vector<LoDTensor>& source,
+                           const std::vector<DySeqMeta>& meta, const LoD& lod,
+                           size_t level) {
+  PADDLE_ENFORCE(!source.empty());
+  PADDLE_ENFORCE(!meta.empty());
+  // `lod[level]` is read below, so the requested level must exist.
+  PADDLE_ENFORCE_LT(level, lod.size(), "level is out of range of lod");
+
+  LoDTensor result;
+  // init result space: one row per element of the chosen level
+  auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size());
+  auto record_dims_vec = vectorize(record_dims);
+  auto height = lod[level].back();
+  record_dims_vec.insert(record_dims_vec.begin(), height);
+  result.Resize(make_ddim(record_dims_vec));
+  result.mutable_data<float>(platform::CPUPlace());
+
+  for (size_t batch_id = 0; batch_id < source.size(); batch_id++) {
+    // A batch holds one row per sequence still running at this step, in meta
+    // order; count those rows instead of assuming they form a prefix of meta.
+    size_t row = 0;
+    for (const auto& seq_meta : meta) {
+      auto index = seq_meta.begin + batch_id;
+      if (index >= seq_meta.end) continue;
+      auto target = result.Slice(index, index + 1);
+      target.CopyFrom(source[batch_id].Slice(row, row + 1),
+                      platform::CPUPlace(), platform::CPUDeviceContext());
+      ++row;
+    }
+  }
+
+  result.set_lod(lod);
+  return result;
+}
+
+}  // namespace detail
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h
new file mode 100644
index 0000000000000000000000000000000000000000..046ecb5221b7ed9d88e5017348ee8fcde23c7677
--- /dev/null
+++ b/paddle/framework/tensor_array.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <vector>
+
+#include "paddle/framework/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+ * DySeqMeta stores the [begin, end) element range of one sequence and its
+ * original index. It is used after a lod-tensor is re-assembled; its info can
+ * be used to recover the order in the original lod-tensor.
+ */
+struct DySeqMeta {
+  DySeqMeta(size_t begin, size_t end, size_t ori_idx)
+      : begin(begin), end(end), ori_idx(ori_idx) {}
+
+  size_t begin;  // offset of the first element (included)
+  size_t end;  // not included
+  size_t ori_idx;  // position of the sequence before any re-ordering
+};
+
+using DySeqMetaBatch = std::vector<DySeqMeta>;
+
+/*
+ * Extract the indices of instances.
+ */
+std::vector<size_t> GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id);
+
+/*
+ * TensorArray is a C-array-like array of tensors, it is meant to be used with
+ * dynamic iteration primitives such as while_loop. It is used to segment inputs
+ * and store states in all time steps.
+ *
+ * By providing some methods similar to a C++ array, the definition of some
+ * state-based dynamic models such as RNN could be more natural and highly
+ * flexible.
+ */
+class TensorArray {
+ public:
+  using value_type = float;
+
+  // upper bound checked against `index` by Read, Write and WriteShared.
+  const size_t MAX_SIZE{100000};
+
+  /*
+   * Read the value at `index`, growing the array when `index` is past the end.
+   */
+  const LoDTensor &Read(size_t index) const;
+
+  /*
+   * Copy `value` into slot `index` of the TensorArray (CPU deep copy).
+   */
+  void Write(size_t index, const LoDTensor &value);
+
+  /*
+   * Write value into the index of the TensorArray, with memory shared.
+   */
+  void WriteShared(size_t index, const LoDTensor &value);
+
+  /*
+   * Recover the original LoD-arranged LoDTensor from the stored values, using
+   * `level`, the sequence `meta` (e.g. as returned by Unpack) and `lod`.
+   */
+  LoDTensor Pack(size_t level, const DySeqMetaBatch &meta,
+                 const LoD &lod) const;
+
+  /*
+   * Split `source` at `level` and write the generated batches to the stored
+   * values. If `length_desend` is set, sequences are sorted by length in
+   * descending order, otherwise in ascending order.
+   */
+  DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend);
+
+  /*
+   * Pack the values into a tensor with rank one higher than each tensor in
+   * values.
+   */
+  LoDTensor Stack() const;
+
+  /*
+   * Split `source` along its first dimension into one copied value per row.
+   */
+  void Unstack(const LoDTensor &source) const;
+
+  /*
+   * Split `source` along its first dimension into one value per row, sharing
+   * memory with `source` instead of copying.
+   */
+  void UnstackShared(const LoDTensor &source) const;
+
+  /*
+   * Return the number of values.
+   */
+  size_t size() const;
+
+ protected:
+  void Unstack(const LoDTensor &source, bool data_shared) const;
+
+ private:
+  mutable std::vector<LoDTensor> values_;  // mutable: const methods resize it
+};  // class TensorArray
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9470ac5e6ed714d5ba63f3743e683af7f8edd4b0
--- /dev/null
+++ b/paddle/framework/tensor_array_test.cc
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor_array.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+
+class TensorArrayTester : public ::testing::Test {
+ protected:
+  // Unstack a batch_size x dim tensor holding 0, 1, 2, ... into `ta`.
+  void SetUp() override {
+    LoDTensor source;
+    source.Resize(make_ddim({batch_size, dim}));
+    int* data = source.mutable_data<int>(platform::CPUPlace());
+    // Bound the fill by the fixture constants so it tracks the tensor shape.
+    for (int i = 0; i < batch_size * dim; i++) data[i] = i;
+    ta.Unstack(source);
+  }
+
+  TensorArray ta;
+  const int batch_size = 16;
+  const int dim = 32;
+};
+
+TEST_F(TensorArrayTester, Read) {
+  for (int i = 0; i < batch_size; i++) {
+    const auto& tensor = ta.Read(i);
+    ASSERT_EQ(tensor.dims()[0], 1);
+    ASSERT_EQ(tensor.dims()[1], dim);
+  }
+}
+
+TEST_F(TensorArrayTester, Write) {
+  LoDTensor source;
+  source.Resize(make_ddim({1, dim}));
+  for (int i = 0; i < dim; i++) {
+    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
+  }
+
+  ta.Write(2, source);
+
+  const auto& tensor = ta.Read(2);
+  for (int i = 0; i < dim; i++) {
+    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
+  }
+}
+
+TEST_F(TensorArrayTester, WriteShared) {
+  LoDTensor source;
+  source.Resize(make_ddim({1, dim}));
+  for (int i = 0; i < dim; i++) {
+    *(source.mutable_data<int>(platform::CPUPlace()) + i) = i;
+  }
+
+  ta.WriteShared(2, source);
+
+  const auto& tensor = ta.Read(2);
+  for (int i = 0; i < dim; i++) {
+    EXPECT_EQ(*(tensor.data<int>() + i), *(source.data<int>() + i));
+  }
+
+  EXPECT_EQ(source.data<int>(), tensor.data<int>());
+}
+
+class TensorArrayPackTester : public ::testing::Test {
+ protected:
+  virtual void SetUp() override {
+    lod.push_back(std::vector<size_t>{0, 2, 9, 13});
+
+    source.set_lod(lod);
+    source.Resize(make_ddim({13, 128}));
+    source.mutable_data<int>(platform::CPUPlace());
+
+    // content of each setence: 0 1 2 3 4
+    const auto& level = lod.front();
+    for (size_t i = 0; i < level.size() - 1; i++) {
+      size_t begin = level[i];
+      size_t end = level[i + 1];
+      for (size_t j = begin; j < end; j++) {
+        auto record = source.Slice(j, j + 1);
+        for (int dim = 0; dim < 128; dim++) {
+          record.mutable_data<int>(platform::CPUPlace())[dim] = j - begin;
+        }
+      }
+    }
+
+    // unpack
+    meta = ta.Unpack(source, 0, true);
+  }
+
+  LoD lod;
+  TensorArray ta;
+  LoDTensor source;
+  std::vector<DySeqMeta> meta;
+};
+
+TEST_F(TensorArrayPackTester, Unpack) {
+  ASSERT_EQ(ta.size(), 7UL);
+
+  const auto& t0 = ta.Read(0);
+  const auto& t1 = ta.Read(1);
+
+  ASSERT_EQ(t0.data<int>()[0], int(0));
+  ASSERT_EQ(t1.data<int>()[0], int(1));
+}
+
+TEST_F(TensorArrayPackTester, Pack) {
+  LoDTensor packed = ta.Pack(0, meta, lod);
+}
+
+TEST_F(TensorArrayTester, size) {
+  ASSERT_EQ(ta.size(), static_cast<size_t>(batch_size));
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index a5405f9c31543b5733f9db923c2a6f8b968cfc2d..f6e801bbb4a056b5590da95a4b140cb90638f322 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -19,12 +19,50 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
+template <typename... T>
+struct SizeOfTypeFunctor;
+
 template <typename T>
+struct SizeOfTypeFunctor<T> {
+  // Returns sizeof(T) when `type` identifies T, otherwise 0. Compare the
+  // type_index values directly: hash_code() equality does not imply type
+  // equality, because distinct types are allowed to share a hash.
+  size_t operator()(std::type_index type) const {
+    if (type == std::type_index(typeid(T))) return sizeof(T);
+    return 0UL;
+  }
+};
+
+template <>
+struct SizeOfTypeFunctor<> {
+  size_t operator()(std::type_index type) const { return 0UL; }
+};
+
+template <typename HEAD, typename... TAIL>
+struct SizeOfTypeFunctor<HEAD, TAIL...> {
+  // Try HEAD first; a zero result means "no match", so fall through to the
+  // remaining candidate types in TAIL.
+  size_t operator()(std::type_index type) const {
+    const size_t matched = SizeOfTypeFunctor<HEAD>()(type);
+    if (matched == 0) {
+      return SizeOfTypeFunctor<TAIL...>()(type);
+    }
+    return matched;
+  }
+};
+
+static inline size_t SizeOfType(std::type_index type) {
+  SizeOfTypeFunctor<int, float, double, int16_t, int64_t> functor;
+  size_t size = functor(type);
+  PADDLE_ENFORCE(size != 0UL, "Cannot get size of type %s", type.name());
+  return size;
+}
+
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tensor holds no memory. Call Tensor::mutable_data first.");
   PADDLE_ENFORCE_GE(
-      holder_->size(), numel() * sizeof(T) + offset_,
+      holder_->size(), numel() * SizeOfType(type()) + offset_,
       "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
       "first to re-allocate memory.\n"
       "or maybe the required data-type mismatches the data already stored.");
@@ -32,14 +70,23 @@ inline void Tensor::check_memory_size() const {
 
 template <typename T>
 inline const T* Tensor::data() const {
-  check_memory_size<T>();
+  check_memory_size();
+  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                     holder_->type().hash_code() == typeid(T).hash_code(),
+                 "Tensor holds the wrong type, it holds %s",
+                 this->holder_->type().name());
+
   return reinterpret_cast<const T*>(
       reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
 }
 
 template <typename T>
 inline T* Tensor::data() {
-  check_memory_size<T>();
+  check_memory_size();
+  PADDLE_ENFORCE(std::is_same<T, void>::value ||
+                     holder_->type().hash_code() == typeid(T).hash_code(),
+                 "Tensor holds the wrong type, it holds %s",
+                 this->holder_->type().name());
   return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                               offset_);
 }
@@ -54,78 +101,130 @@ inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
 template <typename T>
 inline T* Tensor::mutable_data(platform::Place place) {
   static_assert(std::is_pod<T>::value, "T must be POD");
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+}
+
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+  if (holder_ != nullptr) {
+    holder_->set_type(type);
+  }
   PADDLE_ENFORCE_GT(numel(), 0,
                     "Tensor's numel must be larger than zero to call "
                     "Tensor::mutable_data. Call Tensor::set_dim first.");
+  int64_t size = numel() * SizeOfType(type);
   /* some versions of boost::variant don't have operator!= */
-  int64_t size = numel() * sizeof(T);
   if (holder_ == nullptr || !(holder_->place() == place) ||
       holder_->size() < size + offset_) {
     if (platform::is_cpu_place(place)) {
-      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size));
+      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size, type));
     } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
     }
 #else
-      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-          boost::get<platform::GPUPlace>(place), size));
+      holder_.reset(new PlaceholderImpl<platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size, type));
     }
 #endif
     offset_ = 0;
   }
-  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                              offset_);
+  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                 offset_);
+}
+
+inline void* Tensor::mutable_data(platform::Place place) {
+  PADDLE_ENFORCE(this->holder_ != nullptr,
+                 "Cannot invoke mutable data if current hold nothing");
+  return mutable_data(place, holder_->type());
 }
 
-template <typename T>
 inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
-  src.check_memory_size<T>();
+  src.check_memory_size();
   *this = src;
   return *this;
 }
 
-template <typename T>
 inline void Tensor::CopyFrom(const Tensor& src,
-                             const platform::Place& dst_place) {
-  src.check_memory_size<T>();
+                             const platform::Place& dst_place,
+                             const platform::DeviceContext& ctx) {
+  src.check_memory_size();
   Resize(src.dims());
 
   auto src_place = src.holder_->place();
-  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto src_ptr = src.data<void>();
 
-  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+  auto dst_ptr = mutable_data(dst_place, src.type());
 
-  auto size = src.numel() * sizeof(T);
+  auto size = src.numel() * SizeOfType(src.type());
 
   if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
     memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
                  boost::get<platform::CPUPlace>(src_place), src_ptr, size);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   else if (platform::is_gpu_place(src_place) &&
            platform::is_cpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   } else if (platform::is_cpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   } else if (platform::is_gpu_place(src_place) &&
              platform::is_gpu_place(dst_place)) {
-    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
-                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
   }
-  PADDLE_ENFORCE(cudaStreamSynchronize(0),
-                 "cudaStreamSynchronize failed in Tensor CopyFrom");
-
 #endif
 }
 
 template <typename T>
+inline void Tensor::CopyFromVector(const std::vector<T>& src,
+                                   const platform::DeviceContext& ctx) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {
+    memory::Copy(
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
 inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
-  check_memory_size<T>();
+  check_memory_size();
   PADDLE_ENFORCE_GE(begin_idx, 0, "Slice begin index is less than zero.");
   PADDLE_ENFORCE_LE(end_idx, dims_[0], "Slice end index is out of bound.");
   PADDLE_ENFORCE_LT(begin_idx, end_idx,
@@ -140,7 +239,7 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
     DDim dst_dims = dims_;
     dst_dims[0] = end_idx - begin_idx;
     dst.Resize(dst_dims);
-    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+    dst.offset_ = offset_ + begin_idx * base * SizeOfType(type());
     return dst;
   }
 }
@@ -154,10 +253,9 @@ inline const DDim& Tensor::dims() const { return dims_; }
 
 inline int64_t Tensor::numel() const { return product(dims_); }
 
-template <typename T>
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
   Tensor res;
-  res.ShareDataWith<T>(src);
+  res.ShareDataWith(src);
   res.Resize(flatten_to_2d(src.dims(), num_col_dims));
   return res;
 }
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index e2ec738de35c90c6a06c9a46b062d4cce55f5eda..1bb0fb71b079940d35a995b78e04a531c074a8b2 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -74,7 +74,7 @@ TEST(Tensor, MutableData) {
     EXPECT_EQ(p1, p2);
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     float* p1 = nullptr;
@@ -108,7 +108,7 @@ TEST(Tensor, ShareDataWith) {
     // Try to share data form uninitialized tensor
     bool caught = false;
     try {
-      dst_tensor.ShareDataWith<float>(src_tensor);
+      dst_tensor.ShareDataWith(src_tensor);
     } catch (paddle::platform::EnforceNotMet err) {
       caught = true;
       std::string msg =
@@ -122,16 +122,16 @@ TEST(Tensor, ShareDataWith) {
     ASSERT_TRUE(caught);
 
     src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
-    dst_tensor.ShareDataWith<int>(src_tensor);
+    dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     Tensor dst_tensor;
     src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataWith<int>(src_tensor);
+    dst_tensor.ShareDataWith(src_tensor);
     ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
   }
 #endif
@@ -143,7 +143,7 @@ TEST(Tensor, Slice) {
   {
     Tensor src_tensor;
     src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
+    Tensor slice_tensor = src_tensor.Slice(1, 3);
     DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 3);
     EXPECT_EQ(slice_dims[0], 2);
@@ -163,11 +163,11 @@ TEST(Tensor, Slice) {
     EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
   }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
-    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
+    Tensor slice_tensor = src_tensor.Slice(2, 6);
     DDim slice_dims = slice_tensor.dims();
     ASSERT_EQ(arity(slice_dims), 2);
     EXPECT_EQ(slice_dims[0], 4);
@@ -194,6 +194,7 @@ TEST(Tensor, CopyFrom) {
   {
     Tensor src_tensor;
     Tensor dst_tensor;
+    CPUDeviceContext cpu_ctx((CPUPlace()));
 
     int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
 
@@ -201,7 +202,7 @@ TEST(Tensor, CopyFrom) {
     memcpy(src_ptr, arr, 9 * sizeof(int));
 
     auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);
+    dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx);
 
     const int* dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(src_ptr, dst_ptr);
@@ -209,8 +210,8 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
 
-    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
-    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+    dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx);
     const int* slice_ptr = slice_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(dst_ptr, slice_ptr);
@@ -218,7 +219,7 @@ TEST(Tensor, CopyFrom) {
       EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
     }
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   {
     Tensor src_tensor;
     Tensor gpu_tensor;
@@ -231,28 +232,31 @@ TEST(Tensor, CopyFrom) {
 
     // CPU Tensor to GPU Tensor
     auto gpu_place = new paddle::platform::GPUPlace(0);
-    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx);
 
     // GPU Tensor to CPU Tensor
     auto cpu_place = new paddle::platform::CPUPlace();
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
 
-    // Compare Tensors
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
     const int* dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(src_ptr, dst_ptr);
     for (size_t i = 0; i < 9; ++i) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
 
-    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
 
     // CPU Slice Tensor to GPU Tensor
-    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);
+    gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx);
 
     // GPU Tensor to CPU Tensor
-    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
 
-    // Compare Slice Tensors
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
     const int* slice_ptr = slice_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
     ASSERT_NE(dst_ptr, slice_ptr);
@@ -263,6 +267,99 @@ TEST(Tensor, CopyFrom) {
 #endif
 }
 
+TEST(Tensor, CopyFromVector) {  // std::vector -> CPU/GPU Tensor copies
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
+
+    // Compare Tensors: the copy must own its storage and match element-wise
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);  // 4 elements left
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < src_vec.size(); ++i) {  // stay within 4 elements
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    delete cpu_place;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
+
+    // Copy to GPUTensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::GPUPlace();
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
+    // Copy from GPU to CPU tensor for comparison
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);  // 4 elements left
+
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    cpu_tensor.CopyFromVector<int>(src_vec, cpu_ctx);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    gpu_tensor.CopyFromVector<int>(src_vec, gpu_ctx);
+    dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < src_vec.size(); ++i) {  // stay within 4 elements
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+
 TEST(Tensor, ReshapeToMatrix) {
   using namespace paddle::framework;
   using namespace paddle::platform;
@@ -271,7 +368,7 @@ TEST(Tensor, ReshapeToMatrix) {
   for (int i = 0; i < 2 * 3 * 4 * 9; ++i) {
     src_ptr[i] = i;
   }
-  Tensor res = ReshapeToMatrix<int>(src, 2);
+  Tensor res = ReshapeToMatrix(src, 2);
   ASSERT_EQ(res.dims()[0], 2 * 3);
   ASSERT_EQ(res.dims()[1], 4 * 9);
 }
diff --git a/paddle/framework/type_defs.h b/paddle/framework/type_defs.h
new file mode 100644
index 0000000000000000000000000000000000000000..00da7289394cf18e013220a4bedde2c182f6a4a4
--- /dev/null
+++ b/paddle/framework/type_defs.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <functional>
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/platform/variant.h"
+
+namespace paddle {
+namespace framework {
+class OperatorBase;
+class OpDescBind;
+class BlockDescBind;
+class BlockDesc;
+using VariableNameMap = std::map<std::string, std::vector<std::string>>;
+
+// The order should be the same as in framework.proto
+using Attribute =
+    boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                   std::vector<float>, std::vector<std::string>, bool,
+                   std::vector<bool>, BlockDesc*>;
+
+using AttributeMap = std::unordered_map<std::string, Attribute>;
+
+using OpCreator = std::function<OperatorBase*(
+    const std::string& /*type*/, const VariableNameMap& /*inputs*/,
+    const VariableNameMap& /*outputs*/, const AttributeMap& /*attrs*/)>;
+
+using GradOpMakerFN = std::function<std::vector<std::unique_ptr<OpDescBind>>(
+    const OpDescBind&, const std::unordered_set<std::string>& /*no_grad_set*/,
+    std::unordered_map<std::string, std::string>* /*grad_to_var*/)>;
+
+using InferVarTypeFN = std::function<void(const OpDescBind& /*op_desc*/,
+                                          BlockDescBind* /*block*/)>;
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 13b9c5f3cdf98e6d22f4217fa1cf9a48910a78d8..8e92c81d1137472737230be79d71824593d3256f 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -13,24 +13,62 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/framework/var_desc.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
+VarDesc::VarType VarDescBind::GetType() const { return desc_.type(); }
+
+void VarDescBind::SetType(VarDesc::VarType type) { desc_.set_type(type); }
+
 void VarDescBind::SetShape(const std::vector<int64_t> &dims) {
-  VectorToRepeated(dims, desc_.mutable_lod_tensor()->mutable_dims());
+  VectorToRepeated(dims, mutable_tensor_desc()->mutable_dims());
 }
 
 void VarDescBind::SetDataType(DataType data_type) {
-  desc_.mutable_lod_tensor()->set_data_type(data_type);
+  mutable_tensor_desc()->set_data_type(data_type);
 }
 
 std::vector<int64_t> VarDescBind::Shape() const {
-  return RepeatedToVector(desc_.lod_tensor().dims());
+  return RepeatedToVector(tensor_desc().dims());
+}
+
+DataType VarDescBind::GetDataType() const { return tensor_desc().data_type(); }
+
+void VarDescBind::SetLoDLevel(int32_t lod_level) {
+  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
+  desc_.mutable_lod_tensor()->set_lod_level(lod_level);
+}
+
+int32_t VarDescBind::GetLodLevel() const {
+  PADDLE_ENFORCE(desc_.type() == VarDesc::LOD_TENSOR);
+  return desc_.lod_tensor().lod_level();
+}
+
+const TensorDesc &VarDescBind::tensor_desc() const {
+  PADDLE_ENFORCE(desc_.has_type(), "invoke TensorDesc must after set type");
+  switch (desc_.type()) {
+    case VarDesc::SELECTED_ROWS:
+      return desc_.selected_rows();
+    case VarDesc::LOD_TENSOR:
+      return desc_.lod_tensor().tensor();
+    default:
+      PADDLE_THROW("Unexpected branch.");
+  }
 }
 
-DataType VarDescBind::GetDataType() const {
-  return desc_.lod_tensor().data_type();
+TensorDesc *VarDescBind::mutable_tensor_desc() {
+  PADDLE_ENFORCE(desc_.has_type(),
+                 "invoke MutableTensorDesc must after set type");
+  switch (desc_.type()) {
+    case VarDesc::SELECTED_ROWS:
+      return desc_.mutable_selected_rows();
+    case VarDesc::LOD_TENSOR:
+      return desc_.mutable_lod_tensor()->mutable_tensor();
+    default:
+      PADDLE_THROW("Unexpected branch.");
+  }
 }
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 4763bf09d004539ab24e4aad3bf429667f1fcc73..929de1f836fa906966ff125c70380d85d062afdf 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -34,6 +34,7 @@ inline std::vector<T> RepeatedToVector(
 template <typename T, typename RepeatedField>
 inline void VectorToRepeated(const std::vector<T> &vec,
                              RepeatedField *repeated_field) {
+  repeated_field->Clear();
   repeated_field->Reserve(vec.size());
   for (const auto &elem : vec) {
     *repeated_field->Add() = elem;
@@ -44,6 +45,7 @@ inline void VectorToRepeated(const std::vector<T> &vec,
 template <typename RepeatedField>
 inline void VectorToRepeated(const std::vector<bool> &vec,
                              RepeatedField *repeated_field) {
+  repeated_field->Clear();
   repeated_field->Reserve(vec.size());
   for (auto elem : vec) {
     *repeated_field->Add() = elem;
@@ -52,7 +54,10 @@ inline void VectorToRepeated(const std::vector<bool> &vec,
 
 class VarDescBind {
  public:
-  explicit VarDescBind(const std::string &name) { desc_.set_name(name); }
+  explicit VarDescBind(const std::string &name) {
+    desc_.set_name(name);
+    desc_.set_type(VarDesc::LOD_TENSOR);
+  }
 
   VarDesc *Proto() { return &desc_; }
 
@@ -66,7 +71,22 @@ class VarDescBind {
 
   DataType GetDataType() const;
 
+  void SetLoDLevel(int32_t lod_level);
+
+  int32_t GetLodLevel() const;
+
+  VarDesc::VarType GetType() const;
+
+  void SetType(VarDesc::VarType type);
+
+  bool Persistable() const { return desc_.persistable(); }
+
+  void SetPersistable(bool persistable) { desc_.set_persistable(persistable); }
+
  private:
+  const TensorDesc &tensor_desc() const;
+  TensorDesc *mutable_tensor_desc();
+
   VarDesc desc_;
 };
 }  // namespace framework
diff --git a/paddle/framework/var_type_inference.h b/paddle/framework/var_type_inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..32abbeb33479444c5e7a9889f4211f59af07f98f
--- /dev/null
+++ b/paddle/framework/var_type_inference.h
@@ -0,0 +1,29 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/type_defs.h"
+
+namespace paddle {
+namespace framework {
+
+class VarTypeInference {
+ public:
+  virtual ~VarTypeInference() {}
+  virtual void operator()(const OpDescBind& op_desc,
+                          BlockDescBind* block) const = 0;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/var_type_inference_test.cc b/paddle/framework/var_type_inference_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..918de1fd055e32888f71ffea1f33993ba1210e86
--- /dev/null
+++ b/paddle/framework/var_type_inference_test.cc
@@ -0,0 +1,104 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/var_type_inference.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/program_desc.h"
+
+namespace paddle {
+namespace framework {
+
+class SumOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  SumOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class SumOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDescBind &op_desc,
+                  BlockDescBind *block) const override {
+    auto &inputs = op_desc.Input("X");
+    auto default_var_type = VarDesc::SELECTED_ROWS;
+
+    bool any_input_is_lod_tensor = std::any_of(
+        inputs.begin(), inputs.end(), [block](const std::string &name) {
+          return block->Var(name)->GetType() == VarDesc::LOD_TENSOR;
+        });
+    if (any_input_is_lod_tensor) {
+      default_var_type = VarDesc::LOD_TENSOR;
+    }
+
+    auto out_var_name = op_desc.Output("Out").front();
+    block->Var(out_var_name)->SetType(default_var_type);
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
+                  paddle::framework::SumOpMaker);
+
+namespace paddle {
+namespace framework {
+
+TEST(InferVarType, sum_op) {
+  ProgramDescBind prog;
+  auto *op = prog.Block(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"test_a", "test_b", "test_c"});
+  op->SetOutput("Out", {"test_out"});
+
+  prog.Block(0)->Var("test_a")->SetType(VarDesc::SELECTED_ROWS);
+  prog.Block(0)->Var("test_b")->SetType(VarDesc::SELECTED_ROWS);
+  prog.Block(0)->Var("test_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.Block(0)->Var("test_out");
+
+  op->InferVarType(prog.Block(0));
+
+  ASSERT_EQ(VarDesc::SELECTED_ROWS, prog.Block(0)->Var("test_out")->GetType());
+
+  prog.Block(0)->Var("test_b")->SetType(VarDesc::LOD_TENSOR);
+  op->InferVarType(prog.Block(0));
+  ASSERT_EQ(VarDesc::LOD_TENSOR, prog.Block(0)->Var("test_out")->GetType());
+}
+
+TEST(InferVarType, sum_op_without_infer_var_type) {
+  ProgramDescBind prog;
+  auto *op = prog.Block(0)->AppendOp();
+  op->SetType("sum_without_infer_var_type");
+  op->SetInput("X", {"test2_a", "test2_b", "test2_c"});
+  op->SetOutput("Out", {"test2_out"});
+
+  prog.Block(0)->Var("test2_a")->SetType(VarDesc::SELECTED_ROWS);
+  prog.Block(0)->Var("test2_b")->SetType(VarDesc::SELECTED_ROWS);
+  prog.Block(0)->Var("test2_c")->SetType(VarDesc::SELECTED_ROWS);
+  prog.Block(0)->Var("test2_out");  // output type is never set explicitly
+
+  op->InferVarType(prog.Block(0));
+  // This op registers no VarTypeInference; the output must stay LOD_TENSOR.
+  ASSERT_EQ(VarDesc::LOD_TENSOR,
+            prog.Block(0)->Var("test2_out")->GetType());
+}
+
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index 38fc2720a3023039aa113b32a394bda9c5def4c0..a80f0e66b5a59bf95efc200d159ad5dd9cf4111a 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -25,7 +25,10 @@ class Variable {
  public:
   template <typename T>
   const T& Get() const {
-    PADDLE_ENFORCE(IsType<T>(), "Variable must be type %s", typeid(T).name());
+    PADDLE_ENFORCE(holder_ != nullptr, "Variable must hold some thing");
+    PADDLE_ENFORCE(IsType<T>(),
+                   "Variable must be type %s, the holding type is %s",
+                   typeid(T).name(), holder_->Type().name());
     return *static_cast<const T*>(holder_->Ptr());
   }
 
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
index a89b6bba45843d81264819cad6ba053f28314f6b..bd0fe119ce46df9c333258c9c1ad7b5b2bdc544f 100644
--- a/paddle/function/BlockExpandOp.cpp
+++ b/paddle/function/BlockExpandOp.cpp
@@ -194,7 +194,7 @@ public:
 
 REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
 REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
 #endif
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index b87750b74247bd0eb822340bc5a85d41b86ecec2..23916c0f4b6319004ca0793bc9305b8a1dd0ae89 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -395,7 +395,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     CPU,
                     ContextProjectionBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     GPU,
                     ContextProjectionForwardFunc);
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
index 7ece7b2dfedaf460741c97b5a700eb632d85cabc..2e5c281f37d8ffb1062121b5dc5b4f790ab52089 100644
--- a/paddle/function/CosSimOp.cpp
+++ b/paddle/function/CosSimOp.cpp
@@ -233,7 +233,7 @@ private:
 
 REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
 REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
 #endif
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
index f12ee43e3d72f9ac776eaff93914228850694dd2..46f98f12c1f150fdf3ed53a7a96e5cf0020e14a4 100644
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
@@ -169,7 +169,7 @@ private:
 
 REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
 REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
 #endif
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index ef878bfbba961bdd3d5212e19fb83bb1e285e47f..9e88669d37bd50179dcc0464e8c1cd6e2fed74db 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -336,7 +336,7 @@ private:
 
 REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
 REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
 #endif
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
index 2f3112fe657cd381891dc53c7179e7520911e8c9..9863e3ae1d5fcb1eece5267fd4f2a6b593b799df 100644
--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -292,7 +292,7 @@ REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
 REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
                     CPU,
                     DepthwiseConvGradFilterFunction);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
 REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
                     GPU,
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
index d8e8c889d5c23bf9b2b5fd0b0393395883188fd8..b1a90da7db2b647dd384e3772820294140e5ec9d 100644
--- a/paddle/function/DepthwiseConvOpTest.cpp
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 namespace paddle {
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(DepthwiseConv, Forward) {
   DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "DepthwiseConv-GPU", forward);
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index f8cf4ebea8d724f0291b981647622b63e3d84495..bdb56ddac38b91d756fc6f31282f29c0489fd660 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -340,7 +340,7 @@ public:
 REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
 REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
 REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp
index 5283d79a5a53d979ae4e134f7e46b7ee106e9c44..b5b5e1f35b79e422b14f7495bc321533cc1d618a 100644
--- a/paddle/function/GemmConvOpTest.cpp
+++ b/paddle/function/GemmConvOpTest.cpp
@@ -24,7 +24,7 @@ TEST(GemmConv, NaiveConv) {
       "NaiveConv-CPU", "GemmConv-CPU", forward);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(GemmConv, Forward) {
   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
       "GemmConv-CPU", "GemmConv-GPU", forward);
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
index acc88a553abe7ac58b629aba9b850df58cee7f81..a0a01a5fc7fc055dce6ddb3ee51c7ab18f8a4ca7 100644
--- a/paddle/function/Im2ColTest.cpp
+++ b/paddle/function/Im2ColTest.cpp
@@ -116,7 +116,7 @@ void TestIm2ColFunctor() {
 
 TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
 
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
index 25e41edad54bec0f76a3de4799fab14241407272..704a8c41325ef86067a3bd8ed6d772b77df147c5 100644
--- a/paddle/function/MulOp.cpp
+++ b/paddle/function/MulOp.cpp
@@ -341,7 +341,7 @@ private:
 };
 
 REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
 #endif
 }  // namespace paddle
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index adba7c92ece505eecc74edce6b393cf27fa10ccc..eed2f2e3089b6b6167ef7c5a7acb7ecaa08945e1 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -207,7 +207,7 @@ private:
 
 REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
 REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
 #endif
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
index b6501e8f4db7fd33891cd80e07a6f36dd0b34532..7c802d66273c6f7aa56b2f460e3dff4401967517 100644
--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
@@ -217,7 +217,7 @@ public:
 
 REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
 REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
 #endif
diff --git a/paddle/function/SwitchOp.cpp b/paddle/function/SwitchOp.cpp
index 01e252a8dc0cd5fa1e964efa01d04cf282b3dfe7..597723a2dded6a6a116e05b7d4c942cd633e2c99 100644
--- a/paddle/function/SwitchOp.cpp
+++ b/paddle/function/SwitchOp.cpp
@@ -132,7 +132,7 @@ public:
 
 REGISTER_TYPED_FUNC(NCHW2NHWC, CPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, CPU, NHWC2NCHWFunc);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 REGISTER_TYPED_FUNC(NCHW2NHWC, GPU, NCHW2NHWCFunc);
 REGISTER_TYPED_FUNC(NHWC2NCHW, GPU, NHWC2NCHWFunc);
 #endif
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 62cff9361ccba3ae3b9359ddb932f5b26146eb97..5f39167afc34affbea7858fa0794ef52b786a383 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -60,6 +60,36 @@ if(NOT WITH_PYTHON)
             dataproviders/PyDataProvider.h)
 endif()
 
+if(MOBILE_INFERENCE)  # exclude sources unused by mobile inference
+    # Remove evaluators and the validation layer
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/ValidationLayer.cpp
+         evaluators/Evaluator.cpp
+         evaluators/DetectionMAPEvaluator.cpp
+         evaluators/CTCErrorEvaluator.cpp
+         evaluators/ChunkEvaluator.cpp)
+
+    # Remove data providers
+    list(REMOVE_ITEM GSERVER_SOURCES
+         dataproviders/DataProvider.cpp
+         dataproviders/MultiDataProvider.cpp
+         dataproviders/ProtoDataProvider.cpp
+         dataproviders/PyDataProvider2.cpp
+         dataproviders/PyDataProvider.cpp)
+
+    # Remove gradient machines that mobile inference does not use
+    list(REMOVE_ITEM GSERVER_SOURCES
+         gradientmachines/MultiNetwork.cpp
+         gradientmachines/RecurrentGradientMachine.cpp
+         gradientmachines/ParallelNeuralNetwork.cpp
+         gradientmachines/GradientMachineMode.cpp
+         gradientmachines/MultiGradientMachine.cpp)
+
+    # Remove layers that mobile inference does not use
+    list(REMOVE_ITEM GSERVER_SOURCES
+         layers/RecurrentLayerGroup.cpp)
+endif()
+
 if(WITH_GPU)
     cuda_add_library(paddle_gserver ${GSERVER_SOURCES})
 else()
diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index 18c5638100065109fba1f0647a1c5f91256f7b9d..f3ccd68160859795f28a40f8d0d4032adb289ccf 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -126,7 +126,7 @@ void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
   copyInVal_ = nullptr;
   if (act.grad && algo == algorithm::eltwise_tanh) {
     // tanh need save src input for backward
-    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    inVal_ = MKLDNNMatrix::create(val_->getPrimitiveDesc());
     copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
     CHECK(copyInVal_) << "should not be emptry";
     pipelineFwd_.push_back(*copyInVal_);
@@ -145,7 +145,7 @@ void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
   algorithm algo = getAlgo(this->getName());
   float alpha = getBwdAlpha();
   float beta = getBeta();
-  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  grad_ = MKLDNNMatrix::create(val_->getPrimitiveDesc(), act.grad);
   auto eng = CPUEngine::Instance().getEngine();
   auto bwdDesc = eltwise_bwd::desc(
       algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
@@ -230,7 +230,7 @@ void MKLDNNActivation::resetFwd(Argument& act) {
     int ic = cnt_ / bs / ih / iw;
     CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
     val_ = MKLDNNMatrix::create(
-        act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+        {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_, act.value);
     CHECK(val_);
     val_->downSpatial();
   }
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index b44e4dc202f01956ed21c175aa897ced8e92546b..de5faf5e1e2b3e73bc07fe7f1635110f4efd7eec 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -17,12 +17,15 @@ limitations under the License. */
 #include <fstream>
 #include "paddle/utils/Logging.h"
 
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
+
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "GradientMachineMode.h"
 #include "MultiGradientMachine.h"
 #include "MultiNetwork.h"
-#include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
-#include "hl_gpu.h"
+#endif
 
 namespace paddle {
 
@@ -30,13 +33,16 @@ GradientMachine* GradientMachine::create(
     const ModelConfig& config,
     int mode,
     const std::vector<ParameterType>& parameterTypes) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (auto gm = IGradientMachineMode::tryCreateGradientMachine(mode, config)) {
     return gm;
   }
   if (FLAGS_trainer_count > 1) {
     return new MultiGradientMachine(config, FLAGS_use_gpu);
   }
+#endif
   if (FLAGS_trainer_count == 1) {  // single
+#ifndef PADDLE_MOBILE_INFERENCE
     NeuralNetwork* nn;
     if (config.type() == "multi_nn") {
       /* multi submodel calculate, thread(s) will be initialized inside */
@@ -48,6 +54,9 @@ GradientMachine* GradientMachine::create(
       /* single thread calculate */
       nn = NeuralNetwork::create(config);
     }
+#else
+    NeuralNetwork* nn = NeuralNetwork::create(config);
+#endif
     ParamInitCallback testParamInitCb = [](int paramId, Parameter* para) {
       para->enableType(PARAMETER_VALUE);
     };
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index f9c82a2bef82b4e6bcbf0c73583505d2692f3926..ebfe0573cfdbfb2ef54a29b038e8b85356cc6c27 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -20,13 +20,16 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "TrainerConfig.pb.h"
 #include "paddle/gserver/dataproviders/DataProvider.h"
-#include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdaterBase.h"
 #include "paddle/utils/Thread.h"
 
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "paddle/gserver/evaluators/Evaluator.h"
+#endif
+
 namespace paddle {
 /**
  * @brief A gradient machine is capable of calculating some outputs given
@@ -147,6 +150,7 @@ public:
 
   virtual void onPassEnd() = 0;
 
+#ifndef PADDLE_MOBILE_INFERENCE
   /**
    * Create an evaluator which can be used for eval()
    */
@@ -156,6 +160,7 @@ public:
    * evaluate using the given evaluator
    */
   virtual void eval(Evaluator* evaluator) const = 0;
+#endif
 
   std::vector<ParameterPtr>& getParameters() { return parameters_; }
 
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 26cff3e67710b2f38d93572c5d58849aa94a5135..dbadc352a4ccd7483bf67e1025c212f514e32a24 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -14,15 +14,21 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
+#include "NeuralNetwork.h"
+#include "hl_gpu.h"
+#include "paddle/gserver/layers/AgentLayer.h"
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 
+#ifdef PADDLE_USE_MKLDNN
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+#endif
+
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "MultiNetwork.h"
-#include "NeuralNetwork.h"
 #include "RecurrentGradientMachine.h"
-#include "hl_gpu.h"
-#include "paddle/gserver/layers/AgentLayer.h"
-#include "paddle/utils/Stat.h"
+#endif
 
 namespace paddle {
 void parameterInitNN(int paramId,
@@ -54,6 +60,7 @@ void parameterInitNN(int paramId,
 }
 
 NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
+#ifndef PADDLE_MOBILE_INFERENCE
   if (config.type() == "recurrent_nn") {
     return newNeuralNetwork("root");
   } else if (config.type() == "multi_nn") {
@@ -61,6 +68,9 @@ NeuralNetwork* NeuralNetwork::create(const ModelConfig& config) {
   } else {
     return newNeuralNetwork();
   }
+#else
+  return new NeuralNetwork();
+#endif
 }
 
 std::map<std::string, bool> NeuralNetwork::dllInitMap;
@@ -294,6 +304,17 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
   }
 }
 
+void NeuralNetwork::finish() {
+#ifdef PADDLE_USE_MKLDNN
+  FOR_EACH_R(layer, layers_) {
+    MKLDNNLayerPtr dnnLayer = std::dynamic_pointer_cast<MKLDNNLayer>(*layer);
+    if (dnnLayer) {
+      dnnLayer->convertWeightsToPaddle();
+    }
+  }
+#endif
+}
+
 Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
   return getLayer(layerName)->getOutput();
 }
@@ -304,6 +325,8 @@ void NeuralNetwork::onPassEnd() {
   }
 }
 
+#ifndef PADDLE_MOBILE_INFERENCE
+
 class CombinedEvaluator : public Evaluator {
 public:
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
@@ -466,6 +489,8 @@ Evaluator* NeuralNetwork::makeEvaluator() const {
 
 void NeuralNetwork::eval(Evaluator* evaluator) const { evaluator->eval(*this); }
 
+#endif
+
 void NeuralNetwork::setOutputGrad(const std::vector<Argument>& args) {
   CHECK_GE(outputLayers_.size(), args.size());
   for (size_t i = 0; i < args.size(); ++i) {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 12810f642519b7965fc1b7d751290445e3350dd5..6888380290074318fe7f94d168b2931e776dda47 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -97,9 +97,12 @@ public:
 
   virtual void onPassEnd();
 
+#ifndef PADDLE_MOBILE_INFERENCE
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
+#endif
+
   virtual void resetState();
   virtual void setOutputGrad(const std::vector<Argument>& args);
 
@@ -131,6 +134,9 @@ public:
 
   const std::string& getName() const { return subModelName_; }
 
+  /// Do finishing work, e.g. convert MKLDNNLayer weights back to Paddle format.
+  void finish();
+
 protected:
   /**
    * The constructor of NeuralNetwork.
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index f7a80e23e1bd49549bec57b360587adc6b423794..bc7d1c83a48aefeb4bc6d3baa32b78aba712e58d 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "BatchNormalizationLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnBatchNormLayer.h"
 #endif
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.cpp b/paddle/gserver/layers/BatchNormalizationLayer.cpp
index 412762d38475422be98ffeb87ffcfb028c3e035f..dacff25e5927daf9c991577a71be86b160228317 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.cpp
+++ b/paddle/gserver/layers/BatchNormalizationLayer.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Stat.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "hl_batch_transpose.h"
 #endif
 #include "BatchNormalizationLayer.h"
@@ -90,7 +90,7 @@ void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
   size_t batchSize = in->getHeight();
   CHECK_EQ(out->getHeight(), batchSize * imgPixels_);
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
@@ -127,7 +127,7 @@ void BatchNormalizationLayer::shrinkMat(const MatrixPtr& in, MatrixPtr& out) {
   }
   CHECK_EQ(in->getHeight(), static_cast<size_t>(batchSize * imgPixels_));
   if (useGpu_) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     LOG(FATAL) << "paddle is compiled only for cpu";
 #else
     batchTranspose(
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index ce071323ff585d28c9eaf80fec9be2394be526d1..0bb6f84c22eefbfb3678d6f15651f22c91454c2c 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -462,8 +462,8 @@ void LambdaCost::calcGrad(const real* outputScore,
       real score_j = score[index_j];
       real dcgDif = 0;
       if (j < sortSize) {
-        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) /
-                 (std::log(i + 2) - std::log(j + 2));
+        dcgDif = (std::pow(2, score_i) - std::pow(2, score_j)) *
+                 (1 / std::log(i + 2) - 1 / std::log(j + 2));
       } else {
         dcgDif =
             (std::pow(2, score_i) - std::pow(2, score_j)) / std::log(i + 2);
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index e95f42c863b3733ca66055e1b3939e734cae8ad1..01f2aae6cf88d47296da804061b9b039cca593db 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -15,11 +15,14 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include "CostLayer.h"
-#include "ValidationLayer.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Error.h"
 #include "paddle/utils/Logging.h"
 
+#ifndef PADDLE_MOBILE_INFERENCE
+#include "ValidationLayer.h"
+#endif
+
 DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
 
 namespace paddle {
@@ -103,10 +106,12 @@ LayerPtr Layer::create(const LayerConfig& config) {
     return LayerPtr(new MultiClassCrossEntropy(config));
   else if (type == "rank-cost")
     return LayerPtr(new RankingCost(config));
+#ifndef PADDLE_MOBILE_INFERENCE
   else if (type == "auc-validation")
     return LayerPtr(new AucValidation(config));
   else if (type == "pnpair-validation")
     return LayerPtr(new PnpairValidation(config));
+#endif
 
   return LayerPtr(registrar_.createByType(config.type(), config));
 }
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 4002a3d0747a86ab7b495ffe52247521831b71b8..9813a556076bc2666869a85225feaf10f345217a 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -86,6 +86,7 @@ protected:
   /// Also used in 'use_mkldnn' case.
   std::vector<Argument> outputOtherDevice_;
   /// If there are several outputs, map them by each name.
+  /// MKLDNNLayer uses it only to merge output grads.
   std::map<std::string, Argument*> outputMap_;
   /// Used to merge grad on different devices.
   MatrixPtr tmpGrad_;
@@ -325,6 +326,11 @@ public:
     outputMap_[name] = output;
   }
 
+  /**
+   * Get the size of the output map, for layers that have multiple outputs.
+   */
+  size_t getOutputMapSize() { return outputMap_.size(); }
+
   /**
    * Get the output based on layer's name.
    */
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
index 4c0234e7b3a91053596c32cea581fa5d1e26b9d5..af02a37cad668708f77ecf423549a8ec993e54fb 100644
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
@@ -21,8 +21,8 @@ namespace paddle {
 typedef enum {
   MKLDNN_BASE = 1,   // basical info of MKLDNN
   MKLDNN_TESTS = 1,  // gtest info of MKLDNN
-  MKLDNN_SIZES = 2,  // size info of MKLDNN
-  MKLDNN_FMTS = 3,   // format info of MKLDNN
+  MKLDNN_FMTS = 2,   // format info of MKLDNN
+  MKLDNN_SIZES = 3,  // size info of MKLDNN
   MKLDNN_ALL = 4,    // show all info of MKLDNN
 } MKLDNN_LOG_LEVEL;
 
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 0d6742e909635c1097b4fe21bbb304f8a71af5cb..83f4e4e6151d727b3e6cf367bb7ecae55dd7df73 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -116,8 +116,6 @@ void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdBuffers(fwdPD_, in, wgt, bias, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -135,12 +133,6 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
 
   resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNConvLayer::updateInputData() {
-  cpuInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
@@ -211,11 +203,18 @@ void MKLDNNConvLayer::resetFwdBuffers(
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
   CHECK(pd);
-  resetInValue(pd, in);
+  resetInValue(
+      in, std::make_shared<memory::primitive_desc>(pd->src_primitive_desc()));
 
-  resetWgtBiasValue(pd, wgt, bias);
+  resetOutValue(out, pd->dst_primitive_desc());
 
-  resetOutValue(pd, out);
+  resetWithMatrix(wgt, weight_->getW(), pd->weights_primitive_desc());
+
+  if (biases_ && biases_->getW()) {
+    resetWithMatrix(bias, biases_->getW(), pd->bias_primitive_desc());
+  } else {
+    bias = nullptr;
+  }
 }
 
 void MKLDNNConvLayer::resetFwdPipeline(
@@ -225,100 +224,12 @@ void MKLDNNConvLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
-  if (cvtInVal_) {
-    pipeline.push_back(*cvtInVal_);
-  }
-
   if (bias) {
     fwd_.reset(new conv_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
     fwd_.reset(new conv_fwd(*pd, *in, *wgt, *out));
   }
   pipeline.push_back(*fwd_);
-
-  if (cvtOutVal_) {
-    pipeline.push_back(*cvtOutVal_);
-  }
-}
-
-void MKLDNNConvLayer::resetInValue(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& in) {
-  const MatrixPtr& inMat = inputLayers_[0]->getOutput().value;
-  in = MKLDNNMatrix::create(inMat, pd->src_primitive_desc());
-
-  // create buffer and reorder if input value do not match
-  cpuInVal_ = nullptr;
-  cvtInVal_ = nullptr;
-
-  MKLDNNMatrixPtr dnnIn = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
-  CHECK_EQ(inputIsOnlyMKLDNN(), dnnIn != nullptr);
-  if (dnnIn != nullptr && dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
-    in = dnnIn;
-    return;
-  }
-  if (dnnIn) {
-    if (dnnIn->getFormat() == format::nc) {
-      CHECK(ih_ == 1 && iw_ == 1) << "when input is nc format";
-      // create a new one with nchw format and same data
-      memory::dims inDims = memory::dims{bs_, ic_, 1, 1};
-      dnnIn = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-    }
-    if (dnnIn->getPrimitiveDesc() == in->getPrimitiveDesc()) {
-      in = dnnIn;
-      return;
-    }
-    cpuInVal_ = dnnIn;
-    in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
-    cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
-    CHECK(cvtInVal_) << "should not be emptry";
-  } else {
-    memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-    cpuInVal_ = MKLDNNMatrix::create(inMat, inDims, format::nchw, engine_);
-    if (cpuInVal_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      // create new mkldnn matrix
-      in = MKLDNNMatrix::create(nullptr, pd->src_primitive_desc());
-      cvtInVal_ = MKLDNNMatrix::createReorder(cpuInVal_, in);
-      CHECK(cvtInVal_) << "should not be emptry";
-    } else {
-      in = cpuInVal_;
-    }
-  }
-}
-
-void MKLDNNConvLayer::resetWgtBiasValue(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias) {
-  wgt = MKLDNNMatrix::create(weight_->getW(), pd->weights_primitive_desc());
-  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
-
-  bias = (biases_ && biases_->getW())
-             ? MKLDNNMatrix::create(biases_->getW(), pd->bias_primitive_desc())
-             : nullptr;
-}
-
-void MKLDNNConvLayer::resetOutValue(
-    std::shared_ptr<conv_fwd::primitive_desc>& pd, MKLDNNMatrixPtr& out) {
-  out = MKLDNNMatrix::create(output_.value, pd->dst_primitive_desc());
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutVal_ = nullptr;
-  cvtOutVal_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
-    memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
-      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
-    } else {
-      // CPU output share the same data of MKLDNN output
-      cpuOut->setData(out->getData());
-      cpuOutVal_ = out;
-    }
-  }
 }
 
 void MKLDNNConvLayer::resetBwdWgtPD(
@@ -327,8 +238,8 @@ void MKLDNNConvLayer::resetBwdWgtPD(
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
 
   // create backward weight using input, output and weight value memory desc
-  CHECK(inVal_) << "Should have input value";
-  CHECK(outVal_) << "Should have output value";
+  CHECK(inVal_) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
   CHECK(wgtVal_) << "Should have weight value";
   algorithm algo = algorithm::convolution_direct;
   padding_kind padKind = padding_kind::zero;
@@ -368,8 +279,8 @@ void MKLDNNConvLayer::resetBwdDataPD(
 
   memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
-  CHECK(inVal_) << "Should have input value";
-  CHECK(outVal_) << "Should have output value";
+  CHECK(inVal_) << "Should have internal input value";
+  CHECK(outVal_) << "Should have internal output value";
   // create backward data using input and output value memory desc
   // but using weight memory desc with any format
   auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
@@ -395,12 +306,27 @@ void MKLDNNConvLayer::resetBwdBuffers(
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
   CHECK(wgtPD);
-  resetOutGrad(wgtPD, out);
+  resetOutGrad(out, wgtPD->diff_dst_primitive_desc());
 
-  resetWgtBiasGrad(wgtPD, wgt, bias);
+  resetWithMatrix(
+      wgt, weight_->getWGrad(), wgtPD->diff_weights_primitive_desc());
+  CHECK(wgtVal_ != nullptr &&
+        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
+      << "primitive desc of weight grad and value should be equal";
 
-  resetInGrad(dataPD, in);
+  bias = nullptr;
+  if (biases_ && biases_->getWGrad()) {
+    resetWithMatrix(
+        bias, biases_->getWGrad(), wgtPD->diff_bias_primitive_desc());
+    CHECK(bias && biasVal_ &&
+          bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
+        << "primitive desc of bias grad should equal the bias value";
+  }
 
+  if (dataPD == nullptr) {
+    return;
+  }
+  resetInGrad(in, dataPD->diff_src_primitive_desc());
   resetWgtValBwdData(dataPD, wgtValBwdData_);
 }
 
@@ -412,12 +338,7 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
-  if (cvtOutGrad_) {
-    pipeline.push_back(*cvtOutGrad_);
-  }
-
+  CHECK(inVal_);
   // add bwdWgt handle
   if (bias) {
     bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
@@ -429,101 +350,13 @@ void MKLDNNConvLayer::resetBwdPipeline(
   if (dataPD == nullptr) {
     return;
   }
-
   if (cvtWgtVal_) {
     pipeline.push_back(*cvtWgtVal_);
   }
-
   // add bwdData handle
   CHECK(wgtValBwdData_) << "Should have weight memory";
   bwdData_.reset(new conv_bwdData(*dataPD, *out, *wgtValBwdData_, *in));
   pipeline.push_back(*bwdData_);
-
-  if (cvtInGrad_) {
-    pipeline.push_back(*cvtInGrad_);
-  }
-}
-
-void MKLDNNConvLayer::resetOutGrad(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD, MKLDNNMatrixPtr& out) {
-  const MatrixPtr& outMat = output_.grad;
-  out = MKLDNNMatrix::create(outMat, wgtPD->diff_dst_primitive_desc());
-  CHECK(outVal_ != nullptr &&
-        out->getPrimitiveDesc() == outVal_->getPrimitiveDesc())
-      << "primitive desc of out grad and value should be equal";
-
-  // TODO(TJ): merge outgrad
-  // create reorder if has output grad does not match
-  cpuOutGrad_ = nullptr;
-  cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    outMat->setData(cpuOut->getData());
-    // same PrimitiveDesc with cpuInVal_
-    CHECK(cpuOutVal_);
-    cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
-    if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      out = cpuOutGrad_;
-    } else {
-      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
-      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
-      CHECK(cvtOutGrad_);
-    }
-  }
-}
-
-void MKLDNNConvLayer::resetWgtBiasGrad(
-    std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-    MKLDNNMatrixPtr& wgt,
-    MKLDNNMatrixPtr& bias) {
-  wgt = MKLDNNMatrix::create(weight_->getWGrad(),
-                             wgtPD->diff_weights_primitive_desc());
-  CHECK(nullptr != wgtVal_ &&
-        wgt->getPrimitiveDesc() == wgtVal_->getPrimitiveDesc())
-      << "primitive desc of weight grad and value should be equal";
-  VLOG(MKLDNN_FMTS) << "weight grad format: " << wgt->getFormat();
-
-  bias = nullptr;
-  if (biasVal_ == nullptr) {
-    return;
-  }
-  bias = MKLDNNMatrix::create(biases_->getWGrad(),
-                              wgtPD->diff_bias_primitive_desc());
-  CHECK(bias->getPrimitiveDesc() == biasVal_->getPrimitiveDesc())
-      << "primitive desc of bias grad should equal the bias value";
-}
-
-void MKLDNNConvLayer::resetInGrad(
-    std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-    MKLDNNMatrixPtr& in) {
-  if (dataPD == nullptr) {
-    return;
-  }
-
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  in = MKLDNNMatrix::create(inputLayers_[0]->getOutput().grad,
-                            dataPD->diff_src_primitive_desc());
-  CHECK(nullptr != inVal_ &&
-        in->getPrimitiveDesc() == inVal_->getPrimitiveDesc())
-      << "primitive desc of input grad and value should be equal";
-
-  // create reorder if has output grad does not match
-  cpuInGrad_ = nullptr;
-  cvtInGrad_ = nullptr;
-  if (!inputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuIn = getInputGrad(0, CPU_DEVICE);
-    // same PrimitiveDesc with cpuInVal_
-    CHECK(cpuInVal_);
-    cpuInGrad_ = MKLDNNMatrix::create(cpuIn, cpuInVal_->getPrimitiveDesc());
-    if (cpuInGrad_->getPrimitiveDesc() != in->getPrimitiveDesc()) {
-      const MatrixPtr& dnnIn = getInputGrad(0, MKLDNN_DEVICE);
-      in = MKLDNNMatrix::create(dnnIn, in->getPrimitiveDesc());
-      cvtInGrad_ = MKLDNNMatrix::createReorder(in, cpuInGrad_);
-      CHECK(cvtInGrad_);
-    } else {
-      in = cpuInGrad_;
-    }
-  }
 }
 
 void MKLDNNConvLayer::resetWgtValBwdData(
@@ -537,8 +370,7 @@ void MKLDNNConvLayer::resetWgtValBwdData(
   // since the primitive_desc would be different with wgtVal_
   CHECK(wgtVal_) << "should have weight value";
   if (dataPD->weights_primitive_desc() != wgtVal_->getPrimitiveDesc()) {
-    wgtValBwdData_ =
-        MKLDNNMatrix::create(nullptr, dataPD->weights_primitive_desc());
+    wgtValBwdData_ = MKLDNNMatrix::create(dataPD->weights_primitive_desc());
     cvtWgtVal_ = MKLDNNMatrix::createReorder(wgtVal_, wgtValBwdData_);
     CHECK(cvtWgtVal_);
   } else {
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.h b/paddle/gserver/layers/MKLDNNConvLayer.h
index f84f2f737c47a1b8adc2b83360a0396ffbc6ae24..1fed0e1c6565b763a3ee73a0853f560ddfbd44c6 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.h
+++ b/paddle/gserver/layers/MKLDNNConvLayer.h
@@ -48,17 +48,6 @@ protected:
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<conv_fwd::primitive_desc> fwdPD_;
 
-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuInVal_;
-  MKLDNNMatrixPtr cpuInGrad_;
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtInVal_;
-  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // whether the weight has been init
   bool hasInitedWgt_;
 
@@ -94,8 +83,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;
 
   void convertWeightsFromPaddle() override;
@@ -109,26 +96,6 @@ public:
                        << ", sw: " << sw_ << ", dh: " << dh_ << ", dw: " << dw_;
   }
 
-  void printValueFormatFlow() override {
-    if (cpuInVal_) {
-      VLOG(MKLDNN_FMTS) << cpuInVal_->getFormat() << " >>>";
-    }
-    MKLDNNLayer::printValueFormatFlow();
-    if (cpuOutVal_) {
-      VLOG(MKLDNN_FMTS) << " >>> " << cpuOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormatFlow() override {
-    if (cpuInGrad_) {
-      VLOG(MKLDNN_FMTS) << cpuInGrad_->getFormat() << " <<<";
-    }
-    MKLDNNLayer::printGradFormatFlow();
-    if (cpuOutGrad_) {
-      VLOG(MKLDNN_FMTS) << " <<< " << cpuOutGrad_->getFormat();
-    }
-  }
-
 protected:
   /**
    * load the dims settings of this conv
@@ -162,23 +129,6 @@ protected:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
 
-  /**
-   * reset MKLDNNMatrix of input value
-   */
-  void resetInValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                    MKLDNNMatrixPtr& in);
-  /**
-   * reset MKLDNNMatrix of weight and bias value
-   */
-  void resetWgtBiasValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                         MKLDNNMatrixPtr& wgt,
-                         MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of output value
-   */
-  void resetOutValue(std::shared_ptr<conv_fwd::primitive_desc>& pd,
-                     MKLDNNMatrixPtr& out);
-
   /**
    * reset the backward weight primitive descriptor.
    */
@@ -207,22 +157,6 @@ protected:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
 
-  /**
-   * reset MKLDNNMatrix of output grad
-   */
-  void resetOutGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                    MKLDNNMatrixPtr& out);
-  /**
-   * reset MKLDNNMatrix of weight and bias grad
-   */
-  void resetWgtBiasGrad(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
-                        MKLDNNMatrixPtr& wgt,
-                        MKLDNNMatrixPtr& bias);
-  /**
-   * reset MKLDNNMatrix of input grad
-   */
-  void resetInGrad(std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
-                   MKLDNNMatrixPtr& in);
   /**
    * reset MKLDNNMatrix of weight value for backward data
    * since the primitive_desc would be different with wgtVal_
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index e829456d6afd7cc844f752d4571cd9f90c73997f..d82063a7130ca928ba042e210eb216f90c7207cd 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -62,7 +62,7 @@ void MKLDNNFcLayer::convertWeightsFromPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto srcFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto srcFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataFrom(wgtVal_, srcFmt, targetDim);
   hasInitedWgt_ = true;
 }
@@ -71,7 +71,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
   CHECK(wgtVal_) << "should have been initialized";
   bool hasNoSpatial_ = ih_ == 1 && iw_ == 1;
   auto targetDim = wgtVal_->getDims();
-  auto dstFmt = hasNoSpatial_ ? memory::format::io : memory::format::ihwo;
+  auto dstFmt = hasNoSpatial_ ? format::io : format::ihwo;
   wgtVal_->reorderDataTo(wgtVal_, dstFmt, targetDim);
 }
 
@@ -100,8 +100,6 @@ void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, wgt, bias, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -119,12 +117,6 @@ void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdDataPD(bwdDataPD, in, out);
 
   resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNFcLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
@@ -139,50 +131,29 @@ void MKLDNNFcLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
   resetInValue(in);
-
-  resetWgtBiasValue(wgt, bias);
-
-  resetOutValue(out);
-}
-
-void MKLDNNFcLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
+  CHECK(in);
   in->downSpatial();
-}
 
-void MKLDNNFcLayer::resetWgtBiasValue(MKLDNNMatrixPtr& wgt,
-                                      MKLDNNMatrixPtr& bias) {
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc({bs_, oc_}, format::nc, engine_);
+  resetOutValue(out, outPD);
+
   format wgtFmt = format::oihw;
-  if (inVal_->getFormat() == format::nChw8c) {
+  if (in->getFormat() == format::nChw8c) {
     wgtFmt = format::oIhw8i;
-  } else if (inVal_->getFormat() == format::nChw16c) {
+  } else if (in->getFormat() == format::nChw16c) {
     wgtFmt = format::oIhw16i;
   }
-  wgt = MKLDNNMatrix::create(
-      weight_->getW(), {oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  auto wgtPD =
+      MKLDNNMatrix::createPrimitiveDesc({oc_, ic_, ih_, iw_}, wgtFmt, engine_);
+  resetWithMatrix(wgt, weight_->getW(), wgtPD);
   wgt->downSpatial();
-  VLOG(MKLDNN_FMTS) << "Weight value format: " << wgt->getFormat();
-
-  bias = (biases_ && biases_->getW())
-             ? MKLDNNMatrix::create(biases_->getW(), {oc_}, format::x, engine_)
-             : nullptr;
-}
 
-void MKLDNNFcLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  out = MKLDNNMatrix::create(output_.value, {bs_, oc_}, format::nc, engine_);
-  if (!outputIsOnlyMKLDNN()) {
-    // fc cpu output value do not need create convert
-    // just share point
-    getOutput(CPU_DEVICE).value->setData(out->getData());
+  if (biases_ && biases_->getW()) {
+    auto biasPD = MKLDNNMatrix::createPrimitiveDesc({oc_}, format::x, engine_);
+    resetWithMatrix(bias, biases_->getW(), biasPD);
+  } else {
+    bias = nullptr;
   }
 }
 
@@ -214,14 +185,11 @@ void MKLDNNFcLayer::resetFwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-
   if (bias) {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *bias, *out));
   } else {
     fwd_.reset(new fc_fwd(*pd, *in, *wgt, *out));
   }
-
   pipeline.push_back(*fwd_);
 }
 
@@ -229,51 +197,18 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                     MKLDNNMatrixPtr& wgt,
                                     MKLDNNMatrixPtr& bias,
                                     MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetWgtBiasGrad(wgt, bias);
-
-  resetInGrad(in);
-}
-
-void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  // TODO(TJ): merge outgrad
-  int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
-  output_.grad->setData(getOutput(device).grad->getData());
-  // for MKLDNN device:
-  // can not directly cast outputgrad to mkldnnmatrix,
-  // since each layer can not write the inputgrad to mkldnn inputgrad.
-  // So just create from matrix with outputvalue format.
-  // for CPU device:
-  // fc do not need to convert from cpu device since output is always nc format
-  // only need create from cpu device
-  CHECK(outVal_);
-  out =
-      MKLDNNMatrix::create(getOutput(device).grad, outVal_->getPrimitiveDesc());
-}
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
 
-void MKLDNNFcLayer::resetWgtBiasGrad(MKLDNNMatrixPtr& wgt,
-                                     MKLDNNMatrixPtr& bias) {
   CHECK(wgtVal_);
-  wgt = MKLDNNMatrix::create(weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
+  resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
 
-  bias = nullptr;
-  if (biasVal_ == nullptr) {
-    return;
-  }
-  bias =
-      MKLDNNMatrix::create(biases_->getWGrad(), biasVal_->getPrimitiveDesc());
-}
-
-void MKLDNNFcLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
-    return;
+  if (biasVal_) {
+    resetWithMatrix(bias, biases_->getWGrad(), biasVal_->getPrimitiveDesc());
+  } else {
+    bias = nullptr;
   }
-  // TODO(TJ): use outputMaps_ ways to get the inGrad_ when merge outgrad done
-  CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNFcLayer::resetBwdWgtPD(
@@ -314,7 +249,6 @@ void MKLDNNFcLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   CHECK(inVal_);
   if (bias) {
     bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
index c76878aafab7e986d2bf478eaba02f2f0aced293..ee861763ff3dc10ddb4c119358b80dbe1614aecb 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -66,8 +66,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void updateWeights(const UpdateCallback& callback) override;
 
   void convertWeightsFromPaddle() override;
@@ -84,9 +82,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetWgtBiasValue(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<fc_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr wgt,
@@ -109,9 +104,6 @@ protected:
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetWgtBiasGrad(MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& bias);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdWgtPD(std::shared_ptr<fc_bwdWgt::primitive_desc>& pd,
                      MKLDNNMatrixPtr& wgt,
                      MKLDNNMatrixPtr& bias,
diff --git a/paddle/gserver/layers/MKLDNNLayer.cpp b/paddle/gserver/layers/MKLDNNLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6bb19976b5552fcd2e420f03de45c77a90ffb9d2
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLayer.cpp
@@ -0,0 +1,333 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNLayer.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+
+namespace paddle {
+
+bool MKLDNNLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+                          << "Please set WITH_MKLDNN=ON "
+                          << "and set use_mkldnn=True";
+  CHECK(!useGpu_) << "Do not support GPU yet";
+  // Shared setup for MKLDNN layers; returns false if Layer::init fails.
+  // The device id must be set before Layer::init is called.
+  setDevice(MKLDNN_DEVICE);
+  // Switch the parameters used by this layer to the MKLDNN device as well.
+  setParamsDevice(MKLDNN_DEVICE, parameterMap);
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  setOutputMap();           // registers tmpInArg_ on every input layer
+  checkCPUOutputsNumber();  // at most one CPU output is expected
+  // Create this layer's stream and take the engine from CPUEngine.
+  stream_.reset(new MKLDNNStream());
+  engine_ = CPUEngine::Instance().getEngine();
+  return true;
+}
+
+void MKLDNNLayer::forward(PassType passType) {
+  passType_ = passType;
+  // Rebuild the forward pipeline when the input size changed, then submit it.
+  {
+    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+    CHECK(!inputLayers_.empty());
+    copySeqInfoToOutputs();
+    size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
+    if (inputElemenCnt_ != elemenCnt) {
+      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+      // reset when the total input size changed, not only the batch size
+      inputElemenCnt_ = elemenCnt;
+      pipelineFwd_.clear();
+      reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
+      // every CPU device output shares the value and grad of output_
+      shareCPUDevice();
+      resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
+      // The output value of an MKLDNNLayer should be an MKLDNNMatrix,
+      // so the external output value is required.
+      // The external input value is optional,
+      // since the input may be an mkldnn internal buffer.
+      CHECK(extOutVal_) << "external output value is necessary";
+      output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
+      CHECK(inVal_ && outVal_) << "internal memories are necessary";
+      if (cvtInVal_) {  // the input reorder is placed first in the pipeline
+        pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
+      }
+      if (cvtOutVal_) {  // the output reorder is placed last
+        pipelineFwd_.push_back(*cvtOutVal_);
+      }
+      convertWeightsFromPaddle();
+      printSizeInfo();
+      printValueFormat();
+      needResetBwd_ = true;  // makes backward() rebuild its pipeline too
+    }
+
+    if (inputLayers_[0]->getType() == "data") {
+      // Update the input value data when the input layer is of "data" type,
+      // since the address of the input value data might have changed.
+      CHECK(extInVal_);
+      extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
+    }
+
+    if (!outputOnlyMKLDNN_) {  // flag cached by outputIsOnlyMKLDNN()
+      clearGrads();
+    }
+    stream_->submit(pipelineFwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MKLDNNLayer::backward(const UpdateCallback& callback) {
+  if (needResetBwd_) {  // true at start and after forward() was reset
+    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+    pipelineBwd_.clear();
+    pipelineMergeGrad_.clear();
+    mergeGrad_ = nullptr;
+    resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+    // The external output grad is optional, since the output grad may be
+    // an mkldnn internal buffer or the grads may be merged directly.
+    CHECK(outGrad_) << "internal output grad is necessary";
+    if (extOutGrad_) {
+      CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
+          << "the external buffer should share the same data with output_.grad";
+    }
+    if (cvtOutGrad_) {  // external -> internal output grad reorder goes first
+      pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
+    }
+    if (cvtInGrad_) {  // internal -> external input grad reorder goes last
+      pipelineBwd_.push_back(*cvtInGrad_);
+    }
+    printGradFormat();
+    needResetBwd_ = false;
+  }
+
+  // the grads must be merged before the backward activation runs
+  if (mergeGrad_) {
+    REGISTER_TIMER_INFO("MergeBpGrad", getName().c_str());
+    stream_->submit(pipelineMergeGrad_);
+  }
+  {
+    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
+    backwardActivation();
+  }
+  {
+    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
+    stream_->submit(pipelineBwd_);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    updateWeights(callback);
+  }
+}
+
+// Take the batch size and any non-zero frame height/width from the input.
+void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
+  const Argument& prevOut = inputLayers_[0]->getOutput();
+  batchsize = prevOut.getBatchSize();
+  const int frameH = prevOut.getFrameHeight();
+  const int frameW = prevOut.getFrameWidth();
+  // a zero frame size leaves the caller's current value untouched
+  if (frameH != 0) {
+    height = frameH;
+  }
+  width = (frameW != 0) ? frameW : width;
+}
+
+void MKLDNNLayer::reshapeOutput(size_t height, size_t width) {
+  output_.setFrameHeight(height);
+  output_.setFrameWidth(width);
+  for (auto& other : outputOtherDevice_) {  // same size on other devices
+    other.setFrameHeight(height);
+    other.setFrameWidth(width);
+  }
+}
+
+void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                                  const MatrixPtr& mat,
+                                  memory::primitive_desc pd) {
+  // start empty; an MKLDNNMatrix is only built when a matrix is given
+  dnn = nullptr;
+  if (mat != nullptr) {
+    dnn = MKLDNNMatrix::create(pd, mat);
+  }
+}
+
+void MKLDNNLayer::resetInValue(
+    MKLDNNMatrixPtr& in, const std::shared_ptr<memory::primitive_desc>& intPD) {
+  // Reset the internal input value `in`, plus extInVal_/cvtInVal_ if needed.
+  cvtInVal_ = nullptr;
+  extInVal_ = nullptr;
+  CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
+  auto extPD = MKLDNNMatrix::createPrimitiveDesc(
+      {bs_, ic_, ih_, iw_}, format::nchw, engine_);
+  const MatrixPtr& inMat = inputLayers_[0]->getOutputValue();
+  in = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
+  CHECK_EQ(inputIsOnlyMKLDNN(), in != nullptr);
+  if (in == nullptr || in->getFormat() == format::nc) {
+    in = MKLDNNMatrix::create(extPD, inMat);  // build from inMat as nchw
+  }
+  extInVal_ = isPaddleFormat(in->getFormat()) ? in : nullptr;
+  if (in->getFormat() == format::nc) {
+    CHECK(ih_ == 1 && iw_ == 1);
+  }
+  if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
+    return;  // no internal desc requested, or `in` already matches it
+  }
+  // descs differ: create the internal buffer and a reorder external -> it
+  in = MKLDNNMatrix::create(*intPD);
+  extInVal_ = extInVal_ ? extInVal_ : MKLDNNMatrix::create(extPD, inMat);
+  cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
+  CHECK(cvtInVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
+                                memory::primitive_desc intPD) {
+  cvtOutVal_ = nullptr;  // drop any reorder left from a previous reset
+  out = MKLDNNMatrix::create(intPD, output_.value);
+  extOutVal_ = out;  // by default internal and external are the same matrix
+  if (outputIsOnlyMKLDNN() || isPaddleFormat(extOutVal_->getFormat())) {
+    return;  // no other-device output, or the format is already nchw/nc
+  }
+  // otherwise keep `out` internal and reorder it into an nchw extOutVal_
+  CHECK_GT(bs_ * oc_ * oh_ * ow_, 0);
+  extOutVal_ = MKLDNNMatrix::create(
+      memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_, output_.value);
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutVal_ = MKLDNNMatrix::createReorder(out, extOutVal_);
+  CHECK(cvtOutVal_) << "should not be empty";
+}
+
+void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
+                              memory::primitive_desc intPD) {
+  cvtInGrad_ = nullptr;
+  extInGrad_ = nullptr;
+  in = nullptr;
+  LayerPtr& input = inputLayers_[0];
+  if (input->getOutputGrad() == nullptr) {
+    // the input layer has no grad, so no input grad is needed
+    return;
+  }
+  CHECK(inputIsOnlyMKLDNN() || input->getOutputMapSize() <= 1)
+      << "only support input is MKLDNN layer or only have one output layer";
+  // When the input is an mkldnn branch node,
+  // this layer saves its input grad to an internal buffer,
+  // and the mkldnn input layer merges them into the actual prev->output_.grad.
+  const MatrixPtr& inMat =
+      input->getOutputMapSize() <= 1 ? input->getOutputGrad() : nullptr;
+  in = MKLDNNMatrix::create(intPD, inMat);
+  Argument& arg = input->getOutput(this->getName());
+  arg.grad = std::dynamic_pointer_cast<Matrix>(in);
+  CHECK(inVal_);
+  CHECK(inVal_->getPrimitiveDesc() == intPD) << "the primitive desc must equal";
+  if (inputIsOnlyMKLDNN()) {
+    return;  // an MKLDNN input needs no external grad buffer
+  }
+
+  extInGrad_ = in;
+  if (isPaddleFormat(extInGrad_->getFormat())) {
+    return;  // already nchw/nc: internal and external grad are the same
+  }
+  // a reorder from the internal grad to the external (nchw/nc) one is needed
+  // TODO(TJ): add macro definition to simplify it
+  CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
+      << "should have external input value and the format must be nchw(nc)";
+  extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
+  CHECK(inVal_ != nullptr && inVal_->getPrimitiveDesc() == intPD)
+      << "should have internal input value and primitive desc must equal";
+  in = MKLDNNMatrix::create(intPD);
+  cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
+  CHECK(cvtInGrad_);
+}
+
+void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
+                               memory::primitive_desc intPD) {
+  cvtOutGrad_ = nullptr;
+  extOutGrad_ = nullptr;
+  out = nullptr;
+  MatrixPtr& outMat = output_.grad;
+  out = MKLDNNMatrix::create(intPD, outMat);
+  resetMergeGrad(out);  // sets up grad merging if there are several outputs
+  if (outputIsOnlyMKLDNN()) {
+    return;  // no other-device output, so no external grad is needed
+  }
+  CHECK_LE(outputMap_.size(), 1U) << "do not support mixed with cpu device";
+  extOutGrad_ = out;
+  if (isPaddleFormat(extOutGrad_->getFormat())) {
+    return;  // already nchw/nc: internal and external grad are the same
+  }
+  // a reorder from the external (nchw/nc) grad to the internal one is needed
+  CHECK(extOutVal_ != nullptr && isPaddleFormat(extOutVal_->getFormat()))
+      << "should have external output value and the format must be nchw(nc)";
+  extOutGrad_ = MKLDNNMatrix::create(extOutVal_->getPrimitiveDesc(), outMat);
+  CHECK(outVal_ != nullptr && outVal_->getPrimitiveDesc() == intPD)
+      << "should have internal output value and primitive desc must equal";
+  out = MKLDNNMatrix::create(intPD);
+  cvtOutGrad_ = MKLDNNMatrix::createReorder(extOutGrad_, out);
+  CHECK(cvtOutGrad_);
+}
+
+void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
+  mergeGrad_ = nullptr;
+  pipelineMergeGrad_.clear();
+  if (outputMap_.size() <= 1 || !outputIsOnlyMKLDNN()) {
+    // nothing to merge with a single output or when not all outputs are MKLDNN
+    return;
+  }
+  CHECK(out) << "should have reset internal output grad";
+  std::vector<double> scales(outputMap_.size(), 1.0);
+  std::vector<memory::primitive_desc> srcPDs;
+  std::vector<primitive::at> srcs;
+  const auto dstDims = out->getDims();  // loop-invariant, read only once
+  for (auto it = outputMap_.begin(); it != outputMap_.end(); ++it) {
+    MKLDNNMatrixPtr src =
+        std::dynamic_pointer_cast<MKLDNNMatrix>(it->second->grad);
+    CHECK(src) << "should be MKLDNNMatrix";
+    auto srcDims = src->getDims();
+    CHECK_EQ(srcDims.size(), dstDims.size());
+    for (size_t i = 0; i < srcDims.size(); ++i) {
+      CHECK_EQ(srcDims[i], dstDims[i]);
+    }
+    VLOG(MKLDNN_BASE) << getName() << " has output grad " << it->first
+                      << ", format " << src->getFormat();
+    srcPDs.push_back(src->getPrimitiveDesc());
+    srcs.push_back(*src);
+  }
+  // every source grad must currently share one primitive desc
+  // TODO(TJ): remove me when mkldnn sum support different formats
+  for (size_t i = 1; i < srcPDs.size(); ++i) {
+    CHECK(srcPDs[0] == srcPDs[i]);
+  }
+  tmpOutGrad_ = out;
+  tmpCvt_ = nullptr;
+  if (out->getPrimitiveDesc() != srcPDs[0]) {
+    tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
+    tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
+    CHECK(tmpCvt_);
+    pipelineMergeGrad_.push_back(*tmpCvt_);
+  }
+  // the sum into tmpOutGrad_ is placed first, ahead of the optional reorder
+  auto sumPD =
+      sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
+  mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
+  pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index c09fd89462ef4fdaeaae3e122f96b0cc6ce373ea..2c21a5b2aaecb17a52a5de9a98664068f2255d83 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -46,6 +46,9 @@ protected:
   // backward also need reset after reset forward handle
   bool needResetBwd_;
 
+  // is output only mkldnn
+  bool outputOnlyMKLDNN_;
+
   // mkldnn engine, stream and primivtives
   mkldnn::engine engine_;
   std::shared_ptr<MKLDNNStream> stream_;
@@ -55,16 +58,47 @@ protected:
   std::vector<mkldnn::primitive> pipelineFwd_;
   std::vector<mkldnn::primitive> pipelineBwd_;
 
-  // MKLDNNMatrixPtr with internal format
+  /* Value and grad are separated into internal and external buffers.
+   * Each MKLDNNLayer must init or reset at least the internal buffers,
+   * and the external buffer format is always nchw or nc (when h==w==1),
+   * which is the same format as paddle.
+   * The output_.value and output_.grad always save the external data,
+   * when mixed with cpu device.
+   * When all layers are mkldnn layers, they could save internal data.
+   */
+  // below MKLDNNMatrix buffers are all internal buffers
   MKLDNNMatrixPtr inVal_;
   MKLDNNMatrixPtr inGrad_;
   MKLDNNMatrixPtr outVal_;
   MKLDNNMatrixPtr outGrad_;
+  // below are external value and grad
+  MKLDNNMatrixPtr extInVal_;
+  MKLDNNMatrixPtr extInGrad_;
+  MKLDNNMatrixPtr extOutVal_;
+  MKLDNNMatrixPtr extOutGrad_;
+  // convert handle between external and internal buffers
+  std::shared_ptr<mkldnn::reorder> cvtInVal_;
+  std::shared_ptr<mkldnn::reorder> cvtInGrad_;
+  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
+  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
+
+  // weight and bias are always internal buffers
   MKLDNNMatrixPtr wgtVal_;
   MKLDNNMatrixPtr wgtGrad_;
   MKLDNNMatrixPtr biasVal_;
   MKLDNNMatrixPtr biasGrad_;
 
+  // merge grad primitive
+  std::shared_ptr<mkldnn::primitive> mergeGrad_;
+  std::vector<mkldnn::primitive> pipelineMergeGrad_;
+  // tmp input argument to save input grad, only used to merge grad
+  Argument tmpInArg_;
+  // since mkldnn sum does not support different formats
+  // (see https://github.com/01org/mkl-dnn/issues/134),
+  // a reorder is created manually and a tmp MKLDNNMatrix is kept
+  MKLDNNMatrixPtr tmpOutGrad_;
+  std::shared_ptr<mkldnn::primitive> tmpCvt_;
+
 public:
   explicit MKLDNNLayer(const LayerConfig& config)
       : Layer(config),
@@ -77,6 +111,7 @@ public:
         oh_(0),
         ow_(0),
         needResetBwd_(true),
+        outputOnlyMKLDNN_(false),
         engine_(mkldnn::engine::cpu, 0),
         stream_(nullptr),
         fwd_(nullptr),
@@ -85,82 +120,9 @@ public:
 
   ~MKLDNNLayer() {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
-                            << "Please set WITH_MKLDNN=ON "
-                            << "and set use_mkldnn=True";
-    CHECK(!useGpu_) << "Do not support GPU yet";
-
-    // set device id before Layer::init
-    setDevice(MKLDNN_DEVICE);
-    // change param device to MKLDNN device
-    setParamsDevice(MKLDNN_DEVICE, parameterMap);
-    if (!Layer::init(layerMap, parameterMap)) {
-      return false;
-    }
-    checkCPUOutputsNumber();
-
-    stream_.reset(new MKLDNNStream());
-    engine_ = CPUEngine::Instance().getEngine();
-    return true;
-  }
-
-  void forward(PassType passType) override {
-    passType_ = passType;
-
-    {
-      REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
-      CHECK(!inputLayers_.empty());
-      copySeqInfoToOutputs();
-      size_t elemenCnt = inputLayers_[0]->getOutput().value->getElementCnt();
-      if (inputElemenCnt_ != elemenCnt) {
-        VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-        // reset when input total sizes changed, not only the batchsize
-        inputElemenCnt_ = elemenCnt;
-        reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
-        resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
-        if (outVal_) {
-          // change original output value to mkldnn output value
-          output_.value = std::dynamic_pointer_cast<Matrix>(outVal_);
-        }
-        convertWeightsFromPaddle();
-        needResetBwd_ = true;
-      }
-
-      if (inputLayers_[0]->getType() == "data") {
-        updateInputData();
-      }
-
-      stream_->submit(pipelineFwd_);
-    }
-
-    /* activation */ {
-      REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
-      forwardActivation();
-    }
-  }
-
-  void backward(const UpdateCallback& callback) override {
-    if (needResetBwd_) {
-      VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-      needResetBwd_ = false;
-    }
-    {
-      REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
-      backwardActivation();
-    }
-    {
-      REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
-      stream_->submit(pipelineBwd_);
-    }
-
-    {
-      REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
-      updateWeights(callback);
-    }
-  }
+  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType) override;  // defined in MKLDNNLayer.cpp
+  void backward(const UpdateCallback& callback) override;  // MKLDNNLayer.cpp
 
   /**
    * reshape the input image sizes
@@ -171,7 +133,7 @@ public:
       int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
 
   /**
-   * reset the mkldnn forward primitve and memory
+   * reset the mkldnn forward primitive and memories
    * only would be called when input size changes
    */
   virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
@@ -181,7 +143,7 @@ public:
                         MKLDNNMatrixPtr& out) = 0;
 
   /**
-   * reset the mkldnn backward primitve and memory for mkldnn fc
+   * reset the mkldnn backward primitive and memories
    * only would be called when needed
    */
   virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
@@ -190,12 +152,6 @@ public:
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out) = 0;
 
-  /**
-   * Update input value data when input layer is "data" type.
-   * Since the input value data address might be changed.
-   */
-  virtual void updateInputData() {}
-
   /**
    * Update weights and biases if necessary.
    */
@@ -222,61 +178,78 @@ protected:
   /**
    * reshape the input image sizes and input batchsize
    */
-  virtual void reshapeInput(int& batchsize, int& height, int& width) {
-    const Argument& input = inputLayers_[0]->getOutput();
-    batchsize = input.getBatchSize();
-    int h = input.getFrameHeight();
-    int w = input.getFrameWidth();
-    if (h != 0) {
-      height = h;
-    }
-    if (w != 0) {
-      width = w;
-    }
-  }
+  void reshapeInput(int& batchsize, int& height, int& width);
 
   /**
    * reshape output image sizes
    */
-  virtual void reshapeOutput(size_t height, size_t width) {
-    output_.setFrameHeight(height);
-    output_.setFrameWidth(width);
-    for (size_t i = 0; i < outputOtherDevice_.size(); i++) {
-      outputOtherDevice_[i].setFrameHeight(height);
-      outputOtherDevice_[i].setFrameWidth(width);
-    }
-  }
+  void reshapeOutput(size_t height, size_t width);
 
   /**
-   * print info about sizes
+   * reset MKLDNNMatrix from Matrix and internal primitive desc.
+   * resets dnn to nullptr if the matrix is nullptr
    */
-  virtual void printSizeInfo() {
-    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
-                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
-                       << ", oh: " << oh_ << ", ow: " << ow_;
-  }
+  void resetWithMatrix(MKLDNNMatrixPtr& dnn,
+                       const MatrixPtr& mat,
+                       mkldnn::memory::primitive_desc pd);
 
   /**
-   * Print the mkldnn memory format flow of value
+   * reset input value from input MKLDNNMatrix and internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
    */
-  virtual void printValueFormatFlow() {
-    if (inVal_ && outVal_) {
-      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>> "
-                        << outVal_->getFormat();
-    }
-  }
+  void resetInValue(
+      MKLDNNMatrixPtr& in,
+      const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr);
+
+  /**
+   * reset output value from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetOutValue(MKLDNNMatrixPtr& out,
+                     mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset input grad from internal primitive desc.
+   * reset both internal and external buffer and create reorder if necessary.
+   */
+  void resetInGrad(MKLDNNMatrixPtr& in, mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset output grad from internal primitive desc.
+   * merge grad if necessary.
+   * reset both internal and external buffer and create reorder if necessary.
+   * note: about merge grad, when this layer has several outputs,
+   *       it could not be mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
+   */
+  void resetOutGrad(MKLDNNMatrixPtr& out, mkldnn::memory::primitive_desc intPD);
+
+  /**
+   * reset the merge grad primitive if necessary.
+   * note: do not support the grads mixed with cpu device,
+   *       since it can not get memory desc from cpu device.
+   */
+  void resetMergeGrad(MKLDNNMatrixPtr& out);
+
+protected:
+  /**
+   * Set deviceId of this layer.
+   */
+  void setDevice(int id) { deviceId_ = id; }
 
   /**
-   * Print the mkldnn memory format flow of grad
+   * check the format is nchw or nc,
+   * which is supported by Paddle default memory layout
    */
-  virtual void printGradFormatFlow() {
-    if (inGrad_ && outGrad_) {
-      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<< "
-                        << outGrad_->getFormat();
+  bool isPaddleFormat(mkldnn::memory::format fmt) {
+    if (fmt == mkldnn::memory::format::nchw ||
+        fmt == mkldnn::memory::format::nc) {
+      return true;
+    } else {
+      return false;
     }
   }
 
-protected:
   /**
    * If input only has MKLDNN device.
    * Otherwise, only support the previous layer using CPU device.
@@ -286,7 +259,6 @@ protected:
     if (prevDevice == MKLDNN_DEVICE) {
       return true;
     } else {
-      // do not support GPU yet
       CHECK_EQ(prevDevice, CPU_DEVICE) << "Only support CPU yet";
       return false;
     }
@@ -301,15 +273,82 @@ protected:
       CHECK_EQ(outputOtherDevice_[i].deviceId, CPU_DEVICE)
           << "Only support other device is CPU yet";
     }
-    return outputOtherDevice_.size() == 0;
+    outputOnlyMKLDNN_ = outputOtherDevice_.size() == 0;
+    return outputOnlyMKLDNN_;
   }
 
   /**
-   * Set deviceId of this layer.
+   * print info about sizes
    */
-  void setDevice(int id) { deviceId_ = id; }
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  /**
+   * print the mkldnn memory format of value
+   */
+  virtual void printValueFormat() {  // extIn >>> in >>> out >>> extOut
+    if (extInVal_) {  // each format is logged by its own VLOG statement
+      VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
+    }
+    if (inVal_) {
+      VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
+    }
+    if (outVal_) {
+      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
+    }
+    if (extOutVal_) {
+      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
+    }
+    if (wgtVal_) {  // weight and bias formats are logged when present
+      VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
+    }
+    if (biasVal_) {
+      VLOG(MKLDNN_FMTS) << "Bias value format: " << biasVal_->getFormat();
+    }
+  }
+
+  /**
+   * print the mkldnn memory format of grad
+   */
+  virtual void printGradFormat() {  // extIn <<< in <<< out <<< extOut
+    if (extInGrad_) {  // listed input side first so each arrow has a source
+      VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
+    }
+    if (inGrad_) {
+      VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
+    }
+    if (outGrad_) {
+      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
+    }
+    if (extOutGrad_) {  // last in the chain, so no trailing arrow
+      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
+    }
+    if (wgtGrad_) {  // weight and bias formats are logged when present
+      VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
+    }
+    if (biasGrad_) {
+      VLOG(MKLDNN_FMTS) << "Bias grad format: " << biasGrad_->getFormat();
+    }
+  }
 
 private:
+  /**
+   * clear all grad
+   */
+  void clearGrads() {
+    // zero the grad of output_ and of every other-device output, if present
+    if (output_.grad != nullptr) output_.grad->zeroMem();
+    for (auto& other : outputOtherDevice_) {
+      const MatrixPtr& grad = other.grad;
+      if (grad != nullptr) {
+        grad->zeroMem();
+      }
+    }
+  }
+
   /**
    * Set deviceId of the params used in this layer.
    */
@@ -334,6 +373,29 @@ private:
     }
   }
 
+  /**
+   * Set output map of prev layers.
+   */
+  void setOutputMap() {
+    outputMap_.clear();
+    for (auto& prev : inputLayers_) {  // register tmpInArg_ under our name
+      prev->setOutput(getName(), &tmpInArg_);
+    }
+  }
+
+  /**
+   * if have cpu device, share value and grad data with output_
+   */
+  void shareCPUDevice() {
+    // the call below also refreshes the cached outputOnlyMKLDNN_ flag
+    if (!outputIsOnlyMKLDNN()) {
+      for (auto& other : outputOtherDevice_) {
+        other.value = output_.value;
+        other.grad = output_.grad;
+      }
+    }
+  }
+
   /**
    * Check the cpu device number of outputOtherDevice_.
    * should have only one at most.
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.cpp b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
index b62dfb7c54258a593aa50d5b30096423f375c69d..6e89260f49979d4edb4da138507a73dc2bf120de 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.cpp
@@ -85,8 +85,6 @@ void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
   resetFwdPD(fwdPD_, in, out);
 
   resetFwdPipeline(pipeline, fwdPD_, in, out);
-
-  printValueFormatFlow();
 }
 
 void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
@@ -101,62 +99,22 @@ void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdPD(pd, in, out);
 
   resetBwdPipeline(pipeline, pd, in, out);
-
-  printGradFormatFlow();
-}
-
-void MKLDNNPoolLayer::updateInputData() {
-  inVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
 }
 
 void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
   resetInValue(in);
 
-  resetOutValue(out);
-}
-
-void MKLDNNPoolLayer::resetInValue(MKLDNNMatrixPtr& in) {
-  if (inputIsOnlyMKLDNN()) {
-    const MatrixPtr& dnnIn = getInputValue(0);
-    in = std::dynamic_pointer_cast<MKLDNNMatrix>(dnnIn);
-    CHECK(in) << "Input should be MKLDNNMatrix";
-  } else {
-    CHECK_EQ(getPrev(0)->getDeviceId(), CPU_DEVICE) << "Only support CPU yet";
-    const MatrixPtr& cpuIn = getInputValue(0, CPU_DEVICE);
-    in = MKLDNNMatrix::create(
-        cpuIn, {bs_, ic_, ih_, iw_}, format::nchw, engine_);
-  }
-}
-
-void MKLDNNPoolLayer::resetOutValue(MKLDNNMatrixPtr& out) {
-  CHECK(inVal_) << "Should reset input value first";
   memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
-  out = MKLDNNMatrix::create(
-      output_.value, outDims, inVal_->getFormat(), engine_);
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutVal_ = nullptr;
-  cvtOutVal_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).value;
-    cpuOutVal_ = MKLDNNMatrix::create(cpuOut, outDims, format::nchw, engine_);
-    if (cpuOutVal_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
-      cvtOutVal_ = MKLDNNMatrix::createReorder(out, cpuOutVal_);
-      CHECK(cvtOutVal_) << "should not be emptry";
-    } else {
-      // CPU output share the same data of MKLDNN output
-      cpuOut->setData(out->getData());
-      cpuOutVal_ = out;
-    }
-  }
+  CHECK(in);
+  auto outPD =
+      MKLDNNMatrix::createPrimitiveDesc(outDims, in->getFormat(), engine_);
+  resetOutValue(out, outPD);
 }
 
 void MKLDNNPoolLayer::resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr in,
                                  MKLDNNMatrixPtr out) {
-  memory::dims inDims = memory::dims{bs_, ic_, ih_, iw_};
-  memory::dims outDims = memory::dims{bs_, oc_, oh_, ow_};
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
@@ -187,63 +145,30 @@ void MKLDNNPoolLayer::resetFwdPipeline(
     std::shared_ptr<pool_fwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
   fwd_ = workspace_
              ? std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out, *workspace_))
              : std::make_shared<pool_fwd>(pool_fwd(*pd, *in, *out));
   pipeline.push_back(*fwd_);
-
-  if (cvtOutVal_) {
-    pipeline.push_back(*cvtOutVal_);
-  }
 }
 
 void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                       MKLDNNMatrixPtr& out) {
-  resetOutGrad(out);
-
-  resetInGrad(in);
-}
-void MKLDNNPoolLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
-  CHECK(outVal_) << "Should have output value";
-  out = MKLDNNMatrix::create(output_.grad, outVal_->getPrimitiveDesc());
-
-  // create reorder if output value has cpu device and pd do not match
-  cpuOutGrad_ = nullptr;
-  cvtOutGrad_ = nullptr;
-  if (!outputIsOnlyMKLDNN()) {
-    const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
-    cpuOutGrad_ = MKLDNNMatrix::create(
-        cpuOut, memory::dims{bs_, oc_, oh_, ow_}, format::nchw, engine_);
-    if (cpuOutGrad_->getPrimitiveDesc() != out->getPrimitiveDesc()) {
-      cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
-      CHECK(cvtOutGrad_) << "should not be emptry";
-    } else {
-      // share the same data of CPU output
-      output_.grad->setData(cpuOut->getData());
-      out = cpuOutGrad_;
-    }
-  }
-}
-
-void MKLDNNPoolLayer::resetInGrad(MKLDNNMatrixPtr& in) {
-  in = nullptr;
-  const MatrixPtr& inGrad = inputLayers_[0]->getOutput().grad;
-  if (inGrad == nullptr) {
-    return;
-  }
-  CHECK(inVal_);
-  in = MKLDNNMatrix::create(inGrad, inVal_->getPrimitiveDesc());
+  CHECK(inVal_ && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVal_->getPrimitiveDesc());
 }
 
 void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                                  MKLDNNMatrixPtr& in,
                                  MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
   memory::dims kernels = memory::dims{fh_, fw_};
   memory::dims strides = memory::dims{sh_, sw_};
   memory::dims padL = memory::dims{ph_, pw_};
   memory::dims padR = getPaddingR();
-  CHECK(in);
   CHECK(out);
   auto bwdDesc = pool_bwd::desc(poolAlgo_,
                                 in->getMemoryDesc(),
@@ -261,9 +186,8 @@ void MKLDNNPoolLayer::resetBwdPipeline(
     std::shared_ptr<pool_bwd::primitive_desc>& pd,
     MKLDNNMatrixPtr& in,
     MKLDNNMatrixPtr& out) {
-  pipeline.clear();
-  if (cvtOutGrad_) {
-    pipeline.push_back(*cvtOutGrad_);
+  if (pd == nullptr) {
+    return;
   }
 
   bwdData_ =
diff --git a/paddle/gserver/layers/MKLDNNPoolLayer.h b/paddle/gserver/layers/MKLDNNPoolLayer.h
index 891e15a7efcdd2e54f61352efc1ba7345b91c76b..c5ec87828bfb28b4502b4ec6b47287089c514204 100644
--- a/paddle/gserver/layers/MKLDNNPoolLayer.h
+++ b/paddle/gserver/layers/MKLDNNPoolLayer.h
@@ -38,13 +38,6 @@ protected:
   // pooling_avg or pooling_max
   mkldnn::algorithm poolAlgo_;
 
-  // MKLDNNMatrixPtr which should be created from CPU Device
-  MKLDNNMatrixPtr cpuOutVal_;
-  MKLDNNMatrixPtr cpuOutGrad_;
-  // convert handle between CPU device and MKLDNN device
-  std::shared_ptr<mkldnn::reorder> cvtOutVal_;
-  std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
-
   // save forward primitive_desc, which can be used backward
   std::shared_ptr<pool_fwd::primitive_desc> fwdPD_;
   // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
@@ -74,8 +67,6 @@ public:
                 MKLDNNMatrixPtr& bias,
                 MKLDNNMatrixPtr& out) override;
 
-  void updateInputData() override;
-
   void printSizeInfo() override {
     MKLDNNLayer::printSizeInfo();
     VLOG(MKLDNN_SIZES) << getName() << ": fh: " << fh_ << ", fw: " << fw_
@@ -90,8 +81,6 @@ protected:
    *                    reset pipeline.
    */
   void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetInValue(MKLDNNMatrixPtr& in);
-  void resetOutValue(MKLDNNMatrixPtr& out);
   void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr in,
                   MKLDNNMatrixPtr out);
@@ -106,8 +95,6 @@ protected:
    *                     reset pipeline.
    */
   void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
-  void resetOutGrad(MKLDNNMatrixPtr& out);
-  void resetInGrad(MKLDNNMatrixPtr& in);
   void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
                   MKLDNNMatrixPtr& in,
                   MKLDNNMatrixPtr& out);
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 96d5c54accc047b685502a178de2d290f3158731..7b932d5a76e9c4fe7cbe5882bbc19eb3de4b503a 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "PoolLayer.h"
 #include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "CudnnPoolLayer.h"
 #endif
 namespace paddle {
@@ -53,7 +53,7 @@ Layer* PoolLayer::create(const LayerConfig& config) {
   const std::string& pool = config.inputs(0).pool_conf().pool_type();
   if (pool == "max-projection" || pool == "avg-projection") {
     return new PoolProjectionLayer(config);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   } else if (CudnnPoolLayer::typeCheck(pool)) {
     return new CudnnPoolLayer(config);
 #endif
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index de9b8e63dfc4291f8f42ca8c57cb5eb6baed8d8e..329536afaf6d69676e8c39fdf8b6b8cb87ade5fa 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -1,15 +1,17 @@
 # gserver pacakge unittests
 
+if(NOT MOBILE_INFERENCE)
 ################### test_ProtoDataProvider ############
-add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp)
-
-# test_ProtoDataProvider will mkdir as same name,
-# so if WORKING_DIRECTORY is default directory, then
-# mkdir will get error.
-add_test(NAME test_ProtoDataProvider
-    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+    add_unittest_without_exec(test_ProtoDataProvider
+        test_ProtoDataProvider.cpp)
+
+    # test_ProtoDataProvider will mkdir as same name,
+    # so if WORKING_DIRECTORY is default directory, then
+    # mkdir will get error.
+    add_test(NAME test_ProtoDataProvider
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+endif()
 
 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
@@ -24,7 +26,10 @@ if(WITH_MKLDNN)
         test_MKLDNN.cpp
         MKLDNNTester.cpp
         LayerGradUtil.cpp)
-    add_test(NAME test_MKLDNN COMMAND test_MKLDNN)
+    add_test(NAME test_MKLDNN
+        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
+                ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 ################ test_CRFLayerGrad ####################
@@ -98,9 +103,11 @@ add_unittest_without_exec(test_KmaxSeqScore
 add_test(NAME test_KmaxSeqScore
     COMMAND test_KmaxSeqScore)
 
+if(NOT MOBILE_INFERENCE)
 ################## test_Evaluator #######################
-add_unittest(test_Evaluator
-    test_Evaluator.cpp)
+    add_unittest(test_Evaluator
+        test_Evaluator.cpp)
+endif()
 
 ################ test_LinearChainCRF ####################
 add_simple_unittest(test_LinearChainCRF)
@@ -131,27 +138,31 @@ if(NOT WITH_DOUBLE)
         WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
+if(NOT MOBILE_INFERENCE)
 ############### test_RecurrentGradientMachine ###############
-# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
-# I will fix it.
-add_unittest_without_exec(test_RecurrentGradientMachine
-    test_RecurrentGradientMachine.cpp)
-add_test(NAME test_RecurrentGradientMachine
-    COMMAND .set_python_path.sh -d
-            ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-            ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-
-add_unittest_without_exec(test_NetworkCompare
-    test_NetworkCompare.cpp)
-if(WITH_GPU)
-    add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-else()
-    add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
+    # I will fix it.
+    add_unittest_without_exec(test_RecurrentGradientMachine
+        test_RecurrentGradientMachine.cpp)
+    add_test(NAME test_RecurrentGradientMachine
+        COMMAND .set_python_path.sh -d
+                ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+                ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+endif()
+
+if(NOT MOBILE_INFERENCE)
+    add_unittest_without_exec(test_NetworkCompare
+        test_NetworkCompare.cpp)
+    if(WITH_GPU)
+        add_test(NAME test_NetworkCompare
+            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
+            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+    else()
+        add_test(NAME test_NetworkCompare
+            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
+            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
+    endif()
 endif()
 
 
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index a38880e14cdfcef05461dae567d198e5400c6bb1..cd957c7c0bca4c6089cc07e8f4226b8260190f07 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -674,7 +674,7 @@ void testLayerGradKernel(TestConfig testConf,
                          bool useGpu,
                          bool useWeight,
                          float epsilon) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 88e831f78bd165f63806df6c081d84411be51502..e10a27eedfa3d207d77a9bf1c5bfb23480dcca69 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "paddle/testing/TestUtil.h"
 using namespace std;  // NOLINT
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
index f59618be9d09d146be52fb51cae84f4d24c15ef1..0a19fe23336ea943cb8a572dc40f8c0fbbd7236a 100644
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "MKLDNNTester.h"
 #include "paddle/gserver/layers/MKLDNNBase.h"
 #include "paddle/gserver/layers/MKLDNNLayer.h"
+#include "paddle/trainer/Trainer.h"
 
 namespace paddle {
 
@@ -96,7 +97,7 @@ void MKLDNNTester::randomWgtDatas() {
     parameters_[REF][i]->randomize();
     dnnValue->copyFrom(*refValue);
 
-    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_TESTS) << "Random weight " << parameters_[DNN][i]->getName();
     printVector(dnnValue);
   }
 }
@@ -108,7 +109,7 @@ void MKLDNNTester::randomBotDatas() {
     dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
     dataLayers_[DNN][i]->getOutputValue()->copyFrom(
         *(dataLayers_[REF][i]->getOutputValue()));
-    VLOG(lvl_) << "Input " << i << " data:";
+    VLOG(MKLDNN_TESTS) << "Random Forward, InputValue " << i;
     printMatrix(dataLayers_[REF][i]->getOutputValue());
   }
 }
@@ -117,28 +118,28 @@ void MKLDNNTester::randomTopDiffs() {
   refLayer_->getOutputGrad()->randomizeUniform();
   dnnLayer_->getOutput(CPU_DEVICE)
       .grad->copyFrom(*(refLayer_->getOutputGrad()));
-  VLOG(lvl_) << "Random Backward Input, TopDiff: ";
+  VLOG(MKLDNN_TESTS) << "Random Backward, OutputGrad";
   printMatrix(refLayer_->getOutputGrad());
 }
 
 void MKLDNNTester::checkForward() {
-  VLOG(MKLDNN_ALL) << "Check Forward";
+  VLOG(MKLDNN_TESTS) << "Check Forward";
   printTopDatas();
-  double delta = compareMatrix(dnnLayer_->getOutput(CPU_DEVICE).value,
-                               refLayer_->getOutputValue());
+  double delta =
+      compareMatrix(dnnLayer_->getOutputValue(), refLayer_->getOutputValue());
   EXPECT_LE(fabs(delta), eps_);
 }
 
 void MKLDNNTester::checkBackwardData() {
-  VLOG(MKLDNN_ALL) << "Check Backward Data";
+  VLOG(MKLDNN_TESTS) << "Check Backward Data";
   // TODO(TJ): uncomment me when batch norm ready
   // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
   for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
     const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
     const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
-    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "MKLDNN Backward Result: InputGrad " << i;
     printMatrix(dnnDiff);
-    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    VLOG(MKLDNN_ALL) << "Reference Backward Result: InputGrad " << i;
     printMatrix(refDiff);
 
     double delta = compareMatrix(dnnDiff, refDiff);
@@ -152,7 +153,7 @@ void MKLDNNTester::checkBackwardData() {
 }
 
 void MKLDNNTester::checkBackwardWgts() {
-  VLOG(MKLDNN_ALL) << "Check Backward Weight";
+  VLOG(MKLDNN_TESTS) << "Check Backward Weight";
   CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
   vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
   saveWgt(parameters_[DNN], dnnWgts);
@@ -164,9 +165,11 @@ void MKLDNNTester::checkBackwardWgts() {
   for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
     const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
     const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
-    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    VLOG(MKLDNN_ALL) << "MKLDNN Result: weight value "
+                     << parameters_[DNN][i]->getName();
     printVector(dnn);
-    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    VLOG(MKLDNN_ALL) << "Reference Result: weight value "
+                     << parameters_[REF][i]->getName();
     printVector(ref);
 
     double delta = compareVector(dnn, ref);
@@ -239,7 +242,8 @@ void MKLDNNTester::printTopDatas() {
   }
 
   for (int n = 0; n < NUM; ++n) {
-    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    VLOG(MKLDNN_ALL) << testLayers_[n]->getType()
+                     << " Forward Result: OutputValue";
     printMatrix(testLayers_[n]->getOutputValue());
   }
 }
@@ -251,7 +255,7 @@ void MKLDNNTester::printMatrix(const MatrixPtr& m) {
 
   std::ostringstream ostr;
   m->print(ostr);
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 void MKLDNNTester::printVector(const VectorPtr& v) {
@@ -261,7 +265,7 @@ void MKLDNNTester::printVector(const VectorPtr& v) {
 
   std::ostringstream ostr;
   v->print(ostr, v->getSize());
-  VLOG(lvl_) << std::endl << ostr.str();
+  VLOG(MKLDNN_ALL) << std::endl << ostr.str();
 }
 
 double MKLDNNTester::getDelta(const real* d1,
@@ -313,8 +317,9 @@ void MKLDNNTester::runOnce() {
   UpdateCallback updateCallback = [](Parameter* para) {
     auto& grad = para->getBuf(PARAMETER_GRADIENT);
     auto& value = para->getBuf(PARAMETER_VALUE);
-    real lr = 1e-3;
+    real lr = 1e-2;
     value->add(*grad, lr);
+    grad->zeroMem();
   };
   randomTopDiffs();
   dnnLayer_->backward(updateCallback);
@@ -338,10 +343,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
                        size_t batchSize,
                        size_t inputImgH,
                        size_t inputImgW,
+                       bool printDetails,
                        size_t iter,
-                       float epsilon,
-                       bool log,
-                       int level) {
+                       float epsilon) {
   CHECK(dnn.layerConfig.type().compare(0, 7, "mkldnn_") == 0 ||
         dnn.layerConfig.active_type().compare(0, 7, "mkldnn_") == 0)
       << "should be MKLDNN layer or MKLDNN activation";
@@ -357,10 +361,9 @@ void MKLDNNTester::run(const TestConfig& dnn,
 
   ih_ = inputImgH;
   iw_ = inputImgW;
+  log_ = printDetails;
   iter_ = iter;
   eps_ = epsilon;
-  log_ = log;
-  lvl_ = level;
 
   // Firstly test mkldnn init from PARAM_FORMAT_ORIGINAL weight
   reset(dnn, ref, batchSize);
@@ -411,4 +414,146 @@ void MKLDNNTester::run(const TestConfig& dnn,
   }
 }
 
+void MKLDNNTester::initArgument(DataIn& data,  // out: filled with random data
+                                const std::string& configPath,
+                                const size_t iter) {
+  TrainerConfigHelper config(configPath);
+  size_t batchSize = config.getOptConfig().batch_size();
+  data.inArgs.resize(iter);    // inArgs[i]: inputs of iteration i
+  data.outGrads.resize(iter);  // outGrads[i]: output grads of iteration i
+  data.paraValues.clear();
+  for (const auto& layer_name : config.getModelConfig().input_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });  // config entry named layer_name
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      Argument arg;
+      arg.value = Matrix::create(batchSize, layerSize, false, false);
+      arg.grad = Matrix::create(batchSize, layerSize, false, false);
+      arg.value->randomizeUniform();
+      arg.value->add(-0.5);
+      arg.value->sigmoid(*arg.value);  // in place: source and dest are the same
+      arg.grad->zeroMem();
+      arg.ids = VectorT<int>::create(batchSize, false);
+      arg.ids->rand(layerSize);  // presumably ids in [0, layerSize) -- confirm
+      generateSequenceStartPositions(batchSize, arg.sequenceStartPositions);
+      data.inArgs[i].push_back(arg);  // appended in input-layer order
+    }
+  }
+
+  for (const auto& layer_name : config.getModelConfig().output_layer_names()) {
+    auto layer_config = std::find_if(config.getModelConfig().layers().begin(),
+                                     config.getModelConfig().layers().end(),
+                                     [=](const LayerConfig& layer_config) {
+                                       return layer_config.name() == layer_name;
+                                     });  // config entry named layer_name
+    CHECK(layer_config != config.getModelConfig().layers().end());
+
+    size_t layerSize = layer_config->size();
+    for (size_t i = 0; i < iter; ++i) {
+      MatrixPtr grad = Matrix::create(batchSize, layerSize, false, false);
+      grad->randomizeUniform();
+      data.outGrads[i].push_back(grad);  // appended in output-layer order
+    }
+  }
+
+  for (const auto& para_config : config.getModelConfig().parameters()) {
+    VectorPtr value = Vector::create(para_config.size(), false);
+    value->randnorm(0, 2);  // presumably mean 0, std 2 -- TODO confirm
+    data.paraValues.push_back(value);  // one vector per parameter config
+  }
+}
+
+void MKLDNNTester::getOutResult(const std::string& configPath,
+                                DataIn& in,
+                                DataOut& out,
+                                bool use_mkldnn,
+                                size_t iter) {
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = use_mkldnn;
+  *ThreadLocalRand::getSeed() = 1;  // same constant seeds on every call
+  srand(1);
+
+  Trainer trainer;
+  auto config = std::make_shared<TrainerConfigHelper>(configPath);
+  trainer.init(config, false);
+  auto gradientMachine = trainer.getGradientMachine();
+  std::vector<ParameterPtr> parameters = gradientMachine->getParameters();
+  CHECK_LE(in.paraValues.size(), parameters.size());  // guards parameters[i]
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    parameters[i]->getBuf(PARAMETER_VALUE)->copyFrom(*in.paraValues[i]);
+  }
+  UpdateCallback simpleUpdate = [](Parameter* para) {
+    auto& grad = para->getBuf(PARAMETER_GRADIENT);
+    auto& value = para->getBuf(PARAMETER_VALUE);
+    real lr = 1e-2;
+    value->add(*grad, lr);
+    grad->zeroMem();  // reset so the next backward starts from zero
+  };
+  vector<Argument> outArgs;
+  gradientMachine->start();
+  out.outValues.clear();
+  out.paraValues.clear();
+  for (size_t i = 0; i < iter; ++i) {
+    VLOG(MKLDNN_TESTS) << "running iteration " << i;
+    gradientMachine->forward(in.inArgs[i], &outArgs, PASS_TRAIN);
+    // save a copy of every forward output of this iteration
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      MatrixPtr value = Matrix::create(outArgs[k].value->getHeight(),
+                                       outArgs[k].value->getWidth(),
+                                       false,
+                                       false);
+      value->copyFrom(*outArgs[k].value);
+      out.outValues.push_back(value);
+    }
+
+    // feed the pre-generated output gradients of this iteration
+    for (size_t k = 0; k < outArgs.size(); k++) {
+      outArgs[k].grad->copyFrom(*in.outGrads[i][k]);
+    }
+    gradientMachine->backward(simpleUpdate);
+  }
+  gradientMachine->finish();
+
+  // save a copy of the parameter values reached after all iterations
+  for (size_t i = 0; i < in.paraValues.size(); i++) {
+    VectorPtr val = Vector::create(
+        parameters[i]->getBuf(PARAMETER_VALUE)->getSize(), false);
+    val->copyFrom(*parameters[i]->getBuf(PARAMETER_VALUE));
+    out.paraValues.push_back(val);
+  }
+}
+
+void MKLDNNTester::compareResult(DataOut& ref, DataOut& dnn, float eps) {
+  CHECK_EQ(ref.outValues.size(), dnn.outValues.size());    // pairwise compare
+  CHECK_EQ(ref.paraValues.size(), dnn.paraValues.size());  // pairwise compare
+  VLOG(MKLDNN_TESTS) << "compare value size: " << ref.outValues.size();
+  for (size_t i = 0; i < ref.outValues.size(); i++) {
+    EXPECT_LE(fabs(compareMatrix(ref.outValues[i], dnn.outValues[i])), eps);
+  }
+  VLOG(MKLDNN_TESTS) << "compare param size: " << ref.paraValues.size();
+  for (size_t i = 0; i < ref.paraValues.size(); i++) {
+    EXPECT_LE(fabs(compareVector(ref.paraValues[i], dnn.paraValues[i])), eps);
+  }
+}
+
+void MKLDNNTester::runBranchesTest(const std::string& configPath,
+                                   size_t iter,
+                                   float eps) {
+  DataIn in;
+  initArgument(in, configPath, iter);  // the same `in` is passed to both runs
+  DataOut outCpu, outDnn;
+  VLOG(MKLDNN_TESTS) << "running cpu network";
+  getOutResult(configPath, in, outCpu, false, iter);  // use_mkldnn = false
+  VLOG(MKLDNN_TESTS) << "running mkldnn network";
+  getOutResult(configPath, in, outDnn, true, iter);  // use_mkldnn = true
+
+  compareResult(outCpu, outDnn, eps);
+}
+
 }  //  namespace paddle
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
index 171d176ee757f1164c38d86273bdf9e5aefeda06..c385d1c72717d120211f167b5c5eb9a557da3714 100644
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -33,6 +33,17 @@ class MKLDNNTester {
     NUM = 2,  // Number of total
   };
 
+  struct DataIn {  // inputs shared by the runs of a branches test
+    std::vector<std::vector<Argument>> inArgs;     // [iteration][input layer]
+    std::vector<std::vector<MatrixPtr>> outGrads;  // [iteration][output layer]
+    std::vector<VectorPtr> paraValues;             // initial value per parameter
+  };
+
+  struct DataOut {  // results collected from one run of the network
+    std::vector<MatrixPtr> outValues;   // forward outputs, all iterations, flat
+    std::vector<VectorPtr> paraValues;  // parameter values after the last step
+  };
+
 protected:
   std::vector<TestConfig> configs_;
   vector<string> layerNames_;
@@ -47,8 +58,6 @@ protected:
   size_t iter_;
   /// whether to print out the details
   bool log_;
-  /// vlog level to print the matrix details datas
-  int lvl_;
   /// epsilon
   float eps_;
   /// input image size, default 1
@@ -59,7 +68,6 @@ public:
     iter_ = iter;
     eps_ = epsilon;
     log_ = false;
-    lvl_ = MKLDNN_ALL;
   }
 
   ~MKLDNNTester() {}
@@ -70,11 +78,20 @@ public:
            size_t batchSize,
            size_t inputImgH = 1,
            size_t inputImgW = 1,
+           bool printDetails = false,
            size_t iter = 3,
-           float epsilon = 1e-4,
-           bool log = false,
-           int level = MKLDNN_ALL);
-  void setLogLevel(int lvl) { lvl_ = lvl; }
+           float epsilon = 1e-4);
+  static void runBranchesTest(const std::string& configPath,
+                              size_t iter = 3,
+                              float eps = 1e-4);
+  static void initArgument(DataIn& data,
+                           const std::string& configPath,
+                           size_t iter = 3);
+  static void getOutResult(const std::string& configPath,
+                           DataIn& in,
+                           DataOut& out,
+                           bool use_mkldnn,
+                           size_t iter = 3);
 
 private:
   void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
@@ -101,8 +118,9 @@ private:
   void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
   void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
 
-  double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
-  double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  static double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+  static void compareResult(DataOut& ref, DataOut& dnn, float eps = 1e-4);
 
   /**
    * Get delta percent
@@ -111,11 +129,11 @@ private:
    * else return sum(abs(a-b)) / sum(abs(b))
    * The return value should be smaller than eps when passing.
    */
-  double getDelta(const real* d1,
-                  const real* d2,
-                  size_t len,
-                  const float failRate = 1e-3,
-                  const float thres = 0.1);
+  static double getDelta(const real* d1,
+                         const real* d2,
+                         size_t len,
+                         const float failRate = 1e-3,
+                         const float thres = 0.1);
 };
 
 }  //  namespace paddle
diff --git a/paddle/gserver/tests/mkldnn_branches_conv.conf b/paddle/gserver/tests/mkldnn_branches_conv.conf
new file mode 100644
index 0000000000000000000000000000000000000000..2628509db43e6a5f69a4f5ea956bffdc2837e32a
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branches_conv.conf
@@ -0,0 +1,56 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)  # 2 looks like the default -- TODO confirm
+
+def two_conv(input, group_name):  # two conv branches (1x1 and 3x3) fed by the same input
+  out1 = img_conv_layer(input=input,
+            name=group_name+'_conv1',
+            filter_size=1,
+            num_filters=channels,
+            padding=0,
+            shared_biases=True,
+            act=ReluActivation())
+
+  out2 = img_conv_layer(input=input,
+            name=group_name+'_conv2',
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)  # presumably 16x16 images -- TODO confirm
+
+conv = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1, a2 = two_conv(input=conv, group_name='a')  # layers a_conv1 / a_conv2
+
+concat = concat_layer(input=[a1, a2])  # group 'a' is merged by concat_layer
+
+b1, b2 = two_conv(input=conv, group_name='b')  # conv now feeds four conv layers
+
+addto = addto_layer(input=[b1, b2])  # group 'b' is merged by addto_layer
+
+outputs([concat, addto])  # both merged results are network outputs
diff --git a/paddle/gserver/tests/mkldnn_branches_fc.conf b/paddle/gserver/tests/mkldnn_branches_fc.conf
new file mode 100644
index 0000000000000000000000000000000000000000..fb85425c2b63c7604d636e2b0c5d20d91fb5de1b
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branches_fc.conf
@@ -0,0 +1,58 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)  # 2 looks like the default -- TODO confirm
+
+def two_fc(input, group_name):  # two bias-free linear fc branches fed by the same input
+  out1 = fc_layer(input=input,
+            name=group_name+'_fc1',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+
+  out2 = fc_layer(input=input,
+            name=group_name+'_fc2',
+            size=channels,
+            bias_attr=False,
+            act=LinearActivation())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)  # presumably 16x16 images -- TODO confirm
+
+conv = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=LinearActivation())
+
+pool = img_pool_layer(input=conv,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+a1, a2 = two_fc(input=pool, group_name='a')  # layers a_fc1 / a_fc2
+
+concat = concat_layer(input=[a1, a2])  # group 'a' is merged by concat_layer
+
+b1, b2 = two_fc(input=pool, group_name='b')  # pool now feeds four fc layers
+
+addto = addto_layer(input=[b1, b2])  # group 'b' is merged by addto_layer
+
+outputs([concat, addto])  # both merged results are network outputs
diff --git a/paddle/gserver/tests/mkldnn_branches_pool.conf b/paddle/gserver/tests/mkldnn_branches_pool.conf
new file mode 100644
index 0000000000000000000000000000000000000000..ca17c74752ab0777a69f818d9f43275a6140cb4c
--- /dev/null
+++ b/paddle/gserver/tests/mkldnn_branches_pool.conf
@@ -0,0 +1,60 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=16)
+channels = get_config_arg("channels", int, 2)  # 2 looks like the default -- TODO confirm
+
+def two_pool(input, group_name):  # two max-pool branches (3 and 5 wide) fed by the same input
+  out1 = img_pool_layer(input=input,
+            name=group_name+'_pool1',
+            pool_size=3,
+            stride=2,
+            padding=0,
+            pool_type=MaxPooling())
+
+  out2 = img_pool_layer(input=input,
+            name=group_name+'_pool2',
+            pool_size=5,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+  return out1, out2
+
+data = data_layer(name ="input", size=channels*16*16)  # presumably 16x16 images -- TODO confirm
+
+conv = img_conv_layer(input=data,
+            num_channels=channels,
+            filter_size=3,
+            num_filters=channels,
+            padding=1,
+            shared_biases=True,
+            act=LinearActivation())
+
+pool = img_pool_layer(input=conv,
+            pool_size=3,
+            stride=1,
+            padding=1,
+            pool_type=AvgPooling())
+
+a1, a2 = two_pool(input=pool, group_name='a')  # layers a_pool1 / a_pool2
+
+concat = concat_layer(input=[a1, a2])  # group 'a' is merged by concat_layer
+
+b1, b2 = two_pool(input=pool, group_name='b')  # pool now feeds four pool layers
+
+addto = addto_layer(input=[b1, b2])  # group 'b' is merged by addto_layer
+
+outputs([concat, addto])  # both merged results are network outputs
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index de93972a5880518dfbfb9f8582e17c594e54b9b8..f4c2a07c4426da36ff0b0570339a3a972dadec1f 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 659eefa31bdb1f2433d03a59d5bf4782c71bdecf..41116f480957153eca33d211d09095903d6a00d9 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
@@ -119,7 +118,7 @@ TEST(Layer, batchNorm) {
   CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void batchNormInference(int n, int c, int h, int w) {
   MatrixPtr input = std::make_shared<GpuMatrix>(n, c * h * w);
   MatrixPtr cudnnOut = std::make_shared<GpuMatrix>(n, c * h * w);
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
index df14449291e9ec08f45718de07bbb101f6dbea58..f010066ebc6c33eff17715ba20b4e238583f1966 100644
--- a/paddle/gserver/tests/test_CRFLayerGrad.cpp
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -16,7 +16,6 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/LinearChainCRF.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index 6035a866b4eee4c6a61fa93f3adbf5e1d2d549f7..5f2f9665478ad4bdfb00421ec57b3ecc1b41b417 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/math/MathUtils.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index e7325e0cc3b7195b5fec77c878e3e087cfc643e0..8634355b5206f5cde0aa0717df50ade39e173ae7 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/math/MathUtils.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
@@ -117,7 +116,7 @@ MatrixPtr doOneConvTest(size_t imgSize,
 }
 
 TEST(Layer, convParaUnified) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   MatrixPtr input, resultCpu, resultGpu;
 
   /// TEST1 for conv ///
diff --git a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
index c922237d33da5de0ece61df732334bee5592249d..477638426fe91f2c5b1f4d5011496385f07c2e90 100644
--- a/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
+++ b/paddle/gserver/tests/test_CrossEntropyOverBeamGrad.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
diff --git a/paddle/gserver/tests/test_DetectionOutput.cpp b/paddle/gserver/tests/test_DetectionOutput.cpp
index af43dc51fad35c834635b543b1a016f6d717de1e..dc39c97a87f8b346dc9cc09d6158b1b4069bcf2d 100644
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
@@ -150,7 +150,7 @@ TEST(Layer, detectionOutputLayerFwd) {
                            useGpu,
                            result2);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   // GPU case 1.
   useGpu = true;
   inputLoc = Matrix::create(1, 16, false, useGpu);
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 93996392d221d531f65caf465decbffdbc2d0384..62a131171fa5ae973cb3069151a582aaeac9ee0e 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -51,7 +51,7 @@ void testEvaluator(TestConfig testConf,
                    string testEvaluatorName,
                    size_t batchSize,
                    bool useGpu) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   if (useGpu) return;
 #endif
   FLAGS_use_gpu = useGpu;
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
index 308abe6816428bc0f98ec32e892622fa4a23b1ae..ffe5cfb8dbb55d0b70a5699969abaa101f05f9ce 100644
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
@@ -97,7 +96,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
       Matrix::create(subSeqStartPosition.back(), 1, false, false);
 
   std::vector<bool> mode = {false};
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   mode.push_back(true);
 #endif
 
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 090bde7b203652e3ffb1662b8f5b8937885d2608..1a46fb49153a0aa4228f58db481b950bc2d6de83 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include <cudnn.h>
 #endif
 #include <gtest/gtest.h>
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/math/MathUtils.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
@@ -258,7 +257,7 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                      true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Projection, conv) {
   /// test ConvProjection
   testProjectionConv(1, false);
@@ -422,7 +421,7 @@ TEST(Layer, depthwiseConvLayer) {
   //  'depthwise_conv' is a sepecial case of 'exconv' whose
   //  groups size equals to the input channels size.
   testDepthwiseConvLayer("exconv", /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testDepthwiseConvLayer("exconv", /* useGpu= */ true);
 #endif
 }
@@ -480,7 +479,7 @@ void testConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, convLayer) {
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testConvLayer("exconv", /* trans= */ false, /* useGpu= */ true);
   testConvLayer("cudnn_conv", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -525,7 +524,7 @@ TEST(Layer, convTransLayer) {
   for (auto useGpu : {false, true}) {
     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
   }
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -638,7 +637,7 @@ TEST(Layer, SelectiveFullyConnectedLayer) {
                 /* trans= */ false,
                 /* useGup= */ false,
                 false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testLayerGrad(config,
                 "selective_fc",
                 100,
@@ -1210,7 +1209,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
   testLayerGrad(config, "pool", 100, trans, useGpu);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
   TestConfig config;
   config.inputDefs.push_back({INPUT_DATA, "layer_0", 3200, 0});
@@ -1236,7 +1235,7 @@ TEST(Layer, PoolLayer) {
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
   testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
@@ -1309,7 +1308,7 @@ void testPool3DLayer(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, Pool3DLayer) {
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ false);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPool3DLayer("avg", /* trans= */ false, /* useGpu= */ true);
   testPool3DLayer("max", /* trans= */ false, /* useGpu= */ true);
 #endif
@@ -1695,7 +1694,7 @@ void testBatchNormLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, BatchNormalizationLayer) {
   testBatchNormLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNormLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNormLayer("cudnn_batch_norm", false, true);
@@ -1744,7 +1743,7 @@ void testBatchNorm3DLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, testBatchNorm3DLayer) {
   testBatchNorm3DLayer("batch_norm", false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testBatchNorm3DLayer("batch_norm", false, true);
   if (hl_get_cudnn_lib_version() >= int(4000)) {
     testBatchNorm3DLayer("cudnn_batch_norm", false, true);
@@ -2262,7 +2261,7 @@ void test3DConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DConvLayer) {
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   test3DConvLayer("conv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -2339,7 +2338,7 @@ void test3DDeConvLayer(const string& type, bool trans, bool useGpu) {
 
 TEST(Layer, test3DDeConvLayer) {
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   test3DDeConvLayer("deconv3d", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 857d07df3e3088be28943d9e2fe58017e9e57f4a..6cb4ca5e08eab5b979e404c9e09dcfec11086c22 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
+#include <paddle/utils/PythonUtil.h>
 #include <string>
 #include <vector>
 #include "MKLDNNTester.h"
@@ -40,12 +41,13 @@ DECLARE_bool(use_mkldnn);
 struct testFcDesc {
   int bs;
   int ic;
-  int oc;
   int ih, iw;  // oh == ow == 1
+  int oc;
 };
 
 static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_fc");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.oc);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -86,6 +88,7 @@ struct testConvDesc {
 
 static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_conv");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_num_filters(pm.oc);
   cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
   cfg.layerConfig.set_shared_biases(true);
@@ -158,6 +161,7 @@ struct testPoolDesc {
 
 static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
   cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_active_type("relu");
   cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -215,13 +219,13 @@ struct testActDesc {
 static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
   cfg.biasSize = 0;
   cfg.layerConfig.set_type("addto");
-  size_t layerSize = pm.ih * pm.ih * pm.iw;
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
   cfg.layerConfig.set_size(layerSize);
   cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
   cfg.layerConfig.add_inputs();
 }
 
-void testActivation(std::string& actType, const testActDesc& pm) {
+void testActivation(std::string actType, const testActDesc& pm) {
   // TODO(TJ): remove me when paddle support elu activation
   if (actType == "mkldnn_elu") {
     return;
@@ -240,16 +244,30 @@ TEST(MKLDNNActivation, Activations) {
   for (auto type : types) {
     /* bs, c, h, w*/
     testActivation(type, {16, 64, 32, 32});
+    testActivation(type, {2, 8, 1, 1});
   }
 }
 
-// TODO(TJ): add branch test
+DECLARE_string(config_args);
+TEST(MKLDNNLayer, branches) {
+  std::vector<std::string> cases = {"conv", "pool", "fc"};
+  for (auto name : cases) {
+    std::string config = "./gserver/tests/mkldnn_branches_" + name + ".conf";
+    for (auto channels : {2, 32}) {
+      std::ostringstream oss;
+      oss << "channels=" << channels;
+      FLAGS_config_args = oss.str();
+      MKLDNNTester::runBranchesTest(config);
+    }
+  }
+}
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   FLAGS_use_gpu = false;
   FLAGS_use_mkldnn = true;
   initMain(argc, argv);
+  initPython(argc, argv);
   FLAGS_thread_local_rand_use_global_seed = true;
   srand(1);
   return RUN_ALL_TESTS();
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index d36f72360f8ebd2033fb3e8c0e1b30911abba362..2b92211936aad1a034369bda0830bed3438cf401 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -243,7 +243,7 @@ TEST(Compare, concat_slice) {
   compareNetwork(config_file_a, config_file_b);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
   std::string config_file_b = "./gserver/tests/img_pool_b.conf";
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index ae0e3bc3d24c54eb84c7b5f5053e629607ef4310..8dc5568784295b5a2e7d4decd178d612432a1a18 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -151,7 +151,7 @@ TEST(Layer, priorBoxLayerFwd) {
                     useGpu,
                     result);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   // reset the input parameters
   variance[1] = 0.1;
   variance[3] = 0.2;
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index e11bf402c27898b8fdbd3fceeb8aeff8906352db..af6472619d1840e82787974d265d601b4a406c09 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -485,7 +485,7 @@ TEST(ProtoDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
@@ -525,7 +525,7 @@ TEST(ProtoDataProvider, constant_slots) {
       for (int numConstantSlots : {1, 2}) {
         for (int useGpu : numTwoArray) {
           for (int dataCompression : numTwoArray) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
@@ -708,7 +708,7 @@ TEST(ProtoSequenceDataProvider, test) {
               // Currently in async mode, useGpu is not supported
               continue;
             }
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             if (useGpu) {
               continue;
             }
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index db883543c306c1938eb9da188ce20ed768018efb..fe54799259d86064c4fcaec0e53707247981a1b4 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -37,7 +37,7 @@ TEST(PyDataProvider, py_fill_slots) {
   config.clear_files();
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
@@ -71,7 +71,7 @@ TEST(PyDataProvider, py_fill_nest_slots) {
   std::string dataFile = "gserver/tests/pyDataProvider/pyDataProviderList";
   config.set_files(dataFile);
   EXPECT_EQ(config.IsInitialized(), true);
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   bool useGpu = false;
 #else
   bool useGpu = true;
diff --git a/paddle/gserver/tests/test_PyDataProvider2.py b/paddle/gserver/tests/test_PyDataProvider2.py
index 2e6225519f4681238f4b40fb33764ead4a16b24a..0d0fe476ff5eac8bf8ad1c9fe09b32c1a8f73ebc 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -51,7 +51,10 @@ def test_sparse_non_value_no_seq(setting, filename):
         yield [(i + 1) * (j + 1) for j in xrange(10)]
 
 
-@provider(input_types=[sparse_vector(30000, seq_type=SequenceType.NO_SEQUENCE)])
+@provider(input_types=[
+    sparse_float_vector(
+        30000, seq_type=SequenceType.NO_SEQUENCE)
+])
 def test_sparse_value_no_seq(setting, filename):
     for i in xrange(200):
         yield [((i + 1) * (j + 1), float(j) / float(i + 1)) for j in xrange(10)]
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index ab23d00a2cb6077147f5b89664a8e2437b4cd63b..d164e382c4a804aef2417135b64cf709474d12f1 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -24,7 +24,6 @@ limitations under the License. */
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h"
 #include "paddle/math/CpuSparseMatrix.h"
-#include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -321,7 +320,7 @@ TEST(Layer, SelectiveFcLayer_train_dense_mul) {
       "filelist=gserver/tests/SelectiveFcTest/dense_mul_list";
 
   for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
     if (useGpu) {
       break;
     }
@@ -388,7 +387,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
                           outMatSelfc->getWidth(),
                           outMatSelfc->getElementCnt()));
   cpuOutMatSelfc->copyFrom(*outMatSelfc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -418,7 +417,7 @@ void testSelectiveFcLayerTrainSparseMul(const LayerConfig& config,
   MatrixPtr cpuOutMatFc(
       new CpuMatrix(outMatFc->getHeight(), outMatFc->getWidth()));
   cpuOutMatFc->copyFrom(*outMatFc, HPPL_STREAM_DEFAULT);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (useGpu) {
     hl_stream_synchronize(HPPL_STREAM_DEFAULT);
   }
@@ -443,7 +442,7 @@ TEST(Layer, SelectiveFcLayer_train_sparse_mul) {
   selLayerConfig.set_size(fcLayerWidth);
 
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testSelectiveFcLayerTrainSparseMul(selLayerConfig, true);
 #endif
 }
diff --git a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
index e1d4ae16176433b898ba88dd60550e44b4fe37be..3dbffc563462973bdc1da529d486b2a2d5a677d3 100644
--- a/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
+++ b/paddle/gserver/tests/test_SeqSliceLayerGrad.cpp
@@ -15,7 +15,6 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "paddle/testing/TestUtil.h"
@@ -195,7 +194,7 @@ TEST(Layer, SeqSliceLayer) {
   vector<vector<real>> ends;
 
   std::vector<bool> mode = {false};
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   mode.push_back(true);
 #endif
   genSeqInfo(seqStartPos, subSeqStartPos);
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 55427e2f12fd7b77c6eea1f65b3229e6fd29d71d..da829460061d38f363317e33daeb65cfa705bb8e 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -199,7 +199,7 @@ TEST(Layer, WarpCTCLayer) {
     for (auto batchSize : {1, 10, 32}) {
       for (auto normByTimes : {false, true}) {
         for (auto useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
           if (useGpu) continue;
 #endif
           LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
diff --git a/paddle/math/MKLDNNMatrix.cpp b/paddle/math/MKLDNNMatrix.cpp
index 0778bb63b7b3bca9b3d2647ca43dad72d783950a..21a8f73c3e650d4b3c3b86247594cd965f4ead35 100644
--- a/paddle/math/MKLDNNMatrix.cpp
+++ b/paddle/math/MKLDNNMatrix.cpp
@@ -18,7 +18,7 @@ using namespace mkldnn;  // NOLINT
 
 namespace paddle {
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::primitive_desc pd, MatrixPtr m) {
   memory::desc md = pd.desc();
   size_t ndims = md.data.ndims;
   int* dims = md.data.dims;
@@ -41,12 +41,12 @@ MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m, memory::primitive_desc pd) {
   return std::make_shared<MKLDNNMatrix>(cpuMatrix, pd);
 }
 
-MKLDNNMatrixPtr MKLDNNMatrix::create(MatrixPtr m,
-                                     memory::dims dims,
+MKLDNNMatrixPtr MKLDNNMatrix::create(memory::dims dims,
                                      memory::format fmt,
                                      engine& eg,
+                                     MatrixPtr m,
                                      mkldnn::memory::data_type dtype) {
-  return create(m, memory::primitive_desc(memory::desc(dims, dtype, fmt), eg));
+  return create(createPrimitiveDesc(dims, fmt, eg, dtype), m);
 }
 
 std::shared_ptr<reorder> MKLDNNMatrix::createReorder(const MKLDNNMatrixPtr& src,
diff --git a/paddle/math/MKLDNNMatrix.h b/paddle/math/MKLDNNMatrix.h
index c843115eb9a5be50d6ff873f1510844228c9d89f..fe755d096da9713e39581a909e5d21aa93d69f0f 100644
--- a/paddle/math/MKLDNNMatrix.h
+++ b/paddle/math/MKLDNNMatrix.h
@@ -40,24 +40,37 @@ public:
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory primitive_desc
    */
-  static MKLDNNMatrixPtr create(MatrixPtr m, mkldnn::memory::primitive_desc pd);
+  static MKLDNNMatrixPtr create(mkldnn::memory::primitive_desc pd,
+                                MatrixPtr m = nullptr);
 
   /**
    * Create MKLDNNMatrix from a MatrixPtr and memory details info
    */
   static MKLDNNMatrixPtr create(
-      MatrixPtr m,
       mkldnn::memory::dims dims,
       mkldnn::memory::format fmt,
       mkldnn::engine& eg,
+      MatrixPtr m = nullptr,
       mkldnn::memory::data_type dtype = mkldnn::memory::data_type::f32);
 
+  /**
+   * Create primitive descriptor.
+   * default with f32 dtype
+   */
+  static mkldnn::memory::primitive_desc createPrimitiveDesc(
+      const mkldnn::memory::dims dims,
+      const mkldnn::memory::format& fmt,
+      const mkldnn::engine& eg,
+      const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
+    return mkldnn::memory::primitive_desc(memory::desc(dims, dtype, fmt), eg);
+  }
+
   /**
    * Create Memory descriptor.
    * default with any format and f32 dtype
    */
   static mkldnn::memory::desc createMemoryDesc(
-      const mkldnn::memory::dims& dims,
+      const mkldnn::memory::dims dims,
       const mkldnn::memory::format& fmt = mkldnn::memory::format::any,
       const mkldnn::memory::data_type& dtype = mkldnn::memory::data_type::f32) {
     return mkldnn::memory::desc(dims, dtype, fmt);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 0023b4d0f5da500f380ecb836b7c54e050b13d67..c3e34d5309d9ca8a32d7b0a8043e668cdb5be54b 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -670,7 +670,7 @@ void GpuMatrix::leftMul(Matrix& a, real scaleAB, real scaleT) {
 }
 
 void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -694,7 +694,7 @@ void GpuMatrix::selectRows(Matrix& table, IVector& ids) {
 }
 
 void GpuMatrix::addToRows(Matrix& table, IVector& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(dynamic_cast<GpuMatrix*>(&table));
   CHECK(table.useGpu());
   CHECK(ids.useGpu());
@@ -741,7 +741,7 @@ void GpuMatrix::rowMax(Matrix& max) {
 }
 
 void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
index dbb829c4e24a659e4a97c0a3ba4c5c78b68815d3..9ef5b89680b00981188d78cb312dc75e2c0a79ee 100644
--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
@@ -99,7 +99,11 @@ public:
   /**
    * @brief clear local buffer. It only affect auto-growth buffer.
    */
-  inline void clear() { rowStore_.clear(); }
+  inline void clear() {
+    // vector::clear() keeps capacity; swap with an empty one to free memory.
+    std::vector<real, AlignedAllocator<real, 32>> empty;
+    rowStore_.swap(empty);
+  }
 
   /**
    * @brief get current number of rows.
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 6370c77386688a334fa0de8b4e2b272882e9e2b0..284b68d590ba655395c0186d8ea86d6855c6fc50 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -836,7 +836,7 @@ void GpuSparseMatrix::zeroMem() {
 }
 
 void GpuSparseMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   CHECK(maxIds.useGpu() && maxVal.useGpu()) << "Matrix type are not equal";
   size_t numSamples = getHeight();
   size_t beam = maxVal.getWidth();
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index eb87ee9bb7936d27c0c32a1a4b35ff49871c0a10..ff72672e3ab77212b309fcfea835839a916fa632 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -172,7 +172,7 @@ void GpuVectorT<T>::isEqualTo(const VectorT<T>& b, const T& value) {
 
 template <class T>
 void GpuVectorT<T>::selectFrom(const VectorT<T>& src, const VectorT<int>& ids) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   hl_vector_select_from<T>(this->getData(),
                            this->getSize(),
                            src.getData(),
@@ -850,7 +850,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
                                 size_t size)
     : sync_(nullptr) {
   CHECK_LE(offset + size, static_cast<size_t>(src.getSize()));
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   SyncedFlag* flag = src.getSync();
   if (*flag == DATA_AT_CPU) {
     src.copyToGpu();  // will set synchronous data between CPU and GPU
@@ -861,7 +861,7 @@ CpuGpuVectorT<T>::CpuGpuVectorT(CpuGpuVectorT<T>& src,
   auto cMemHandle = (src.getVector(false))->getMemoryHandle();
   cpuVectorT_ = std::make_shared<CpuVectorT<T>>(
       size, std::dynamic_pointer_cast<CpuMemoryHandle>(cMemHandle), offset);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   auto gMemHandle = (src.getVector(true))->getMemoryHandle();
   gpuVectorT_ = std::make_shared<GpuVectorT<T>>(
       size, std::dynamic_pointer_cast<GpuMemoryHandle>(gMemHandle), offset);
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 1ca70ea84c867b83013625eaee141f5b75fad4ae..1fecf659e5080c7d25f5f76b92b15f75eaab6ce3 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -68,7 +68,7 @@ void testPoolAllocator() {
 
 TEST(Allocator, Pool) {
   testPoolAllocator<CpuAllocator>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testPoolAllocator<GpuAllocator>();
 #endif
 }
@@ -92,7 +92,7 @@ TEST(MemoryHandle, Cpu) {
   EXPECT_EQ(ptr1, ptr2);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MemoryHandle, Gpu) {
   int numGpu = hl_get_device_count();
 
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 22ce39701fca7b650fc03794cb0701e0987d2dae..1766257860b0b13e9f0ce898438e7c2d644f545e 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithoutArg to compares the
  * implementation of CPU and GPU member function in
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 58bc43a38ba9465a832fcd0652e6309c403577e3..c72f89c8244b1209e490b09387c2ee6352426ce1 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 04c856453d2ec4ad764e37ae430e3e30ac0dea0b..25e0ba11ded96dd78aedc3c297507d0555d80d74 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -94,7 +94,7 @@ void testWrapper(F&& f) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(ExecViaCpu, test1) {
   testWrapper(f);
   testWrapper(&f);
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index e6b5dba446b5a0022ade76b188895c4e0e2a22b4..d9f146f0d1f63480ddee784071b43ff85da0b15c 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
@@ -162,4 +162,4 @@ int main(int argc, char** argv) {
   return RUN_ALL_TESTS();
 }
 
-#endif /* PADDLE_ONLY_CPU */
+#endif /* PADDLE_WITH_CUDA */
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 1c21da5b76e95603258a5006d0c57b00126e65b9..2f99fa3581e14b91acc0b294856619f4ae2b3483 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /**
  * This test file use autotest::AutoCompare and cmpWithArg to compares the
  * implementation of CPU and GPU member function in Matrix.cpp.
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index c0572dfdbf738a4dfad04811b3a3e1b65487ff6d..8abbe8d82e02b7d1738fe7e6d0c8d494166e7892 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -47,7 +47,7 @@ struct MatrixPara {
   SparseFormat format;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 void test_sparse_matrix_mul(MatrixPara paraA,
                             MatrixPara paraB,
                             MatrixPara paraC) {
@@ -452,7 +452,7 @@ TEST(Matrix, SparseMatrixCSRFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSR, true);
   matC->trimFrom(*mat);
@@ -546,7 +546,7 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   matB->trimFrom(*mat);
   checkSMatrixEqual2(matA, matB);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuSparseMatrixPtr matC = std::make_shared<GpuSparseMatrix>(
       height, trimedWidth, height, FLOAT_VALUE, SPARSE_CSC, true);
   matC->trimFrom(*mat);
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 31b693afa8bd50f77a8efb67769e6215dd755bd3..d03698dee25fdd6dd49f2a3fdb5c605333440f49 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -270,7 +270,7 @@ TEST(Unary, BaseOp) {
   TestUnaryVectorT<CpuIVector, int> testCpuIVector(
       testUnaryBaseOpInt<CpuIVector>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
   TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
   TestUnaryVectorT<GpuIVector, int> testGpuIVector(
@@ -317,7 +317,7 @@ void testUnayrMathOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, MathOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrMathOp<GpuMatrix>);
 #endif
 }
@@ -374,7 +374,7 @@ void testUnayrCompareOp(Tensor& A1, Tensor& A2) {
 TEST(Unary, CompareOp) {
   TestUnaryMatrix<CpuMatrix> testCpu(testUnayrCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestUnaryMatrix<GpuMatrix> testGpu(testUnayrCompareOp<GpuMatrix>);
 #endif
 }
@@ -536,7 +536,7 @@ void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, BaseOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
 #endif
 }
@@ -710,7 +710,7 @@ void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, MathOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
 #endif
 }
@@ -810,7 +810,7 @@ void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
 TEST(Binary, CompareOp) {
   TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
 #endif
 }
@@ -955,7 +955,7 @@ void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, BaseOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
 #endif
 }
@@ -1058,7 +1058,7 @@ void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
 TEST(Ternary, CompareOp) {
   TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
 #endif
 }
@@ -1086,7 +1086,7 @@ void testQuaternaryAdd(
 TEST(Quaternary, BaseOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
 #endif
 }
@@ -1156,7 +1156,7 @@ void testQuaternaryCompareOp(
 TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_GPU
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 4a88844b43ef40af988d2b391d2bef4568dea9b7..5ae0aa036f6bfc1e5bd4e955277c4efff8c739ce 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -91,7 +91,7 @@ int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
 typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;
 
 void testCase(testMatrixFunc matrixFunc) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   for (auto useGpu : {false, true}) {
 #else
   for (auto useGpu : {false}) {
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 4eb9837909ffaaf0f483ab65ece7a0b29fd49319..b70a61976402fd0a7cfee8382fd926fcf28486d5 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   const int nx = 100;
   const int ny = 50;
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 92afab4ff7f5ff4acc219c5ac783733340c5726a..04f23cff55db45c39049538545430bc5996cce5d 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -72,7 +72,7 @@ void testLazyAssign(int height, int width) {
 
 TEST(lazyAssign, CPU) { testMatrixCase(testLazyAssign<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(lazyAssign, GPU) { testMatrixCase(testLazyAssign<GpuMatrix>); }
 #endif
 
@@ -142,6 +142,6 @@ void testSgdUpdate(int height, int width) {
 
 TEST(sgdUpdate, CPU) { testMatrixCase(testSgdUpdate<CpuMatrix>); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(sgdUpdate, GPU) { testMatrixCase(testSgdUpdate<GpuMatrix>); }
 #endif
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 061fb22e3fd744d9d9895fd1008089e4a6ce6a0f..7e5a1db44a5302e3b4e5d2768755824666e880ba 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
 /// only cpu version.
 
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index 60ebae015381a3901c14d0cd4c1225e54ac5726f..c7c07c817a08d78ddcbf8218e8c4a9d22f4990bc 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <gtest/gtest.h>
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index a9185a4b24b13ca0287b0f67375c4599e8b9ac78..2b2a391b9d04a9f7fa4986a6b6dd5cd8e5385f1f 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 /// This unittest checks GpuSparseMatrix/CpuSparseMatrix get same result,
 //  so disable when
 /// only cpu version.
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index bb44970109c05d239e6b92d90b2079b752fa0104..e212f7737a4093125857126cabb5b1a7b3e055b1 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -175,14 +175,14 @@ void* BuddyAllocator::SystemAlloc(size_t size) {
 }
 
 BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (system_allocator_->UseGpu()) {
     if ((total_used_ + total_free_) == 0) {
       // Compute the maximum allocation size for the first allocation.
       max_chunk_size_ = platform::GpuMaxChunkSize();
     }
   }
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
   // Allocate a new maximum sized block
   size_t index = 0;
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index a270bd59581520859d43cddd2fc0cfa72080f46d..33166d9ce23a4a345fc00a65adf63281b13643c3 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -62,7 +62,7 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool CPUAllocator::UseGpu() const { return false; }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
@@ -134,7 +134,7 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
 
 bool GPUAllocator::UseGpu() const { return true; }
 
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 82ba322e057575c460b1d51d719c9b0fa459273e..552cab4f96ff21a6f3c66209eb62150e92996826 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -40,7 +40,7 @@ class CPUAllocator : public SystemAllocator {
   virtual bool UseGpu() const;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 class GPUAllocator : public SystemAllocator {
  public:
   virtual void* Alloc(size_t& index, size_t size);
@@ -51,7 +51,7 @@ class GPUAllocator : public SystemAllocator {
   size_t gpu_alloc_size_ = 0;
   size_t fallback_alloc_size_ = 0;
 };
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index ba44e06ddb68e92e4086a8006b868557b0c89b50..6a8558937bf0c924e5f48605ff066e2789fd59b6 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -56,10 +56,10 @@ TEST(CPUAllocator, LockMem) {
   TestAllocator(a, 0);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
   TestAllocator(a, 0);
 }
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
index c96a697a7e022684688b31c05da43e52812100d8..1df88a6da9fb0c50d0d7ecd083c0533d8a886a67 100644
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
@@ -26,7 +26,7 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
   std::memcpy(dst, src, num);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <>
 void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
                                                   void* dst,
@@ -89,7 +89,7 @@ void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
   platform::GpuMemcpySync(dst, src, num, cudaMemcpyDeviceToDevice);
 }
 
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
index 2b9c0eada6e8406fc81baec7f331a8dd5b8b0ec1..9b36182c2b619317da31310141823442d8fd3f94 100644
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
@@ -33,7 +33,7 @@ namespace memory {
 template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 /**
  * \brief   Copy memory from one place to another place.
@@ -53,7 +53,7 @@ template <typename DstPlace, typename SrcPlace>
 void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
           cudaStream_t stream);
 
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 29bc26f9d3bca0e30896657431f9a9bb1dac0d1d..8e561528f0e7e6ff524fc51b4776efc4e5bd28cd 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -14,11 +14,6 @@ limitations under the License. */
 
 #include "paddle/memory/memory.h"
 
-#include <algorithm>  // for transform
-#include <cstring>    // for memcpy
-#include <memory>     // for unique_ptr
-#include <mutex>      // for call_once
-
 #include "glog/logging.h"
 
 #include "paddle/memory/detail/buddy_allocator.h"
@@ -32,19 +27,14 @@ namespace memory {
 
 using BuddyAllocator = detail::BuddyAllocator;
 
-std::once_flag cpu_allocator_flag;
-std::once_flag gpu_allocator_flag;
-
 BuddyAllocator* GetCPUBuddyAllocator() {
-  static std::unique_ptr<BuddyAllocator> a{nullptr};
-
-  std::call_once(cpu_allocator_flag, [&]() {
-    a.reset(new BuddyAllocator(new detail::CPUAllocator,
-                               platform::CpuMinChunkSize(),
-                               platform::CpuMaxChunkSize()));
-  });
-
-  return a.get();
+  // Created once on the first call: C++11 guarantees that initialization of
+  // a function-local static is thread-safe. The allocator is never deleted.
+  static detail::BuddyAllocator* a =
+      new detail::BuddyAllocator(new detail::CPUAllocator,
+                                 platform::CpuMinChunkSize(),
+                                 platform::CpuMaxChunkSize());
+  return a;
 }
 
 template <>
@@ -62,38 +52,27 @@ size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  using BuddyAllocVec = std::vector<BuddyAllocator*>;
-  static std::unique_ptr<BuddyAllocVec, void (*)(BuddyAllocVec * p)> as{
-      new BuddyAllocVec, [](BuddyAllocVec* p) {
-        std::for_each(p->begin(), p->end(),
-                      [](BuddyAllocator* p) { delete p; });
-      }};
-
-  // GPU buddy allocators
-  auto& allocators = *as.get();
-
-  // GPU buddy allocator initialization
-  std::call_once(gpu_allocator_flag, [&]() {
-    int gpu_num = platform::GetDeviceCount();
-    allocators.reserve(gpu_num);
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
     for (int gpu = 0; gpu < gpu_num; gpu++) {
       platform::SetDeviceId(gpu);
-      allocators.emplace_back(new BuddyAllocator(new detail::GPUAllocator,
-                                                 platform::GpuMinChunkSize(),
-                                                 platform::GpuMaxChunkSize()));
+      as[gpu] = new BuddyAllocator(new detail::GPUAllocator,
+                                   platform::GpuMinChunkSize(),
+                                   platform::GpuMaxChunkSize());
     }
     VLOG(3) << "\n\nNOTE: each GPU device use "
             << FLAGS_fraction_of_gpu_memory_to_use * 100 << "% of GPU memory.\n"
             << "You can set environment variable '"
             << platform::kEnvFractionGpuMemoryToUse
             << "' to change the fraction of GPU usage.\n\n";
-  });
-
+  }
   platform::SetDeviceId(gpu_id);
-  return allocators[gpu_id];
+  return as[gpu_id];
 }
 
 template <>
@@ -111,7 +90,7 @@ size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
   return GetGPUBuddyAllocator(place.device)->Used();
 }
 
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index 53cc63a098d0802479e3a371717adb7596c249ed..2444931e26774ae80b916fbb7bd46ff93025d9ed 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -80,7 +80,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 size_t align(size_t size, paddle::platform::GPUPlace place) {
   size += sizeof(paddle::memory::detail::Metadata);
@@ -135,4 +135,4 @@ TEST(BuddyAllocator, GPUMultAlloc) {
   }
 }
 
-#endif  // PADDLE_ONLY_CPU
+#endif  // PADDLE_WITH_CUDA
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 21166354937c378dc3f295f9011d034eb24cfc7c..d2d70d8be71208cfa9673f6a6936b1bca16d7426 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -55,6 +55,27 @@ function(op_library TARGET)
         set(pybind_flag 1)
     endif()
 
+    # pool_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_op")
+        set(pybind_flag 1)
+        # It's enough to add just one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
+    endif()
+
+    # pool_with_index_op contains several operators
+    if ("${TARGET}" STREQUAL "pool_with_index_op")
+        set(pybind_flag 1)
+        # It's enough to add just one operator to pybind
+        file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
+    endif()
+
+    # save_restore_op contains several operators
+    if ("${TARGET}" STREQUAL "save_restore_op")
+        set(pybind_flag 1)
+        # It's enough to add just one operator to pybind
+        file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n")
+    endif()
+
     # activation_op contains several operators
     if ("${TARGET}" STREQUAL "activation_op")
         set(pybind_flag 1)
@@ -70,8 +91,9 @@ function(op_library TARGET)
     endif()
 
     # pybind USE_NO_KERNEL_OP
+    # HACK: if REGISTER_OP_CPU_KERNEL is present, the operator must have a kernel
     file(READ ${TARGET}.cc TARGET_CONTENT)
-    string(REGEX MATCH "OperatorWithKernel" regex_result "${TARGET_CONTENT}")
+    string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
     string(REPLACE "_op" "" TARGET "${TARGET}")
     if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
         file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(${TARGET});\n")
@@ -97,12 +119,22 @@ set(DEPS_OPS
     recurrent_op
     cond_op
     cross_entropy_op
-    softmax_with_cross_entropy_op)
+    softmax_with_cross_entropy_op
+    sum_op
+    pool_op
+    pool_with_index_op
+    lstm_op)
+
+
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
   DEPS framework_proto tensor net_op)
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
-op_library(cross_entropy_op DEPS cross_entropy_function)
-op_library(softmax_with_cross_entropy_op DEPS cross_entropy_function softmax_function)
+op_library(cross_entropy_op DEPS cross_entropy)
+op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
+op_library(sum_op DEPS net_op)
+op_library(pool_op DEPS pooling)
+op_library(pool_with_index_op DEPS pooling)
+op_library(lstm_op DEPS sequence2batch lstm_compute)
 
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
@@ -115,3 +147,4 @@ cc_test(gather_test SRCS gather_test.cc DEPS tensor)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)
 cc_test(scatter_test SRCS scatter_test.cc DEPS tensor)
 cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory)
+cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc DEPS dynamic_recurrent_op recurrent_op tensor_array)
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index 82010bfb53e58a0836c99c353590f4e32e25ac4a..e0a00ecaf04335800eab9e2e5a03628a2ce2ca8d 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -21,8 +21,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Inference"),
                    "Input(Inference) of AccuracyOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"),
@@ -70,5 +69,8 @@ information, or not. But the output only shares the LoD with input `Inference`.
 
 namespace ops = paddle::operators;
 REGISTER_OP_WITHOUT_GRADIENT(accuracy, ops::AccuracyOp, ops::AccuracyOpMaker);
-REGISTER_OP_CPU_KERNEL(accuracy,
-                       ops::AccuracyKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    accuracy, ops::AccuracyKernel<paddle::platform::CPUPlace, float>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, int>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, double>,
+    ops::AccuracyKernel<paddle::platform::CPUPlace, int64_t>);
diff --git a/paddle/operators/accuracy_op.cu b/paddle/operators/accuracy_op.cu
index 75e8a989036f0b818687e1fec3e600bb90e86b22..54e6ab99dc8c8ff1afbc636e6595cd67fb64eccf 100644
--- a/paddle/operators/accuracy_op.cu
+++ b/paddle/operators/accuracy_op.cu
@@ -21,9 +21,9 @@ namespace paddle {
 namespace operators {
 using platform::PADDLE_CUDA_NUM_THREADS;
 
-template <int BlockSize>
-__global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
-                                   const int* labeldata, float* accuracy) {
+template <typename T, int BlockSize>
+__global__ void AccuracyCudaKernel(const int N, const int D, const T* Xdata,
+                                   const T* labeldata, float* accuracy) {
   int count = 0;
   __shared__ int total[BlockSize];
 
@@ -47,7 +47,7 @@ __global__ void AccuracyCudaKernel(const int N, const int D, const int* Xdata,
 }
 
 template <typename T>
-class AccuracyOpCUDAKernel : public framework::OpKernel {
+class AccuracyOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -57,8 +57,8 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
     auto* accuracy = ctx.Output<Tensor>("Accuracy");
     // FIXME(typhoonzero): only support indices currently
     // if add support for output values, how to detect the data type?
-    const int* inference_data = inference->data<int>();
-    const int* label_data = label->data<int>();
+    const T* inference_data = inference->data<T>();
+    const T* label_data = label->data<T>();
     float* accuracy_data = accuracy->mutable_data<float>(ctx.GetPlace());
 
     size_t num_samples = inference->dims()[0];
@@ -69,7 +69,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
       return;
     }
 
-    AccuracyCudaKernel<PADDLE_CUDA_NUM_THREADS><<<
+    AccuracyCudaKernel<T, PADDLE_CUDA_NUM_THREADS><<<
         1, PADDLE_CUDA_NUM_THREADS, 0,
         reinterpret_cast<const platform::CUDADeviceContext&>(
             ctx.device_context())
@@ -81,5 +81,7 @@ class AccuracyOpCUDAKernel : public framework::OpKernel {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_GPU_KERNEL(accuracy,
-                       paddle::operators::AccuracyOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(accuracy, paddle::operators::AccuracyOpCUDAKernel<float>,
+                       paddle::operators::AccuracyOpCUDAKernel<double>,
+                       paddle::operators::AccuracyOpCUDAKernel<int>,
+                       paddle::operators::AccuracyOpCUDAKernel<int64_t>);
diff --git a/paddle/operators/accuracy_op.h b/paddle/operators/accuracy_op.h
index fe704efe1c979f4fc6a5a37184e51b416f5e517f..12c6b9aac8819caedbc02017cee81b37322bb72a 100644
--- a/paddle/operators/accuracy_op.h
+++ b/paddle/operators/accuracy_op.h
@@ -35,7 +35,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class AccuracyKernel : public framework::OpKernel {
+class AccuracyKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* inference = ctx.Input<Tensor>("Inference");
diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc
index f77e1c572e33533ac672e3d476a7e6dad122031f..ee4f9b0ef29cc73907bc09fb6014850cb4e58a67 100644
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
@@ -21,8 +21,7 @@ class ActivationOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
     ctx->ShareLoD("X", /*->*/ "Y");
   }
@@ -32,8 +31,7 @@ class ActivationOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Y"));
   }
 };
@@ -49,6 +47,18 @@ class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class LogSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogSigmoidOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of LogSigmoid operator");
+    AddOutput("Y", "Output of LogSigmoid operator");
+    AddComment(
+        "Logsigmoid activation operator, logsigmoid = log (1 / (1 + exp(-x)))");
+  }
+};
+
 class ExpOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ExpOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -69,6 +79,39 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+template <typename AttrType>
+class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LeakyReluOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of LeakyRelu operator");
+    AddOutput("Y", "Output of LeakyRelu operator");
+    AddComment(
+        "LeakyRelu activation operator, "
+        "leaky_relu = max(x, alpha * x)");
+    AddAttr<AttrType>("alpha", "The small negative slope")
+        .SetDefault(static_cast<AttrType>(0.02f));
+  }
+};
+
+template <typename AttrType>
+class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softshrink operator");
+    AddOutput("Y", "Output of Softshrink operator");
+    AddComment(
+        "Softshrink activation operator, "
+        "softshrink = x - lambda, if x > lambda;"
+        " x + lambda, if x < -lambda; 0 otherwise");
+    AddAttr<AttrType>("lambda", "non-negative offset")
+        .SetDefault(static_cast<AttrType>(0.5f));
+  }
+};
+
 class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   TanhOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -81,6 +124,35 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class TanhShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  TanhShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of TanhShrink operator");
+    AddOutput("Y", "Output of TanhShrink operator");
+    AddComment("TanhShrink activation operator, tanhshrink(x) = x - tanh(x)");
+  }
+};
+
+template <typename AttrType>
+class HardShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HardShrinkOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of HardShrink operator");
+    AddOutput("Y", "Output of HardShrink operator");
+    AddComment(
+        "HardShrink activation operator, "
+        "hard_shrink(x) = x if x > threshold; "
+        "hard_shrink(x) = x if x < -threshold; "
+        "hard_shrink(x) = 0 otherwise");
+    AddAttr<AttrType>("threshold", "The value of threshold for HardShrink")
+        .SetDefault(static_cast<AttrType>(0.5));
+  }
+};
+
 class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   SqrtOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
@@ -132,6 +204,28 @@ class SquareOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftplusOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softplus operator");
+    AddOutput("Y", "Output of Softplus operator");
+    AddComment("Softplus activation operator, softplus(x) = log(1 + exp(x))");
+  }
+};
+
+class SoftsignOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SoftsignOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Softsign operator");
+    AddOutput("Y", "Output of Softsign operator");
+    AddComment("Softsign activation operator, softsign(x) = x / (1 + |x|)");
+  }
+};
+
 template <typename AttrType>
 class BReluOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -163,6 +257,40 @@ class SoftReluOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+template <typename AttrType>
+class ELUOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ELUOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor) The input of ELU operator, it shouldn't be empty. Input "
+             "is flattened and treated as a 1D array.");
+    AddOutput("Y",
+              "(Tensor) The output of ELU operator. It has the same shape as "
+              "the input.");
+    AddAttr<AttrType>(
+        "alpha", "(float, default 1.0) Alpha value in the elu formulation.")
+        .SetDefault(static_cast<AttrType>(1.));
+    AddComment(R"DOC(
+        ELU activation operator. It applies this element-wise computation on
+        the input: f(x) = max(0, x) + min(0, alpha * (exp(x) - 1)).
+        Check .. _Link: https://arxiv.org/abs/1511.07289 for more details.)DOC");
+  }
+};
+
+template <typename AttrType>
+class Relu6OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Relu6OpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Relu6 operator");
+    AddOutput("Y", "Output of Relu6 operator");
+    AddComment("Relu6 activation operator, relu6 = min(max(0, x), 6)");
+    AddAttr<AttrType>("threshold", "The threshold value of Relu6")
+        .SetDefault(static_cast<AttrType>(6));
+  }
+};
+
 template <typename AttrType>
 class PowOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
@@ -191,115 +319,139 @@ class STanhOpMaker : public framework::OpProtoAndCheckerMaker {
   }
 };
 
+template <typename AttrType>
+class ThresholdedReluOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ThresholdedReluOpMaker(framework::OpProto *proto,
+                         framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of ThresholdedRelu operator");
+    AddOutput("Y", "Output of ThresholdedRelu operator");
+    AddComment(
+        "ThresholdedRelu activation operator, "
+        "thresholded_relu = x for x > threshold, "
+        "thresholded_relu = 0 otherwise.");
+    AddAttr<AttrType>("threshold", "The threshold location of activation")
+        .SetDefault(static_cast<AttrType>(1.0));
+  }
+};
+
+template <typename AttrType>
+class HardSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HardSigmoidOpMaker(framework::OpProto *proto,
+                     framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of HardSigmoid operator");
+    AddOutput("Y", "Output of HardSigmoid operator");
+    AddComment(R"DOC(
+Hard Sigmoid activation operator.
+
+Segment-wise linear approximation of sigmoid[1].
+This is much faster than sigmoid.
+
+hard_sigmoid = max(0, min(1, slope * x + offset))
+
+The slope should be positive. The offset can be either positive or negative.
+The default slope and offset are set from [1].
+It is recommended to use the defaults for this activation.
+
+References:
+  [1] Noisy Activation Functions
+      (https://arxiv.org/abs/1603.00391)
+
+    )DOC");
+    AddAttr<AttrType>("slope", "Slope for linear approximation of sigmoid")
+        .SetDefault(static_cast<AttrType>(0.2));
+    AddAttr<AttrType>("offset", "Offset for linear approximation of sigmoid")
+        .SetDefault(static_cast<AttrType>(0.5));
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
 REGISTER_OP(sigmoid, ops::ActivationOp, ops::SigmoidOpMaker, sigmoid_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(sigmoid,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::SigmoidFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                            ops::SigmoidGradFunctor<float>>);
+
+REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
+            logsigmoid_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    exp,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::ExpFunctor>);
-REGISTER_OP_CPU_KERNEL(exp_grad,
-                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
-                                                 float, ops::ExpGradFunctor>);
 
 REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::ReluFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::ReluGradFunctor<float>>);
 
 REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    tanh,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::TanhFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    tanh_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::TanhGradFunctor<float>>);
+
+REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
+            tanh_shrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker<float>,
+            softshrink_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    sqrt,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::SqrtFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    sqrt_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                         ops::SqrtGradFunctor<float>>);
 
 REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    abs,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::AbsFunctor>);
-REGISTER_OP_CPU_KERNEL(abs_grad,
-                       ops::ActivationGradKernel<paddle::platform::CPUPlace,
-                                                 float, ops::AbsGradFunctor>);
 
 REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
             reciprocal_grad, ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(reciprocal,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::ReciprocalFunctor<float>>);
-REGISTER_OP_CPU_KERNEL(
-    reciprocal_grad,
-    ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                              ops::ReciprocalGradFunctor<float>>);
 
 REGISTER_OP(log, ops::ActivationOp, ops::LogOpMaker, log_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    log,
-    ops::ActivationKernel<paddle::platform::CPUPlace, float, ops::LogFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    log_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                        ops::LogGradFunctor<float>>);
 
 REGISTER_OP(square, ops::ActivationOp, ops::SquareOpMaker, square_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::CPUPlace, float,
-                                             ops::SquareFunctor>);
-REGISTER_OP_CPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::CPUPlace, float,
-                                           ops::SquareGradFunctor<float>>);
+
+REGISTER_OP(softplus, ops::ActivationOp, ops::SoftplusOpMaker, softplus_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(softsign, ops::ActivationOp, ops::SoftsignOpMaker, softsign_grad,
+            ops::ActivationOpGrad);
 
 REGISTER_OP(brelu, ops::ActivationOp, ops::BReluOpMaker<float>, brelu_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(brelu,
-                       ops::BReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(brelu_grad,
-                       ops::BReluGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(leaky_relu, ops::ActivationOp, ops::LeakyReluOpMaker<float>,
+            leaky_relu_grad, ops::ActivationOpGrad);
 
 REGISTER_OP(soft_relu, ops::ActivationOp, ops::SoftReluOpMaker<float>,
             soft_relu_grad, ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(soft_relu,
-                       ops::SoftReluKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(elu, ops::ActivationOp, ops::ELUOpMaker<float>, elu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(relu6, ops::ActivationOp, ops::Relu6OpMaker<float>, relu6_grad,
+            ops::ActivationOpGrad);
 
 REGISTER_OP(pow, ops::ActivationOp, ops::PowOpMaker<float>, pow_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(pow, ops::PowKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(pow_grad,
-                       ops::PowGradKernel<paddle::platform::CPUPlace, float>);
 
 REGISTER_OP(stanh, ops::ActivationOp, ops::STanhOpMaker<float>, stanh_grad,
             ops::ActivationOpGrad);
-REGISTER_OP_CPU_KERNEL(stanh,
-                       ops::STanhKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(stanh_grad,
-                       ops::STanhGradKernel<paddle::platform::CPUPlace, float>);
+
+REGISTER_OP(hard_shrink, ops::ActivationOp, ops::HardShrinkOpMaker<float>,
+            hard_shrink_grad, ops::ActivationOpGrad);
+
+REGISTER_OP(thresholded_relu, ops::ActivationOp,
+            ops::ThresholdedReluOpMaker<float>, thresholded_relu_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(hard_sigmoid, ops::ActivationOp, ops::HardSigmoidOpMaker<float>,
+            hard_sigmoid_grad, ops::ActivationOpGrad);
+
+#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, functor, grad_functor)        \
+  REGISTER_OP_CPU_KERNEL(                                                      \
+      act_type,                                                                \
+      ops::ActivationKernel<paddle::platform::CPUPlace, ops::functor<float>>); \
+  REGISTER_OP_CPU_KERNEL(act_type##_grad,                                      \
+                         ops::ActivationGradKernel<paddle::platform::CPUPlace, \
+                                                   ops::grad_functor<float>>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CPU_KERNEL);
diff --git a/paddle/operators/activation_op.cu b/paddle/operators/activation_op.cu
index feed1302b292a546f88fa35457c86aa2cfdaa307..7b7644519d4e9cadcc4ca62ccb599262feffa660 100644
--- a/paddle/operators/activation_op.cu
+++ b/paddle/operators/activation_op.cu
@@ -17,84 +17,12 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(sigmoid,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::SigmoidFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    sigmoid_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                            ops::SigmoidGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    exp,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::ExpFunctor>);
-REGISTER_OP_GPU_KERNEL(exp_grad,
-                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
-                                                 float, ops::ExpGradFunctor>);
-REGISTER_OP_GPU_KERNEL(relu,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::ReluFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    relu_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::ReluGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    tanh,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::TanhFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    tanh_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::TanhGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    sqrt,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::SqrtFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    sqrt_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                         ops::SqrtGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    abs,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::AbsFunctor>);
-REGISTER_OP_GPU_KERNEL(abs_grad,
-                       ops::ActivationGradKernel<paddle::platform::GPUPlace,
-                                                 float, ops::AbsGradFunctor>);
-
-REGISTER_OP_GPU_KERNEL(reciprocal,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::ReciprocalFunctor<float>>);
-REGISTER_OP_GPU_KERNEL(
-    reciprocal_grad,
-    ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                              ops::ReciprocalGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(
-    log,
-    ops::ActivationKernel<paddle::platform::GPUPlace, float, ops::LogFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    log_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                        ops::LogGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(square,
-                       ops::ActivationKernel<paddle::platform::GPUPlace, float,
-                                             ops::SquareFunctor>);
-REGISTER_OP_GPU_KERNEL(
-    square_grad, ops::ActivationGradKernel<paddle::platform::GPUPlace, float,
-                                           ops::SquareGradFunctor<float>>);
-
-REGISTER_OP_GPU_KERNEL(brelu,
-                       ops::BReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(brelu_grad,
-                       ops::BReluGradKernel<paddle::platform::GPUPlace, float>);
-
-REGISTER_OP_GPU_KERNEL(soft_relu,
-                       ops::SoftReluKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    soft_relu_grad, ops::SoftReluGradKernel<paddle::platform::GPUPlace, float>);
-
-REGISTER_OP_GPU_KERNEL(pow, ops::PowKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(pow_grad,
-                       ops::PowGradKernel<paddle::platform::GPUPlace, float>);
-
-REGISTER_OP_GPU_KERNEL(stanh,
-                       ops::STanhKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(stanh_grad,
-                       ops::STanhGradKernel<paddle::platform::GPUPlace, float>);
+#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, functor, grad_functor)        \
+  REGISTER_OP_GPU_KERNEL(                                                      \
+      act_type,                                                                \
+      ops::ActivationKernel<paddle::platform::GPUPlace, ops::functor<float>>); \
+  REGISTER_OP_GPU_KERNEL(act_type##_grad,                                      \
+                         ops::ActivationGradKernel<paddle::platform::GPUPlace, \
+                                                   ops::grad_functor<float>>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_GPU_KERNEL);
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 15f8afb4ba45cc989fe7576b82b8bf853b1df7de..4f4eb44fedc0a89cdcf60fb7177014a11eb96048 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -19,9 +19,12 @@
 namespace paddle {
 namespace operators {
 
-template <typename Place, typename T, typename Functor>
-class ActivationKernel : public framework::OpKernel {
+template <typename Place, typename Functor>
+class ActivationKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
+  using T = typename Functor::ELEMENT_TYPE;
+
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
     auto* Y = context.Output<framework::Tensor>("Y");
@@ -31,13 +34,20 @@ class ActivationKernel : public framework::OpKernel {
     auto y = framework::EigenVector<T>::Flatten(*Y);
     auto place = context.GetEigenDevice<Place>();
     Functor functor;
+
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
     functor(place, x, y);
   }
 };
 
-template <typename Place, typename T, typename Functor>
-class ActivationGradKernel : public framework::OpKernel {
+template <typename Place, typename Functor>
+class ActivationGradKernel
+    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
  public:
+  using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
     auto* X = context.Input<framework::Tensor>("X");
     auto* Y = context.Input<framework::Tensor>("Y");
@@ -51,303 +61,618 @@ class ActivationGradKernel : public framework::OpKernel {
     auto dx = framework::EigenVector<T>::Flatten(*dX);
     auto place = context.GetEigenDevice<Place>();
     Functor functor;
+    auto attrs = functor.GetAttrs();
+    for (auto& attr : attrs) {
+      *attr.second = context.Attr<float>(attr.first);
+    }
     functor(place, x, y, dy, dx);
   }
 };
 
+template <typename T>
+struct BaseActivationFunctor {
+  using ELEMENT_TYPE = T;
+
+  using AttrPair = std::vector<std::pair<const char*, float*>>;
+
+  AttrPair GetAttrs() { return AttrPair(); }
+};
+
 // sigmoid(x) = 1 / (1 + exp(-x))
 template <typename T>
-struct SigmoidFunctor {
+struct SigmoidFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
   }
 };
 
 template <typename T>
-struct SigmoidGradFunctor {
+struct SigmoidGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * y * (static_cast<T>(1) - y);
   }
 };
 
+// Originally: logsigmoid(x) = -log (1 + exp(-x))
+// For numerical stability, we can use the log-sum-exp trick:
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// We can rewrite the above equation as:
+// y = -log( exp(0) + exp(-x)) [since exp(0) = 1]
+//   = -log( exp(max(-x, 0) - max(-x, 0)) + exp(-x + max(-x, 0) - max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * exp(-max(-x, 0)) + exp(max(-x, 0)) * exp(-x -
+//           max(-x, 0)))
+//   = -log( exp(max(-x, 0)) * (exp(-max(-x, 0)) + exp(-x - max(-x, 0))))
+//   = -log(exp(max(-x, 0))) - log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))
+//
+// Hence, logsigmoid(x) = - (max(-x, 0) + log(exp(-max(-x, 0))
+// + exp(-x - max(-x, 0))))
+template <typename T>
+struct LogSigmoidFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    y.device(d) = -temp - (((-temp).exp() + (-x - temp).exp()).log());
+  }
+};
+
+// Originally: f' = exp(-x) / (1 + exp(-x))
+// For numerical stability: f' = exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) +
+// exp(-x - max(-x, 0)))
+template <typename T>
+struct LogSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp = (-x).cwiseMax(static_cast<T>(0));  // temp = max(-x, 0)
+    dx.device(d) =
+        dy * ((-x - temp).exp() / ((-temp).exp() + (-x - temp).exp()));
+  }
+};
+
 // exp(x) = e^x
-struct ExpFunctor {
+template <typename T>
+struct ExpFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.exp();
   }
 };
 
-struct ExpGradFunctor {
+template <typename T>
+struct ExpGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * y;
   }
 };
 
 // relu(x) = max(x, 0)
 template <typename T>
-struct ReluFunctor {
+struct ReluFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.cwiseMax(static_cast<T>(0));
   }
 };
 
 template <typename T>
-struct ReluGradFunctor {
+struct ReluGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * (x > static_cast<T>(0)).template cast<T>();
   }
 };
 
 // tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
-struct TanhFunctor {
+template <typename T>
+struct TanhFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.tanh();
   }
 };
 
 template <typename T>
-struct TanhGradFunctor {
+struct TanhGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * (static_cast<T>(1) - y * y);
   }
 };
 
+// tanhshrink(x) = x - tanh(x)
+// where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x))
+template <typename T>
+struct TanhShrinkFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x - x.tanh();
+  }
+};
+
+template <typename T>
+struct TanhShrinkGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * (x.tanh() * x.tanh());
+  }
+};
+
+// hardshrink(x) = x, if x > threshold or x < -threshold
+//               = 0, otherwise
+template <typename T>
+struct HardShrinkFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
+    auto temp2 = (x > threshold).template cast<T>().eval();
+    y.device(d) = x * (temp1 + temp2);
+  }
+};
+
+template <typename T>
+struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = (x < (threshold * -1)).template cast<T>().eval();
+    auto temp2 = (x > threshold).template cast<T>().eval();
+    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
+  }
+};
+
+// softshrink(x) = x - lambda, if x > lambda; x + lambda, if x < -lambda; 0
+// otherwise
+template <typename T>
+struct SoftShrinkFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp1 = (x > lambda).template cast<T>().eval();
+    auto temp2 = (x < -lambda).template cast<T>().eval();
+    y.device(d) = temp1 * (x - lambda) + temp2 * (x + lambda);
+  }
+};
+
+template <typename T>
+struct SoftShrinkGradFunctor : public BaseActivationFunctor<T> {
+  float lambda;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"lambda", &lambda}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    auto temp1 = (x > lambda).template cast<T>().eval();
+    auto temp2 = (x < -lambda).template cast<T>().eval();
+    dx.device(d) = dy * (temp1 + temp2).template cast<T>();
+  }
+};
+
 // sqrt(x) = x^(1/2)
-struct SqrtFunctor {
+template <typename T>
+struct SqrtFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.sqrt();
   }
 };
 
 template <typename T>
-struct SqrtGradFunctor {
+struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     const Y y_conj = Eigen::numext::conj(y);
     dx.device(d) = static_cast<T>(0.5) * dy / y_conj;
   }
 };
 
 // abs(x) = |x|
-struct AbsFunctor {
+template <typename T>
+struct AbsFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.abs();
   }
 };
 
-struct AbsGradFunctor {
+template <typename T>
+struct AbsGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * x.sign();
   }
 };
 
 // reciprocal(x) = 1 / x
 template <typename T>
-struct ReciprocalFunctor {
+struct ReciprocalFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = static_cast<T>(1) / x;
   }
 };
 
 template <typename T>
-struct ReciprocalGradFunctor {
+struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * static_cast<T>(-1) * y * y;
   }
 };
 
 // log(x) = natural logarithm of x
-struct LogFunctor {
+template <typename T>
+struct LogFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.log();
   }
 };
 
 template <typename T>
-struct LogGradFunctor {
+struct LogGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * (static_cast<T>(1) / x);
   }
 };
 
 // square(x) = x^2
-struct SquareFunctor {
+template <typename T>
+struct SquareFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y>
-  void operator()(Device d, X x, Y y) {
+  void operator()(Device d, X x, Y y) const {
     y.device(d) = x.square();
   }
 };
 
 template <typename T>
-struct SquareGradFunctor {
+struct SquareGradFunctor : public BaseActivationFunctor<T> {
   template <typename Device, typename X, typename Y, typename dY, typename dX>
-  void operator()(Device d, X x, Y y, dY dy, dX dx) {
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     dx.device(d) = dy * static_cast<T>(2) * x;
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class BReluKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto t_min = static_cast<T>(context.Attr<AttrType>("t_min"));
-    auto t_max = static_cast<T>(context.Attr<AttrType>("t_max"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct BReluFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+
+  // NOTE: This intentionally hides `BaseActivationFunctor<T>::GetAttrs`;
+  // name hiding is used instead of virtual dispatch for speed.
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = x.cwiseMax(t_min).cwiseMin(t_max);
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.cwiseMax(t_min).cwiseMin(t_max);
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class BReluGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto t_min = static_cast<T>(context.Attr<AttrType>("t_min"));
-    auto t_max = static_cast<T>(context.Attr<AttrType>("t_max"));
-    dX->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct BReluGradFunctor : public BaseActivationFunctor<T> {
+  float t_min;
+  float t_max;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"t_min", &t_min}, {"t_max", &t_max}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+  }
+};
 
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+// relu6(x) = min(max(0, x), threshold), where threshold defaults to 6
+template <typename T>
+struct Relu6Functor : public BaseActivationFunctor<T> {
+  float threshold;
 
-    dx.device(place) = dy * ((x > t_min) * (x < t_max)).template cast<T>();
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.cwiseMax(static_cast<T>(0)).cwiseMin(threshold);
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class SoftReluKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto threshold = static_cast<T>(context.Attr<AttrType>("threshold"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct Relu6GradFunctor : public BaseActivationFunctor<T> {
+  float threshold;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) =
+        dy * ((x > static_cast<T>(0)) * (x < threshold)).template cast<T>();
+  }
+};
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold).eval();
-    y.device(place) = (static_cast<T>(1) + temp.exp()).log();
+// softplus(x) = log(1 + exp(x))
+// When x is a very large positive number, exp(x) may explode to inf,
+// Using trick below for numerical stability
+// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/
+// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
+template <typename T>
+struct SoftplusFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    y.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log());
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class SoftReluGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Input<framework::Tensor>("Y");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto threshold = static_cast<T>(context.Attr<AttrType>("threshold"));
-    dX->mutable_data<T>(context.GetPlace());
+// d(softplus(x))/dx = exp(x) / (1 + exp(x))
+// For numerical stability, both terms are shifted by max(x, 0):
+// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) +
+// exp(x - max(x, 0)))
+template <typename T>
+struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is not read
+    auto temp = x.cwiseMax(static_cast<T>(0));  // temp = max(x, 0)
+    dx.device(d) = dy * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp()));
+  }
+};
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+// softsign(x) = x / (1 + |x|)
+template <typename T>
+struct SoftsignFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {  // const: reads no functor state
+    y.device(d) = x / (static_cast<T>(1) + x.abs());
+  }
+};
+
+// d(softsign(x))/dx = 1 / (1 + |x|)^2
+// Taken from https://en.wikipedia.org/wiki/Activation_function
+template <typename T>
+struct SoftsignGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is not read
+    dx.device(d) =
+        dy * (static_cast<T>(1) / (static_cast<T>(1) + x.abs()).square());
+  }
+};
+
+template <typename T>
+struct SoftReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // x is clipped to [-threshold, threshold] before exp()
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  // y = log(1 + exp(clip(x, -threshold, threshold)))
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp = x.cwiseMax(-threshold).cwiseMin(threshold);
+    y.device(d) = (static_cast<T>(1) + temp.exp()).log();
+  }
+};
+
+template <typename T>
+struct SoftReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // dx is zeroed unless -threshold < x < threshold
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // uses x and y
     auto temp = ((x > -threshold) * (x < threshold)).template cast<T>().eval();
-    dx.device(place) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
+    dx.device(d) = dy * (static_cast<T>(1) - (-y).exp()) * temp;
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class PowKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto factor = static_cast<T>(context.Attr<AttrType>("factor"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct LeakyReluFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // NOTE(review): max() form equals x>=0 ? x : alpha*x only if alpha <= 1
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = x.pow(factor);
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.cwiseMax(alpha * x);  // y = max(x, alpha * x)
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class PowGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto factor = static_cast<T>(context.Attr<AttrType>("factor"));
-    dX->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // gradient scale applied where x < 0
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is not read
+    auto temp1 = alpha * (x < static_cast<T>(0)).template cast<T>().eval();
+    auto temp2 = (x >= static_cast<T>(0)).template cast<T>().eval();
+    dx.device(d) = dy * (temp1 + temp2).template cast<T>();  // alpha or 1
+  }
+};
 
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+template <typename T>
+struct ELUFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // scales the (exp(x) - 1) term
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
 
-    dx.device(place) = dy * factor * x.pow(factor - static_cast<T>(1));
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) =  // y = max(x, 0) + min(alpha * (exp(x) - 1), 0)
+        x.cwiseMax(static_cast<T>(0)) +
+        (alpha * (x.exp() - static_cast<T>(1))).cwiseMin(static_cast<T>(0));
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class STanhKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Y = context.Output<framework::Tensor>("Y");
-    auto scale_a = static_cast<T>(context.Attr<AttrType>("scale_a"));
-    auto scale_b = static_cast<T>(context.Attr<AttrType>("scale_b"));
-    Y->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct ELUGradFunctor : public BaseActivationFunctor<T> {
+  float alpha;  // added to y in the x < 0 branch
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"alpha", &alpha}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) =  // dy if x > 0; dy * (y + alpha) if x < 0; 0 at x == 0
+        dy * (x > static_cast<T>(0)).template cast<T>() +
+        dy * (y + alpha) * (x < static_cast<T>(0)).template cast<T>();
+  }
+};
 
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto y = framework::EigenVector<T>::Flatten(*Y);
-    auto place = context.GetEigenDevice<Place>();
-    y.device(place) = scale_b * (scale_a * x).tanh();
+template <typename T>
+struct PowFunctor : public BaseActivationFunctor<T> {
+  float factor;  // exponent applied element-wise
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.pow(factor);  // y = x^factor
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class STanhGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* dY = context.Input<framework::Tensor>(framework::GradVarName("Y"));
-    auto* dX = context.Output<framework::Tensor>(framework::GradVarName("X"));
-    auto scale_a = static_cast<T>(context.Attr<AttrType>("scale_a"));
-    auto scale_b = static_cast<T>(context.Attr<AttrType>("scale_b"));
-    dX->mutable_data<T>(context.GetPlace());
+template <typename T>
+struct PowGradFunctor : public BaseActivationFunctor<T> {
+  float factor;  // exponent; dx = dy * factor * x^(factor - 1)
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {  // y is not read
+    dx.device(d) = dy * factor * x.pow(factor - static_cast<T>(1));
+  }
+};
 
-    auto dy = framework::EigenVector<T>::Flatten(*dY);
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto dx = framework::EigenVector<T>::Flatten(*dX);
-    auto place = context.GetEigenDevice<Place>();
+template <typename T>
+struct STanhFunctor : public BaseActivationFunctor<T> {
+  float scale_a;  // multiplies x inside tanh
+  float scale_b;  // multiplies the tanh result
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+  // y = scale_b * tanh(scale_a * x)
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = scale_b * (scale_a * x).tanh();
+  }
+};
 
+template <typename T>
+struct STanhGradFunctor : public BaseActivationFunctor<T> {
+  float scale_a;  // multiplies x inside tanh
+  float scale_b;  // multiplies the tanh result
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"scale_a", &scale_a}, {"scale_b", &scale_b}};
+  }
+  // dx = dy * scale_a * scale_b * (1 - tanh(scale_a * x)^2)
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
     auto temp = (scale_a * x).tanh() * (scale_a * x).tanh();
-    dx.device(place) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
+    dx.device(d) = dy * scale_a * scale_b * (static_cast<T>(1) - temp);
+  }
+};
+
+template <typename T>
+struct ThresholdedReluFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // strict lower bound for passing x through
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  // y = x where x > threshold, 0 elsewhere
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = (x > static_cast<T>(threshold)).template cast<T>() * x;
+  }
+};
+
+template <typename T>
+struct ThresholdedReluGradFunctor : public BaseActivationFunctor<T> {
+  float threshold;  // strict lower bound for passing dy through
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"threshold", &threshold}};
+  }
+  // dx = dy where x > threshold, 0 elsewhere (y is not read)
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = dy * (x > static_cast<T>(threshold)).template cast<T>();
+  }
+};
+
+template <typename T>
+struct HardSigmoidFunctor : public BaseActivationFunctor<T> {
+  float slope;   // multiplier of x in the linear part
+  float offset;  // constant added in the linear part
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+  // y = clip(slope * x + offset, 0, 1)
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    auto temp = x * static_cast<T>(slope) + static_cast<T>(offset);
+    y.device(d) = temp.cwiseMax(static_cast<T>(0)).cwiseMin(static_cast<T>(1));
+  }
+};
+
+template <typename T>
+struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
+  float slope;   // gradient value inside the unsaturated range
+  float offset;  // bound as an attr but not read by operator() below
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"slope", &slope}, {"offset", &offset}};
+  }
+  // dx = dy * slope where 0 < y < 1, 0 elsewhere (x is not read)
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) =
+        dy *
+        ((y > static_cast<T>(0)) * (y < static_cast<T>(1))).template cast<T>() *
+        static_cast<T>(slope);
   }
 };
 
 }  // namespace operators
 }  // namespace paddle
+
+#define FOR_EACH_KERNEL_FUNCTOR(__macro) /* (op, fwd, grad) */       \
+  __macro(sigmoid, SigmoidFunctor, SigmoidGradFunctor);              \
+  __macro(logsigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor);     \
+  __macro(exp, ExpFunctor, ExpGradFunctor);                          \
+  __macro(relu, ReluFunctor, ReluGradFunctor);                       \
+  __macro(tanh, TanhFunctor, TanhGradFunctor);                       \
+  __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);     \
+  __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                       \
+  __macro(abs, AbsFunctor, AbsGradFunctor);                          \
+  __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);     \
+  __macro(log, LogFunctor, LogGradFunctor);                          \
+  __macro(square, SquareFunctor, SquareGradFunctor);                 \
+  __macro(brelu, BReluFunctor, BReluGradFunctor);                    \
+  __macro(soft_relu, SoftReluFunctor, SoftReluGradFunctor);          \
+  __macro(pow, PowFunctor, PowGradFunctor);                          \
+  __macro(stanh, STanhFunctor, STanhGradFunctor);                    \
+  __macro(softplus, SoftplusFunctor, SoftplusGradFunctor);           \
+  __macro(softsign, SoftsignFunctor, SoftsignGradFunctor);           \
+  __macro(relu6, Relu6Functor, Relu6GradFunctor);                    \
+  __macro(leaky_relu, LeakyReluFunctor, LeakyReluGradFunctor);       \
+  __macro(tanh_shrink, TanhShrinkFunctor, TanhShrinkGradFunctor);    \
+  __macro(elu, ELUFunctor, ELUGradFunctor);                          \
+  __macro(hard_shrink, HardShrinkFunctor, HardShrinkGradFunctor);    \
+  __macro(hard_sigmoid, HardSigmoidFunctor, HardSigmoidGradFunctor); \
+  __macro(thresholded_relu, ThresholdedReluFunctor, ThresholdedReluGradFunctor);
diff --git a/paddle/operators/adadelta_op.cc b/paddle/operators/adadelta_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24e419b532d97bc16ab96dad418d6e73c03f30a0
--- /dev/null
+++ b/paddle/operators/adadelta_op.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adadelta_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdadeltaOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates inputs/outputs, then gives every output Param's dimensions.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredGrad"),
+                   "Input(AvgSquaredGrad) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("AvgSquaredUpdate"),
+                   "Input(AvgSquaredUpdate) of AdadeltaOp should not be null.");
+    // The three outputs must be bound too.
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AvgSquaredGradOut"),
+        "Output(AvgSquaredGradOut) of AdadeltaOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput("AvgSquaredUpdateOut"),
+        "Output(AvgSquaredUpdateOut) of AdadeltaOp should not be null.");
+    // Grad and both accumulators must have exactly Param's dimensions.
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "param and grad input of AdadeltaOp should have same dimension");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredGrad"),
+                      "Param and AvgSquaredGrad input of AdadeltaOp "
+                      "should have same dimension");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("AvgSquaredUpdate"),
+                      "Param and AvgSquaredUpdate input of AdadeltaOp "
+                      "should have same dimension");
+    // Every output takes Param's dimensions.
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("AvgSquaredGradOut", param_dim);
+    ctx->SetOutputDim("AvgSquaredUpdateOut", param_dim);
+  }
+};
+
+class AdadeltaOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdadeltaOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {  // declares I/O, attrs, doc
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("AvgSquaredGrad",
+             "(Tensor) Input expectation of squared gradient");
+    AddInput("AvgSquaredUpdate",
+             "(Tensor) Input expectation of squared parameter updates");
+    // Outputs: the updated parameter and the two updated accumulators.
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("AvgSquaredGradOut",
+              "(Tensor) Output expectation of squared gradient");
+    AddOutput("AvgSquaredUpdateOut",
+              "(Tensor) Output expectation of squared parameter updates");
+    // Float attributes; each one is given a default value.
+    AddAttr<float>("rho",
+                   "(float, default 0.95) Exponential decay rate "
+                   "for squared gradients.")
+        .SetDefault(0.95f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) Constant for "
+                   "numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+Adadelta Updates Operator.
+
+This implements the Adadelta optimizer[1]. Adadelta is a per-dimension
+adaptive learning rate method for gradient descent.
+
+Adadelta updates:
+
+avg_squared_grad_out = rho * avg_squared_grad + (1 - rho) * grad * grad
+param_update =  - sqrt((avg_squared_update + epsilon) /
+                       (avg_squared_grad_out + epsilon)) * grad
+avg_squared_update_out = rho * avg_squared_update + (1 - rho) * param_update**2
+param_out = param + param_update
+
+References:
+  [1] ADADELTA: An Adaptive Learning Rate Method
+      https://arxiv.org/abs/1212.5701
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
+REGISTER_OP_CPU_KERNEL(  // only the float instantiation is registered here
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/adadelta_op.cu b/paddle/operators/adadelta_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3af1c8c8e9861138a33b3156818f704c3b20363f
--- /dev/null
+++ b/paddle/operators/adadelta_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adadelta_op.h"
+
+namespace ops = paddle::operators;  // alias used by the registration below
+REGISTER_OP_GPU_KERNEL(  // only the float instantiation is registered here
+    adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adadelta_op.h b/paddle/operators/adadelta_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..d29e15c43583bd447fbacb548a326f303f7d1463
--- /dev/null
+++ b/paddle/operators/adadelta_op.h
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class AdadeltaOpKernel : public framework::OpKernel<T> {  // one Adadelta step
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto avg_squared_grad_out_tensor =
+        ctx.Output<framework::Tensor>("AvgSquaredGradOut");
+    auto avg_squared_update_out_tensor =
+        ctx.Output<framework::Tensor>("AvgSquaredUpdateOut");
+
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
+    avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    float rho = ctx.Attr<float>("rho");          // decay rate of both averages
+    float epsilon = ctx.Attr<float>("epsilon");  // added under the square root
+
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    // Squared gradient accumulator
+    auto avg_squared_grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("AvgSquaredGrad"));
+    // Squared updates accumulator
+    auto avg_squared_update = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("AvgSquaredUpdate"));
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto avg_squared_grad_out =
+        framework::EigenVector<T>::Flatten(*avg_squared_grad_out_tensor);
+    auto avg_squared_update_out =
+        framework::EigenVector<T>::Flatten(*avg_squared_update_out_tensor);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    avg_squared_grad_out.device(place) =
+        rho * avg_squared_grad + (1 - rho) * grad.square();
+    auto update =
+        -((avg_squared_update + epsilon) / (avg_squared_grad_out + epsilon))
+             .sqrt() * grad;
+    // `update` is lazy (re-evaluated per use): apply it before its input changes.
+    param_out.device(place) = param + update;
+    avg_squared_update_out.device(place) =  // may share AvgSquaredUpdate's buffer
+        rho * avg_squared_update + (1 - rho) * update.square();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/adagrad_op.cc b/paddle/operators/adagrad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..bc081f87dcab0dcd8ef329dcb1f66b627c82b4a2
--- /dev/null
+++ b/paddle/operators/adagrad_op.cc
@@ -0,0 +1,92 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adagrad_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdagradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates inputs/outputs, then gives both outputs Param's dimensions.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdagradOp should not be null.");
+    // Both outputs must be bound too.
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of AdagradOp should not be null.");
+    // LearningRate must hold one element; Grad and Moment must match Param.
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "LearningRate should have one element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdagradOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        "Param and Moment input of AdagradOp should have the same dimension.");
+    // Both outputs take Param's dimensions.
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+  }
+};
+
+class AdagradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdagradOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {  // declares I/O, attrs, doc
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("Moment", "(Tensor) Second moment");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    // Outputs: the updated parameter and the updated moment.
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output second moment");
+    // epsilon is a float attribute with a default value.
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+
+Adaptive Gradient Algorithm (Adagrad).
+
+moment_out = moment + grad * grad
+param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+
+The original paper(http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf)
+does not have the epsilon attribute. It is added here for numerical stability 
+by avoiding division by zero.
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias used by the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(adagrad, ops::AdagradOp, ops::AdagradOpMaker);
+REGISTER_OP_CPU_KERNEL(adagrad,  // only the float instantiation is registered
+                       ops::AdagradOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/adagrad_op.cu b/paddle/operators/adagrad_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a5b7951121360f78612f9008a522235104708112
--- /dev/null
+++ b/paddle/operators/adagrad_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/adagrad_op.h"
+
+namespace ops = paddle::operators;  // alias used by the registration below
+REGISTER_OP_GPU_KERNEL(adagrad,  // only the float instantiation is registered
+                       ops::AdagradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adagrad_op.h b/paddle/operators/adagrad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5d8f751d3527f89b96d4274328ba0bb5f6efa44
--- /dev/null
+++ b/paddle/operators/adagrad_op.h
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class AdagradOpKernel : public framework::OpKernel<T> {  // one Adagrad step
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+    // Request writable T buffers for both outputs at ctx.GetPlace().
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    float epsilon = ctx.Attr<float>("epsilon");  // added to sqrt(moment_out)
+
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    auto moment = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Moment"));
+    auto lr = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("LearningRate"));
+
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+    auto place = ctx.GetEigenDevice<Place>();
+    // moment_out accumulates grad^2; the step then divides by its square root.
+    moment_out.device(place) = moment + grad * grad;
+    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());  // numel as int
+    param_out.device(place) =  // lr is repeated m_dsize times along dim 0
+        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/adam_op.cc b/paddle/operators/adam_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3572de06bd60f7979e3bfbf39856b04942ce81c0
--- /dev/null
+++ b/paddle/operators/adam_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adam_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdamOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates the op's inputs/outputs and gives each output Param's dims.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+                   "Input(Moment1) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+                   "Input(Moment2) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                   "Input(Beta1Pow) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+                   "Input(Beta2Pow) of AdamOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+                   "Output(Moment1Out) of AdamOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+                   "Output(Moment2Out) of AdamOp should not be null.");
+    // LearningRate, Beta1Pow and Beta2Pow must each have a dim product of 1.
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 dimension");
+    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+                      "Beta1 power accumulator should have 1 dimension");
+    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
+                      "Beta2 power accumulator should have 1 dimension");
+
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdamOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment1"),
+        "Param and Moment1 input of AdamOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment2"),
+        "Param and Moment2 input of AdamOp should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("Moment1Out", param_dims);
+    ctx->SetOutputDim("Moment2Out", param_dims);
+  }
+};
+
+// Declares the inputs, outputs, attributes and documentation of AdamOp.
+class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdamOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    AddInput("Moment1", "(Tensor) Input first moment");
+    AddInput("Moment2", "(Tensor) Input second moment");
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("Moment1Out", "(Tensor) Output first moment");
+    AddOutput("Moment2Out", "(Tensor) Output second moment");
+
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) "
+                   "Exponential decay rate for the "
+                   "first moment estimates.")
+        .SetDefault(0.9f);
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) "
+                   "exponential decay rate for the "
+                   "second moment estimates.")
+        .SetDefault(0.999f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+
+    AddComment(R"DOC(
+Adam Updates Operator.
+
+This implements the Adam optimizer from Section 2 of the Adam
+paper[1]. Adam is a first-order gradient-based optimization
+method based on adaptive estimates of lower-order moments.
+
+Adam updates:
+
+moment1_out = beta1 * moment1 + (1 - beta1) * grad
+moment2_out = beta2 * moment2 + (1 - beta2) * grad * grad
+learning_rate_t = learning_rate *
+                  sqrt(1 - beta2_pow) / (1 - beta1_pow)
+param_out = param - learning_rate_t * moment1_out / (sqrt(moment2_out) + epsilon)
+
+References:
+  [1] Adam: A Method for Stochastic Optimization
+      (https://arxiv.org/abs/1412.6980)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // shorthand for the macros below
+REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
+REGISTER_OP_CPU_KERNEL(adam,  // CPU kernel, instantiated for float only
+                       ops::AdamOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/adam_op.cu b/paddle/operators/adam_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..a3def912e540454275350209435eb01ae2151331
--- /dev/null
+++ b/paddle/operators/adam_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU  // set before the include so Eigen's headers see it
+#include "paddle/operators/adam_op.h"
+
+namespace ops = paddle::operators;  // shorthand for the macro below
+REGISTER_OP_GPU_KERNEL(adam,  // GPU kernel, instantiated for float only
+                       ops::AdamOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adam_op.h b/paddle/operators/adam_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..45938006db1231a7a134964d729df6ca114d4dbe
--- /dev/null
+++ b/paddle/operators/adam_op.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Adam kernel, evaluated element-wise on the Eigen device for `Place`:
+//   Moment1Out = beta1 * Moment1 + (1 - beta1) * Grad
+//   Moment2Out = beta2 * Moment2 + (1 - beta2) * Grad^2
+//   ParamOut   = Param - lr_t * Moment1Out / (sqrt(Moment2Out) + epsilon)
+// with lr_t = LearningRate * sqrt(1 - Beta2Pow) / (1 - Beta1Pow).
+template <typename Place, typename T>
+class AdamOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Vec = framework::EigenVector<T>;
+    const float beta1 = ctx.Attr<float>("beta1");
+    const float beta2 = ctx.Attr<float>("beta2");
+    const float epsilon = ctx.Attr<float>("epsilon");
+
+    // Grab the outputs and allocate their storage on the current place.
+    auto* param_out_t = ctx.Output<framework::Tensor>("ParamOut");
+    auto* m1_out_t = ctx.Output<framework::Tensor>("Moment1Out");
+    auto* m2_out_t = ctx.Output<framework::Tensor>("Moment2Out");
+    param_out_t->mutable_data<T>(ctx.GetPlace());
+    m1_out_t->mutable_data<T>(ctx.GetPlace());
+    m2_out_t->mutable_data<T>(ctx.GetPlace());
+
+    // Flattened 1-D Eigen views of all inputs and outputs.
+    auto param = Vec::Flatten(*ctx.Input<framework::Tensor>("Param"));
+    auto grad = Vec::Flatten(*ctx.Input<framework::Tensor>("Grad"));
+    auto moment1 = Vec::Flatten(*ctx.Input<framework::Tensor>("Moment1"));
+    auto moment2 = Vec::Flatten(*ctx.Input<framework::Tensor>("Moment2"));
+    auto lr = Vec::Flatten(*ctx.Input<framework::Tensor>("LearningRate"));
+    auto beta1_pow = Vec::Flatten(*ctx.Input<framework::Tensor>("Beta1Pow"));
+    auto beta2_pow = Vec::Flatten(*ctx.Input<framework::Tensor>("Beta2Pow"));
+    auto param_out = Vec::Flatten(*param_out_t);
+    auto moment1_out = Vec::Flatten(*m1_out_t);
+    auto moment2_out = Vec::Flatten(*m2_out_t);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    moment1_out.device(place) = beta1 * moment1 + (1 - beta1) * grad;
+    moment2_out.device(place) = beta2 * moment2 + (1 - beta2) * grad.square();
+
+    // lr, beta1_pow and beta2_pow are all tensors of 1 element.
+    auto lr_t = lr * (1 - beta2_pow).sqrt() / (1 - beta1_pow);
+    // Eigen does not broadcast automatically, so lr_t is expanded by hand to
+    // the element count of the moment tensor.
+    Eigen::DSizes<int, 1> full_size(m1_out_t->numel());
+    param_out.device(place) =
+        param -
+        lr_t.broadcast(full_size) *
+            (moment1_out / (moment2_out.sqrt() + epsilon));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/adamax_op.cc b/paddle/operators/adamax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ff2565774115571166712b03c8990e5bf8de12a5
--- /dev/null
+++ b/paddle/operators/adamax_op.cc
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/adamax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AdamaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates the op's inputs/outputs and gives each output Param's dims.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("InfNorm"),
+                   "Input(InfNorm) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                   "Input(Beta1Pow) of AdamaxOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of AdamaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("InfNormOut"),
+                   "Output(InfNormOut) of AdamaxOp should not be null.");
+    // framework::product(dims) must be 1 here, presumably meaning one element.
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 dimension");
+    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
+    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
+                      "Beta1 power accumulator should have 1 dimension");
+    auto param_dims = ctx->GetInputDim("Param");  // dims all others must match
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Moment"),
+        "Param and Moment input of AdamaxOp should have same dimension");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("InfNorm"),
+        "Param and InfNorm input of AdamaxOp should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+    ctx->SetOutputDim("InfNormOut", param_dims);
+  }
+};
+
+// Declares the inputs, outputs, attributes and documentation of the Adamax
+// operator.
+class AdamaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AdamaxOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // Tensors consumed by the update.
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    AddInput("Moment", "(Tensor) First moment");
+    AddInput("InfNorm", "(Tensor) Input exponentially weighted infinity norm");
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator");
+
+    // Tensors produced by the update.
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output first moment");
+    AddOutput("InfNormOut",
+              "(Tensor) Output exponentially weighted infinity norm");
+
+    // Hyper-parameters; each one carries a default value.
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) Exponential decay rate for the "
+                   "1st moment estimates.")
+        .SetDefault(0.9f);
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) exponential decay rate for the "
+                   "weighted infinity norm estimates.")
+        .SetDefault(0.999f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-8) Constant for numerical stability")
+        .SetDefault(1.0e-8f);
+
+    AddComment(R"DOC(
+Adamax Updates Operator.
+
+This implements the Adamax optimizer from Section 7 of the Adam
+paper[1]. Adamax is a variant of the
+Adam algorithm based on the infinity norm.
+
+Adamax updates:
+
+moment_out = beta1 * moment + (1 - beta1) * grad
+inf_norm_out = max(beta2 * inf_norm + epsilon, abs(grad))
+learning_rate_t = learning_rate/(1 - beta1_pow)
+param_out = param - learning_rate_t * moment_out/inf_norm_out
+
+The original paper does not have an epsilon attribute.
+However, it is added here for numerical stability
+by preventing divide by 0.
+
+References:
+  [1] Adam: A Method for Stochastic Optimization
+      (https://arxiv.org/abs/1412.6980)
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // shorthand for the macros below
+REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
+REGISTER_OP_CPU_KERNEL(adamax,  // CPU kernel, instantiated for float only
+                       ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/adamax_op.cu
similarity index 80%
rename from paddle/operators/add_op.cu
rename to paddle/operators/adamax_op.cu
index d9c6d20a6c320b59e57ed25da3dd8b093833f8c7..fee3b6fc6b656917d79b84f48da8e63be7683890 100644
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/adamax_op.cu
@@ -12,7 +12,9 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/add_op.h"
+#define EIGEN_USE_GPU
+#include "paddle/operators/adamax_op.h"
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(add, ops::AddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(adamax,  // GPU kernel, instantiated for float only
+                       ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/adamax_op.h b/paddle/operators/adamax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c99832ec08e9c1d9b5458c467d5238f9b1b3c37
--- /dev/null
+++ b/paddle/operators/adamax_op.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Adamax kernel, evaluated element-wise on the Eigen device for `Place`:
+//   MomentOut  = beta1 * Moment + (1 - beta1) * Grad
+//   InfNormOut = max(|Grad|, beta2 * InfNorm + epsilon)
+//   ParamOut   = Param - lr_t * (MomentOut / InfNormOut)
+// with lr_t = LearningRate / (1 - Beta1Pow).
+template <typename Place, typename T>
+class AdamaxOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    using Vec = framework::EigenVector<T>;
+    const float beta1 = ctx.Attr<float>("beta1");
+    const float beta2 = ctx.Attr<float>("beta2");
+    const float epsilon = ctx.Attr<float>("epsilon");
+
+    auto* param_out_t = ctx.Output<framework::Tensor>("ParamOut");
+    auto* moment_out_t = ctx.Output<framework::Tensor>("MomentOut");
+    auto* inf_norm_out_t = ctx.Output<framework::Tensor>("InfNormOut");
+    param_out_t->mutable_data<T>(ctx.GetPlace());
+    moment_out_t->mutable_data<T>(ctx.GetPlace());
+    inf_norm_out_t->mutable_data<T>(ctx.GetPlace());
+
+    // Flattened 1-D Eigen views of all inputs and outputs.
+    auto param = Vec::Flatten(*ctx.Input<framework::Tensor>("Param"));
+    auto grad = Vec::Flatten(*ctx.Input<framework::Tensor>("Grad"));
+    auto moment = Vec::Flatten(*ctx.Input<framework::Tensor>("Moment"));
+    auto inf_norm = Vec::Flatten(*ctx.Input<framework::Tensor>("InfNorm"));
+    auto lr = Vec::Flatten(*ctx.Input<framework::Tensor>("LearningRate"));
+    auto beta1_pow = Vec::Flatten(*ctx.Input<framework::Tensor>("Beta1Pow"));
+    auto param_out = Vec::Flatten(*param_out_t);
+    auto moment_out = Vec::Flatten(*moment_out_t);
+    auto inf_norm_out = Vec::Flatten(*inf_norm_out_t);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    moment_out.device(place) = beta1 * moment + (1 - beta1) * grad;
+    inf_norm_out.device(place) =
+        grad.abs().cwiseMax((beta2 * inf_norm) + epsilon);
+    // lr_t is broadcast by hand over MomentOut's element count.
+    auto lr_t = lr / (1 - beta1_pow);
+    Eigen::DSizes<int, 1> full_size(moment_out_t->numel());
+    param_out.device(place) =
+        param - lr_t.broadcast(full_size) * (moment_out / inf_norm_out);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
deleted file mode 100644
index 3914d1323083ede6a7ea07e7b4ef76b9e4afd26d..0000000000000000000000000000000000000000
--- a/paddle/operators/add_op.cc
+++ /dev/null
@@ -1,68 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/operators/add_op.h"
-
-namespace paddle {
-namespace operators {
-
-class AddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of AddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of AddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of AddOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims,
-                      "Two input of Add Op's dimension must be same.");
-    ctx->SetOutputDim("Out", x_dims);
-  }
-};
-
-class AddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  AddOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The first input of add op");
-    AddInput("Y", "The second input of add op");
-    AddOutput("Out", "The output of add op");
-    AddComment(R"DOC(
-Two Element Add Operator.
-
-The equation is: Out = X + Y
-)DOC");
-  }
-};
-
-class AddOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {}
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(add, ops::AddOp, ops::AddOpMaker, add_grad, ops::AddOpGrad);
-
-REGISTER_OP_CPU_KERNEL(add, ops::AddKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f7dc990f0db8ae4891ff068fb97899e6d01478da
--- /dev/null
+++ b/paddle/operators/batch_norm_op.cc
@@ -0,0 +1,412 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/batch_norm_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+// Eigen::Map aliases: view an existing buffer as an Eigen array.
+template <typename T>  // mutable, dynamic rows x dynamic cols
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>  // read-only, dynamic rows x dynamic cols
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>  // mutable, dynamic rows x 1 col
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>  // read-only, dynamic rows x 1 col
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+
+class BatchNormOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates the op's variables and derives every output's dims from X.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "");  // NOTE(review): empty messages
+    PADDLE_ENFORCE(ctx->HasInput("Scale"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Mean"), "");
+    PADDLE_ENFORCE(ctx->HasInput("Variance"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("Y"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("MeanOut"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("VarianceOut"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("SavedMean"), "");
+    PADDLE_ENFORCE(ctx->HasOutput("SavedVariance"), "");
+
+    // Make sure Mean/MeanOut and Variance/VarianceOut share memory in Python.
+    PADDLE_ENFORCE_EQ(ctx->Inputs("Mean")[0], ctx->Outputs("MeanOut")[0],
+                      "Mean and MeanOut should share the same memory");
+    PADDLE_ENFORCE_EQ(ctx->Inputs("Variance")[0],
+                      ctx->Outputs("VarianceOut")[0],
+                      "Variance and VarianceOut should share the same memory");
+    // C is dim 1 of X for NCHW and the last dim of X otherwise.
+    const auto x_dims = ctx->GetInputDim("X");  // NOTE(review): rank unchecked
+    const TensorFormat tensor_format =
+        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    const int C =
+        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+                                             : x_dims[x_dims.size() - 1]);
+    // Scale and Bias must each be 1-D with C entries.
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
+    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], C);
+    // Y takes X's dims; the four statistics outputs are 1-D of length C.
+    ctx->SetOutputDim("Y", x_dims);
+    ctx->SetOutputDim("MeanOut", {C});
+    ctx->SetOutputDim("VarianceOut", {C});
+    ctx->SetOutputDim("SavedMean", {C});
+    ctx->SetOutputDim("SavedVariance", {C});
+  }
+};
+
+// Declares the attributes, inputs, outputs and documentation of BatchNormOp.
+class BatchNormOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  BatchNormOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<bool>("is_test", "If true, use Mean/Variance").SetDefault(false);
+    AddAttr<float>("momentum", "Decay of the running stats").SetDefault(0.9);
+    AddAttr<float>("epsilon", "Added to the variance").SetDefault(1e-5);
+    AddAttr<std::string>("tensor_format", "NCHW or NHWC").SetDefault("NCHW");
+    AddInput("X", "The input tensor");
+    AddInput("Scale",
+             "Scale is a 1-dimensional tensor of size C "
+             "to be applied to the output");
+    AddInput("Bias",
+             "Bias is a 1-dimensional tensor of size C "
+             "to be applied to the output");
+    AddInput("Mean",
+             "The global mean (for training) or the "
+             "estimated mean (for testing)");
+    AddInput("Variance",
+             "The global variance (for training) "
+             "or the estimated Variance (for testing)");
+    AddOutput("Y", "result after normalization");
+    AddOutput("MeanOut",
+              "Share memory with Mean. Store the global mean when training");
+    AddOutput("VarianceOut",
+              "Share memory with Variance. "
+              "Store the global Variance when training");
+    AddOutput("SavedMean",
+              "Mean of the current mini batch, "
+              "will apply to output when training");
+    AddOutput("SavedVariance",
+              "Variance of the current mini batch, "
+              "will apply to output when training");
+    AddComment(R"DOC(
+Batch Normalization Operator (https://arxiv.org/pdf/1502.03167.pdf).
+
+The `tensor_format` attribute selects the layout of X:
+NHWC `[batch, in_height, in_width, in_channels]`
+NCHW `[batch, in_channels, in_height, in_width]`
+)DOC");
+  }
+};
+
+template <typename T>  // CPU specialization of BatchNormKernel
+class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const float epsilon = ctx.Attr<float>("epsilon");
+    const float momentum = ctx.Attr<float>("momentum");
+    const bool is_test = ctx.Attr<bool>("is_test");
+    const std::string tensor_format_str =
+        ctx.Attr<std::string>("tensor_format");
+    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto &x_dims = x->dims();
+
+    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+                   "The Input dim size should be between 3 and 5");
+    const int N = x_dims[0];  // leading dim of X
+    const int C =
+        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+                                             : x_dims[x_dims.size() - 1]);
+    const int sample_size = x->numel() / N / C;  // elements per (n, c) pair
+
+    auto *y = ctx.Output<Tensor>("Y");
+    auto *mean_out = ctx.Output<Tensor>("MeanOut");
+    auto *variance_out = ctx.Output<Tensor>("VarianceOut");
+    auto *saved_mean = ctx.Output<Tensor>("SavedMean");
+    auto *saved_variance = ctx.Output<Tensor>("SavedVariance");
+
+    // Allocate every output on the current place.
+    y->mutable_data<T>(ctx.GetPlace());
+    mean_out->mutable_data<T>(ctx.GetPlace());
+    variance_out->mutable_data<T>(ctx.GetPlace());
+    saved_mean->mutable_data<T>(ctx.GetPlace());
+    saved_variance->mutable_data<T>(ctx.GetPlace());
+
+    if (!is_test) {
+      // Training: SavedMean/SavedVariance hold statistics of this batch only.
+      EigenVectorArrayMap<T> saved_mean_e(
+          saved_mean->mutable_data<T>(ctx.GetPlace()), C);
+      EigenVectorArrayMap<T> saved_variance_e(
+          saved_variance->mutable_data<T>(ctx.GetPlace()), C);
+      saved_mean_e.setZero();
+      saved_variance_e.setZero();
+      // Accumulate per-channel sums, then divide by N * sample_size.
+      switch (tensor_format) {
+        case TensorFormat::NCHW: {
+          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_mean_e(nc % C) += x_arr.col(nc).sum();
+          }
+          saved_mean_e /= N * sample_size;
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_variance_e(nc % C) +=
+                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
+          }
+          saved_variance_e /= N * sample_size;
+          break;
+        }
+        case TensorFormat::NHWC: {
+          ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
+          for (int i = 0; i < N * sample_size; ++i) {
+            saved_mean_e += x_arr.col(i);
+          }
+          saved_mean_e /= N * sample_size;
+          for (int i = 0; i < N * sample_size; ++i) {
+            saved_variance_e +=
+                (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e);
+          }
+          saved_variance_e /= N * sample_size;
+          break;
+        }
+        default:
+          PADDLE_THROW("Unknown storage order: %s", tensor_format_str);
+      }
+      // Fold the batch statistics into the running MeanOut/VarianceOut.
+      EigenVectorArrayMap<T> running_mean_arr(
+          mean_out->mutable_data<T>(ctx.GetPlace()), C);
+      EigenVectorArrayMap<T> running_var_arr(
+          variance_out->mutable_data<T>(ctx.GetPlace()), C);
+      running_mean_arr =
+          running_mean_arr * momentum + saved_mean_e * (1. - momentum);
+      running_var_arr =
+          running_var_arr * momentum + saved_variance_e * (1. - momentum);
+    }
+
+    // Normalize with Mean/Variance when testing, else with the saved stats.
+    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
+    if (is_test) {
+      ConstEigenVectorArrayMap<T> var_arr(
+          ctx.Input<Tensor>("Variance")->data<T>(), C);
+      inv_std = (var_arr + epsilon).sqrt().inverse();
+    } else {
+      EigenVectorArrayMap<T> saved_inv_std(
+          ctx.Output<Tensor>("SavedVariance")->data<T>(), C);
+      // SavedVariance now stores 1/sqrt(var + eps); the gradient uses it too.
+      saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt();
+      inv_std = saved_inv_std;
+    }
+    ConstEigenVectorArrayMap<T> mean_arr(
+        is_test ? ctx.Input<Tensor>("Mean")->data<T>()
+                : ctx.Output<Tensor>("SavedMean")->data<T>(),
+        C);
+
+    //   (x - est_mean) * inv_std * scale + bias
+    //   is rewritten as
+    //   x * (inv_std * scale) + (bias - est_mean * inv_std * scale)
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *bias = ctx.Input<Tensor>("Bias");
+    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> bias_arr(bias->data<T>(), C);
+    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
+    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
+        bias_arr - mean_arr * inv_std * scale_arr;
+
+    switch (tensor_format) {
+      case TensorFormat::NCHW: {
+        EigenArrayMap<T> y_arr(y->mutable_data<T>(ctx.GetPlace()), sample_size,
+                               N * C);
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+        for (int nc = 0; nc < N * C; ++nc) {
+          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+        }
+        break;
+      }
+      case TensorFormat::NHWC: {
+        EigenArrayMap<T>(y->mutable_data<T>(ctx.GetPlace()), C,
+                         N * sample_size) =
+            (ConstEigenArrayMap<T>(x->data<T>(), C, N * sample_size).colwise() *
+             new_scale)
+                .colwise() +
+            new_bias;
+        break;
+      }
+      default:
+        PADDLE_THROW("Unknown storage order: %d", tensor_format);
+    }
+  }
+};
+
+class BatchNormGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Verifies the required variables exist, then sets the gradient dims.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("Scale"), "Input(Scale) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
+                   "Input(Y@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("SavedMean"),
+                   "Input(SavedMean) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput("SavedVariance"),
+                   "Input(SavedVariance) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Scale")),
+                   "Output(Scale@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                   "Output(Bias@GRAD) should not be null");
+    const auto x_dims = ctx->GetInputDim("X");
+    const TensorFormat tensor_format =
+        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    const int C =  // channel axis: 1 for NCHW, last axis for NHWC
+        x_dims[tensor_format == TensorFormat::NCHW ? 1 : x_dims.size() - 1];
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->SetOutputDim(framework::GradVarName("Scale"), {C});
+    ctx->SetOutputDim(framework::GradVarName("Bias"), {C});
+  }
+};
+
+// CPU backward kernel of batch_norm: reads X, Y@GRAD, Scale, SavedMean and
+// SavedVariance; writes X@GRAD, Scale@GRAD and Bias@GRAD (NCHW or NHWC).
+template <typename T>
+class BatchNormGradKernel<platform::CPUPlace, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *x = ctx.Input<Tensor>("X");
+    const auto *d_y = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    const auto *scale = ctx.Input<Tensor>("Scale");
+    const auto *saved_mean = ctx.Input<Tensor>("SavedMean");
+    // The forward op stored 1 / sqrt(variance + epsilon) in SavedVariance.
+    const auto *saved_inv_variance = ctx.Input<Tensor>("SavedVariance");
+    const std::string tensor_format_str =
+        ctx.Attr<std::string>("tensor_format");
+    const TensorFormat tensor_format = StringToTensorFormat(tensor_format_str);
+
+    // Sizes: N = batch, C = channels (axis 1 for NCHW, last axis for NHWC).
+    const auto &x_dims = x->dims();
+    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+                   "The Input dim size should be between 3 and 5");
+    const int N = x_dims[0];
+    const int C =
+        (tensor_format == TensorFormat::NCHW ? x_dims[1]
+                                             : x_dims[x_dims.size() - 1]);
+    const int sample_size = x->numel() / N / C;
+
+    ConstEigenVectorArrayMap<T> scale_arr(scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> mean_arr(saved_mean->data<T>(), C);
+    ConstEigenVectorArrayMap<T> inv_var_arr(saved_inv_variance->data<T>(), C);
+
+    // Gradient outputs; their buffers are allocated where the maps are built.
+    auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
+    auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
+
+    // With M = N * sample_size, inv_std = inv_var_arr, sums taken per channel:
+    //   d_bias  = sum(d_y)
+    //   d_scale = sum((x - mean) * inv_std * d_y)
+    //   d_x     = scale * inv_std / M *
+    //             (M * d_y - d_bias - (x - mean) * inv_std * d_scale)
+
+    EigenVectorArrayMap<T> d_bias_arr(d_bias->mutable_data<T>(ctx.GetPlace()),
+                                      C);
+    EigenVectorArrayMap<T> d_scale_arr(d_scale->mutable_data<T>(ctx.GetPlace()),
+                                       C);
+
+    d_bias_arr.setZero();
+    d_scale_arr.setZero();
+
+    const auto scale_inv_var_nhw = scale_arr * inv_var_arr / (N * sample_size);
+
+    switch (tensor_format) {
+      case TensorFormat::NCHW: {
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), sample_size, N * C);
+        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()),
+                                 sample_size, N * C);
+        d_x_arr.setZero();
+
+        for (int nc = 0; nc < N * C; ++nc) {
+          int c = nc % C;
+          d_bias_arr(c) += d_y_arr.col(nc).sum();
+          d_scale_arr(c) +=
+              ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc))
+                  .sum();
+        }
+        for (int nc = 0; nc < N * C; ++nc) {
+          int c = nc % C;
+          d_x_arr.col(nc) +=
+              scale_inv_var_nhw(c) *
+              (d_y_arr.col(nc) * N * sample_size - d_bias_arr(c) -
+               (x_arr.col(nc) - mean_arr(c)) * d_scale_arr(c) * inv_var_arr(c));
+        }
+        break;
+      }
+      case TensorFormat::NHWC: {
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), C, N * sample_size);
+        ConstEigenArrayMap<T> d_y_arr(d_y->data<T>(), C, N * sample_size);
+        EigenArrayMap<T> d_x_arr(d_x->mutable_data<T>(ctx.GetPlace()), C,
+                                 N * sample_size);
+        d_x_arr.setZero();
+
+        // Evaluate the reductions once; as lazy `auto` expressions they would
+        // be recomputed for every column visited by the loop below.
+        const auto d_y_row_sum = d_y_arr.rowwise().sum().eval();
+        const auto x_minus_mean = x_arr.colwise() - mean_arr;
+        const auto d_y_mul_x_minus_mean_row_sum =
+            (d_y_arr * x_minus_mean).rowwise().sum().eval();
+        const auto inv_var_sqr = inv_var_arr * inv_var_arr;
+        for (int nhw = 0; nhw < N * sample_size; ++nhw) {
+          d_bias_arr += d_y_arr.col(nhw);
+          d_scale_arr +=
+              (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw);
+          d_x_arr.col(nhw) +=
+              scale_inv_var_nhw *
+              (d_y_arr.col(nhw) * N * sample_size - d_y_row_sum -
+               x_minus_mean.col(nhw) * inv_var_sqr *
+                   d_y_mul_x_minus_mean_row_sum);
+        }
+        break;
+      }
+      default:
+        PADDLE_THROW("Unknown storage order: %s", tensor_format_str);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // short alias used by the macros below
+REGISTER_OP(batch_norm, ops::BatchNormOp, ops::BatchNormOpMaker,
+            batch_norm_grad, ops::BatchNormGradOp);
+REGISTER_OP_CPU_KERNEL(batch_norm,
+                       ops::BatchNormKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    batch_norm_grad,
+    ops::BatchNormGradKernel<paddle::platform::CPUPlace, float>);  // float only
diff --git a/paddle/operators/add_op.h b/paddle/operators/batch_norm_op.h
similarity index 52%
rename from paddle/operators/add_op.h
rename to paddle/operators/batch_norm_op.h
index a7307b6818aa3d10ff215d06281e2b53196fd101..4e80134a1acf3b4d66154453dd0ed709133d1c7c 100644
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/batch_norm_op.h
@@ -19,29 +19,31 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+enum TensorFormat {
+  NHWC = 0,  // batch_norm reads the channel count from the last axis
+  NCHW = 1,  // batch_norm reads the channel count from axis 1
+};
+
+// Maps a layout name to its TensorFormat. Only the all-upper-case and
+// all-lower-case spellings match; anything else goes to PADDLE_THROW.
+inline TensorFormat StringToTensorFormat(const std::string& str) {
+  const bool is_nhwc = (str == "NHWC" || str == "nhwc");
+  const bool is_nchw = (str == "NCHW" || str == "nchw");
+  if (is_nhwc) return TensorFormat::NHWC;
+  if (is_nchw) return TensorFormat::NCHW;
+  PADDLE_THROW("Unknown storage order string: %s", str);
+}
 
 template <typename Place, typename T>
-class AddKernel : public framework::OpKernel {
+class BatchNormKernel : public framework::OpKernel<T> {
  public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input0 = context.Input<Tensor>("X");
-    auto* input1 = context.Input<Tensor>("Y");
-    auto* output = context.Output<Tensor>("Out");
-
-    output->mutable_data<T>(context.GetPlace());
-
-    auto X = EigenVector<T>::Flatten(*input0);
-    auto Y = EigenVector<T>::Flatten(*input1);
-    auto Z = EigenVector<T>::Flatten(*output);
-
-    auto place = context.GetEigenDevice<Place>();
+  void Compute(const framework::ExecutionContext& ctx) const override;
+};
 
-    Z.device(place) = X + Y;
-  }
+template <typename Place, typename T>
+class BatchNormGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override;
 };
 
 }  // namespace operators
diff --git a/paddle/operators/batch_norm_op.md b/paddle/operators/batch_norm_op.md
new file mode 100644
index 0000000000000000000000000000000000000000..80948adf2b9047a9685dbdd90b2296b5a955f9c1
--- /dev/null
+++ b/paddle/operators/batch_norm_op.md
@@ -0,0 +1,134 @@
+# Batch Normalization
+
+## What is batch normalization
+
+Batch normalization is a frequently-used method in deep network training. It adjusts the mean and variance of a layer's output, and makes the data distribution easier for the next layer's training.
+
+The principle of batch normalization can be summarized into a simple function:
+
+```
+y = ((x - E[x]) / STD[x]) * scale + bias
+```
+
+`x` is a batch of output data of a certain layer. `E[x]` and `STD[x]` are the mean and standard deviation of `x`, respectively. `scale` and `bias` are two trainable parameters. Training a batch normalization layer amounts to learning the best values of `scale` and `bias`.
+
+In our design, we use a single operator(`batch_norm_op`) to implement the whole batch normalization in C++, and wrap it as a layer in Python.
+
+## Differences with normal operators
+
+`batch_norm_op` is a single operator. However, there are a few differences between `BatchNormOp` and normal operators, which we shall take into consideration in our design.
+
+1. `batch_norm_op` shall behave differently in training and inferencing. For example, during inferencing, there is no batch data and it's impossible to compute `E[x]` and `STD[x]`, so we have to use an `estimated_mean` and an `estimated_variance` instead of them. These require our framework to be able to inform operators current running type (training/inferencing), then operators can switch their behaviors.
+
+2. `batch_norm_op` shall have the ability to maintain `estimated_mean` and `estimated_variance` across mini-batch. In each mini-batch, `estimated_mean` is iterated by the following equations:
+
+```
+if batch_id == 0
+  estimated_mean = E[x]
+else
+  estimated_mean = estimated_mean * momentum + (1.0 - momentum) * E[x]
+```
+
+The iterating of `estimated_variance` is similar. `momentum` is an attribute, which controls estimated_mean updating speed.
+
+## Implementation
+
+Batch normalization is designed as a single operator in C++, and then wrapped as a layer in Python.
+
+### C++
+
+As most C++ operators do, `batch_norm_op` is defined by inputs, outputs, attributes and compute kernels.
+
+#### Inputs
+
+- `x`: The inputs data, which is generated by the previous layer.
+- `estimated_mean`: The estimated mean of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `E[x]`.
+- `estimated_var`: The estimated standard deviation of all previous data batches. It is updated in each forward propagation and will be used in inferencing to take the role of `STD[x]`.
+- `scale`: trainable parameter 'scale'
+- `bias`: trainable parameter 'bias'
+
+#### Outputs
+
+- `y`: The output data.
+- `batch_mean`: The mean value of batch data.
+- `batch_var`: The standard deviation value of batch data.
+- `saved_mean`: Updated `estimated_mean` with current batch data. It's supposed to share the memory with input `estimated_mean`.
+- `saved_var`: Updated `estimated_var` with current batch data. It's supposed to share the memory with input `estimated_var`.
+
+#### Attributes
+
+- `is_infer`: *bool*. If true, run `batch_norm_op` in inferencing mode.
+- `use_global_est`: *bool*. If true, use `saved_mean` and `saved_var` instead of `E[x]` and `STD[x]` in training.
+- `epsilon`: *float*. The epsilon value to avoid division by zero.
+- `momentum`: *float*. Factor used in `estimated_mean` and `estimated_var` updating. The usage is shown above.
+
+#### Kernels
+
+The following graph shows the training computational process of `batch_norm_op`:
+
+<img src="./images/batch_norm_op_kernel.png" width="800"/>
+
+cudnn provides APIs to finish the whole series of computation, we can use them in our GPU kernel.
+
+### Python
+
+`batch_norm_op` is wrapped as a layer in Python:
+
+```python 
+def batch_norm_layer(net, 
+                     input,
+                     output, 
+                     scale, 
+                     bias, 
+                     use_global_est = False, 
+                     epsilon = 1e-6,
+                     momentum = 0.99):
+	mean_cache = scope.new_var(name = 'estimated_mean', trainable = False)
+	var_cache = scope.new_var(name = 'estimated_var', trainable = False)
+	batch_mean = scope.new_var(name = 'batch_mean')
+	batch_var = scope.new_var(name = 'batch_var')
+	batch_norm_op = Operator('batch_norm_op',
+	                         x = input,
+	                         estimated_mean = mean_cache,
+	                         estimated_var = var_cache,
+	                         scale = scale,
+	                         bias = bias,
+	                         y = output,
+	                         batch_mean = batch_mean,
+	                         batch_var = batch_var,
+	                         saved_mean = mean_cache,
+	                         saved_var = var_cache,
+	                         is_infer = False,
+	                         use_global_est = use_global_est,
+	                         epsilon = epsilon,
+	                         momentum = momentum)
+	net.append_op(batch_norm_op)
+	return output
+```
+
+Because Python API has not been finally decided, the code above can be regarded as pseudo code. There are a few key points we shall note:
+
+1. `estimated_mean` and `estimated_var` are assigned the same variables as `saved_mean` and `saved_var` respectively, so they share the same memory. The output mean and variance values (`saved_mean` and `saved_var`) of a certain batch will be the inputs (`estimated_mean` and `estimated_var`) of the next batch.
+
+2. `is_infer` decides whether `batch_norm_op` will run in training mode or inferencing mode. However, a network may contain both training and inferencing parts, and the user may switch `batch_norm_op`'s running mode in a Python `for` loop like this:
+
+```python
+for pass_id in range(PASS_NUM):
+    # ...
+    net.train()  # run training model
+    if pass_id % 100 == 0:
+        net.infer(test_image)    # run inferencing model
+    # ...
+``` 
+
+`is_infer` is an attribute. Once an operator is created, its attributes cannot be changed. This suggests that we shall maintain two `batch_norm_op`s in the model: one whose `is_infer` is `True` (we call it `infer_batch_norm_op`) and another whose `is_infer` is `False` (we call it `train_batch_norm_op`). They share all parameters and variables, but are placed in two different branches. That is to say, if a network contains a `batch_norm_op`, it will fork into two branches: one goes through `train_batch_norm_op` and the other goes through `infer_batch_norm_op`:
+
+<div align=center>
+<img src="./images/batch_norm_fork.png" width="500"/>
+</div>
+
+Just like what is shown in the above graph, the net forks before `batch_norm_op` and will never merge again. All the operators after `batch_norm_op` will duplicate. 
+
+When the net runs in training mode, the end of the left branch will be set as the running target, so the dependency tracking process will ignore right branch automatically. When the net runs in inferencing mode, the process is reversed.
+
+How to set a target is related to Python API design, so I will leave it here waiting for more discussions.
diff --git a/paddle/operators/clip_op.cc b/paddle/operators/clip_op.cc
index b3dd060fd725fc9056b25e4affd82fdb345e77f7..f80204c6833d6436f2cf21610beea45b36787eea 100644
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
@@ -21,15 +21,14 @@ class ClipOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of ClipOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ClipOp should not be null.");
     auto x_dims = ctx->GetInputDim("X");
-    auto max = Attr<float>("max");
-    auto min = Attr<float>("min");
+    auto max = ctx->Attrs().Get<float>("max");
+    auto min = ctx->Attrs().Get<float>("min");
     PADDLE_ENFORCE_LT(min, max, "max should be greater than min.");
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -60,8 +59,7 @@ class ClipOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
diff --git a/paddle/operators/clip_op.h b/paddle/operators/clip_op.h
index ce1d4e1f460414e6e4acee4fa3207f309c55d86b..ac702e9935201ba5263a80ebeb1ab22fa0bd1340 100644
--- a/paddle/operators/clip_op.h
+++ b/paddle/operators/clip_op.h
@@ -56,7 +56,7 @@ class ClipGradFunctor {
 };
 
 template <typename Place, typename T>
-class ClipKernel : public framework::OpKernel {
+class ClipKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
@@ -73,7 +73,7 @@ class ClipKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class ClipGradKernel : public framework::OpKernel {
+class ClipGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto max = context.Attr<T>("max");
diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc
index 1ffa02c8f94c01a385d3ba376c1fd0dc3c1bd372..e11e51b4583817ef50cd447dbcf4c7202a152422 100644
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -23,8 +23,7 @@ class ConcatOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
                       "Inputs(X) of ConcatOp should be empty.")
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -82,8 +81,7 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
                const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
   }
 };
diff --git a/paddle/operators/concat_op.h b/paddle/operators/concat_op.h
index b37063261123bce1f22c39ab021e88f2faf58e9f..c113f19fb5cf806709bff845ee0f1078b34014bb 100644
--- a/paddle/operators/concat_op.h
+++ b/paddle/operators/concat_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ConcatKernel : public framework::OpKernel {
+class ConcatKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -44,7 +44,7 @@ class ConcatKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class ConcatGradKernel : public framework::OpKernel {
+class ConcatGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/cond_op.cc b/paddle/operators/cond_op.cc
index aaffa6661fe4686d09f20f0f0682219772638202..adcd867f502d166f851926fde602dbb3fed9b48e 100644
--- a/paddle/operators/cond_op.cc
+++ b/paddle/operators/cond_op.cc
@@ -14,12 +14,7 @@ limitations under the License. */
 
 #include "paddle/operators/cond_op.h"
 
-#include <cstring>
-#include <sstream>
-
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/gather.h"
-#include "paddle/operators/net_op.h"
 #include "paddle/operators/scatter.h"
 
 namespace paddle {
@@ -31,175 +26,183 @@ using Tensor = framework::Tensor;
 using LoDTensor = framework::LoDTensor;
 using DDim = framework::DDim;
 
-void CondOp::CreateScope(const Scope& scope) const {
+framework::Scope& CondOp::AddSubScope(const Scope& scope) const {
   auto sub_scopes_var = scope.FindVar("SubScopes");
   PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
                           "Output(SubScopes) of CondOp should not be null.");
   auto sub_scopes = sub_scopes_var->GetMutable<std::vector<Scope*>>();
   auto& sub_scope = scope.NewScope();
   sub_scopes->push_back(&sub_scope);
+  return sub_scope;
 }
 
-void CondOp::CreateIndexTensor(const Scope& scope) const {
+std::vector<framework::Scope*>& CondOp::GetSubScopes(
+    const framework::Scope& scope) const {
+  auto* holder = scope.FindVar("SubScopes");  // variable that stores the list
+  PADDLE_ENFORCE_NOT_NULL(holder,
+                          "Output(SubScopes) of CondOp should not be null.");
+  return *holder->GetMutable<std::vector<framework::Scope*>>();
+}
+
+LoDTensor& CondOp::AddIndexTensor(const Scope& scope) const {
   auto index_tensors_var = scope.FindVar("IndexTensors");
   PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                           "Output(IndexTensors) of CondOp should not be null.");
   auto& index_tensors =
       *index_tensors_var->GetMutable<std::vector<LoDTensor>>();
   index_tensors.push_back(LoDTensor());
+  return index_tensors.back();
 }
 
-void CondOp::InferShape(const Scope& scope) const {
-  auto sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto& sub_scopes = *sub_scopes_var->GetMutable<std::vector<Scope*>>();
-
-  for (int i = 0; i < 2; ++i) {
-    // Create two sub scopes for true and false branches
-    // sub_scopes[0] for the true branch and sub_scopes[1] for the false
-    // branch
-    CreateScope(scope);
-
-    // Create two tensors for true and false indices
-    // index_tensors[0] for the true branch and index_tensors[1] for the false
-    // branch
-    CreateIndexTensor(scope);
-
-    PADDLE_ENFORCE(!Inputs("Xs").empty(),
-                   "Inputs(Xs) of CondOp can't be empty.");
-    for (auto& input : Inputs("Xs")) {
-      // Create a new tensor in sub-scope for input-type tensor
-      Variable* v = sub_scopes[i]->NewVar(input);
-      LoDTensor* sub_input = v->GetMutable<LoDTensor>();
-      sub_input->Resize(scope.FindVar(input)->GetMutable<LoDTensor>()->dims());
-    }
-
-    for (auto& output : (*sub_net_op_[i]).Outputs()) {
-      for (auto& var_name : output.second) {
-        sub_scopes[i]->NewVar(var_name);
-      }
-    }
-
-    // each net calls InferShape
-    //    sub_net_op_[i]->InferShape(*sub_scopes[i]);
-  }
-
-  for (auto& output : Outputs("Outs")) {
-    LoDTensor* tensor_t_out =
-        sub_scopes[0]->FindVar(output)->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
-    LoDTensor* tensor_f_out =
-        sub_scopes[1]->FindVar(output)->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
-
-    auto* tensor_out_var = scope.FindVar(output);
-    PADDLE_ENFORCE_NOT_NULL(tensor_out_var, "Output not found");
-    LoDTensor* tensor_out = tensor_out_var->GetMutable<LoDTensor>();
-    PADDLE_ENFORCE_NOT_NULL(tensor_t_out,
-                            "True output tensor should not be NULL");
-
-    // check output size should be same
-    PADDLE_ENFORCE_EQ(tensor_t_out->dims(), tensor_f_out->dims(),
-                      "Outputs not of the same shape");
-    tensor_out->Resize(tensor_t_out->dims());
-    // tensor_out->mutable_data<float>(tensor_out->dims(),
-    // platform::CPUPlace());
-    tensor_out->mutable_data<float>(platform::CPUPlace());
-  }
-}
-
-void CondOp::Run(const Scope& scope,
-                 const platform::DeviceContext& dev_ctx) const {
-  auto* sub_scopes_var = scope.FindVar("SubScopes");
-  PADDLE_ENFORCE_NOT_NULL(sub_scopes_var,
-                          "Output(SubScopes) of CondOp should not be null.");
-  auto sub_scopes = sub_scopes_var->Get<std::vector<Scope*>>();
+std::vector<framework::LoDTensor>& CondOp::GetIndexTensors(
+    const framework::Scope& scope) const {
   auto* index_tensors_var = scope.FindVar("IndexTensors");
   PADDLE_ENFORCE_NOT_NULL(index_tensors_var,
                           "Output(IndexTensors) of CondOp should not be null.");
-  auto index_tensors = index_tensors_var->Get<std::vector<LoDTensor>>();
+  return *index_tensors_var->GetMutable<std::vector<framework::LoDTensor>>();
+}
 
-  std::string cond_name = Input("Cond");
-  Variable* cond_var = scope.FindVar(cond_name);
+void CondOp::PrepareDataForSubnet(
+    const framework::Scope& scope,
+    const platform::DeviceContext& dev_ctx) const {
+  PADDLE_ENFORCE(!Inputs("Xs").empty(), "Inputs(Xs) of CondOp can't be empty.");
+
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    // Create two sub scopes for true and false branches
+    //   sub_scopes[0] for the true branch
+    //   sub_scopes[1] for the false branch
+    AddSubScope(scope);
+    // Create two tensors for true and false indices:
+    //   index_tensors[0] for the true branch
+    //   index_tensors[1] for the false branch
+    AddIndexTensor(scope);
+  }
+
+  Variable* cond_var = scope.FindVar(Input("Cond"));
   PADDLE_ENFORCE_NOT_NULL(cond_var,
                           "Input(Cond) of CondOp should not be null.");
   const LoDTensor* cond = cond_var->GetMutable<LoDTensor>();
 
-  // Step 1: get the true/false index at runtime
-  // index_[0]: vector<int>, contains all index for cond[i] == true
-  // index_[1]: vector<int>, contains all index for cond[i] == false
-  for (int i = 0; i < 2; ++i) index_[i].clear();
+  // get the true/false index at runtime according to cond tensor
+  // index_vectors[0]: vector<int>, contains all index for cond[i] == true
+  // index_vectors[1]: vector<int>, contains all index for cond[i] == false
+  std::vector<std::vector<int>> index_vectors;
+  index_vectors.resize(BRANCH_NUM);
 
   const int* cond_data = cond->data<int>();
   for (int i = 0; i < cond->dims()[0]; ++i) {
     if (cond_data[i])
-      index_[0].push_back(i);
+      index_vectors[TRUE_BRANCH].push_back(i);
     else
-      index_[1].push_back(i);
+      index_vectors[FALSE_BRANCH].push_back(i);
   }
 
-  // put index_[0] and index_[1] into two tensors:
-  // index_tensor_[0] and index_tensor_[1]
-  DDim dim = paddle::framework::make_ddim({0});
-  for (int i = 0; i < 2; ++i) {
-    dim[0] = index_[i].size();
-    int* tmp_ptr =
+  // put index_vectors[0] and index_vectors[1] into two tensors:
+  // index_tensors[0] and index_tensors[1]
+  std::vector<framework::LoDTensor>& index_tensors = GetIndexTensors(scope);
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    DDim dim = {static_cast<int64_t>(index_vectors[i].size())};
+    int* index_tensor_data_ptr =
         index_tensors[i].mutable_data<int>(dim, platform::CPUPlace());
-    index_tensors[i].Resize(dim);
-    memcpy(tmp_ptr, index_[i].data(), dim[0] * sizeof(int));
+    memcpy(index_tensor_data_ptr, index_vectors[i].data(),
+           dim[0] * sizeof(int));
   }
 
-  // Step 2: collect data by calling gather
-  for (int i = 0; i < 2; ++i) {
-    // i= 0/i for True and False branches respectively
-    for (auto& input : Inputs("Xs")) {
-      // find Tensor
-      Variable* v = scope.FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
+  // create input in subscopes according to index_vectors
+  for (auto& input : Inputs("Xs")) {
+    Variable* var_parent = scope.FindVar(input);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    const auto* tensor_parent = &var_parent->Get<LoDTensor>();
 
-      v = sub_scopes[i]->FindVar(input);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(input);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = var_child->GetMutable<LoDTensor>();
 
       // Resize child
-      DDim dim = tensor_child->dims();
-      dim[0] = index_[i].size();
-      tensor_child->Resize(dim);
+      DDim dim = tensor_parent->dims();
+      dim[0] = index_tensors[i].dims()[0];
       tensor_child->mutable_data<float>(dim, platform::CPUPlace());
 
-      Gather<float>(dev_ctx.GetPlace(), tensor_parent, &index_tensors[i],
-                    tensor_child);
+      CPUGather<float>(dev_ctx, *tensor_parent, index_tensors[i], tensor_child);
     }
   }
 
-  // Step 3: run
-  for (int i = 0; i < 2; ++i) {
-    sub_net_op_[i]->Run(*sub_scopes[i], dev_ctx);
+  // create output_tensors in subscope for sub_net
+  for (int i = 0; i < BRANCH_NUM; ++i) {
+    for (auto& output : (*sub_net_op_[i]).Outputs()) {
+      for (auto& var_name : output.second) {
+        sub_scopes[i]->Var(var_name);
+      }
+    }
   }
+}
+
+// Sizes every Outs tensor from both branch outputs, then scatters them in.
+void CondOp::MergeDataFromSubnet(const framework::Scope& scope,
+                                 const platform::DeviceContext& dev_ctx) const {
+  std::vector<framework::Scope*>& sub_scopes = GetSubScopes(scope);
+  const std::vector<framework::LoDTensor>& index_tensors =
+      GetIndexTensors(scope);
 
-  // Step 4: merge output results
+  // Infer the output dim, out_dim[0] = true_dim[0] + false_dim[0]
   PADDLE_ENFORCE(!Outputs("Outs").empty(),
                  "Outputs(Outs) of CondOp can't be empty.");
-  for (int i = 0; i < 2; ++i) {
-    // i= 0/i for True and False branches respectively
-    for (auto& output : Outputs("Outs")) {
-      // find Tensor
-      Variable* v = scope.FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_parent = v->GetMutable<LoDTensor>();
-
-      v = sub_scopes[i]->FindVar(output);
-      PADDLE_ENFORCE_NOT_NULL(v);
-      LoDTensor* tensor_child = v->GetMutable<LoDTensor>();
-
-      ScatterUpdate<float>(dev_ctx.GetPlace(), tensor_child, &index_tensors[i],
+  for (auto& output : Outputs("Outs")) {
+    const LoDTensor* tensor_t_out =
+        &sub_scopes[TRUE_BRANCH]->FindVar(output)->Get<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_t_out, "True output should not be NULL");
+    const LoDTensor* tensor_f_out =
+        &sub_scopes[FALSE_BRANCH]->FindVar(output)->Get<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_f_out, "False output should not be NULL");
+
+    auto* var_out = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_out, "Output not found");
+    LoDTensor* tensor_out = var_out->GetMutable<LoDTensor>();
+    PADDLE_ENFORCE_NOT_NULL(tensor_out, "Output tensor should not be NULL");
+
+    DDim true_dim = tensor_t_out->dims();
+    DDim false_dim = tensor_f_out->dims();
+    true_dim[0] = 0;
+    false_dim[0] = 0;
+    PADDLE_ENFORCE_EQ(true_dim, false_dim,
+                      "Outputs not of the same shape except the first dim");
+
+    DDim out_dim = tensor_t_out->dims();
+    out_dim[0] = tensor_t_out->dims()[0] + tensor_f_out->dims()[0];
+    tensor_out->Resize(out_dim);
+    tensor_out->mutable_data<float>(platform::CPUPlace());
+  }
+
+  // merge output results: each branch's output goes through ScatterAssign
+  // with that branch's index tensor, writing into the parent-scope output
+  for (auto& output : Outputs("Outs")) {
+    Variable* var_parent = scope.FindVar(output);
+    PADDLE_ENFORCE_NOT_NULL(var_parent);
+    auto* tensor_parent = var_parent->GetMutable<LoDTensor>();
+
+    for (int i = 0; i < BRANCH_NUM; ++i) {
+      Variable* var_child = sub_scopes[i]->FindVar(output);
+      PADDLE_ENFORCE_NOT_NULL(var_child);
+      auto* tensor_child = &var_child->Get<LoDTensor>();
+      ScatterAssign<float>(dev_ctx, *tensor_child, index_tensors[i],
                            tensor_parent);
     }
   }
 }
 
+// Gathers inputs per branch, runs both sub-nets, then merges their outputs.
+void CondOp::Run(const Scope& scope,
+                 const platform::DeviceContext& dev_ctx) const {
+  PrepareDataForSubnet(scope, dev_ctx);
+  auto& branch_scopes = GetSubScopes(scope);
+  for (int branch = 0; branch < BRANCH_NUM; ++branch)
+    sub_net_op_[branch]->Run(*branch_scopes[branch], dev_ctx);
+  MergeDataFromSubnet(scope, dev_ctx);
+}
+
 class CondOpProtoAndCheckerMaker : public framework::OpProtoAndCheckerMaker {
  public:
   CondOpProtoAndCheckerMaker(framework::OpProto* proto,
diff --git a/paddle/operators/cond_op.h b/paddle/operators/cond_op.h
index 9a88ee35f108204348baddc57e0c0d8e63c3fb6d..93121fb31be287794249b5a62386d5a8dd268a0c 100644
--- a/paddle/operators/cond_op.h
+++ b/paddle/operators/cond_op.h
@@ -40,8 +40,7 @@ class CondOp : public framework::OperatorBase {
          const framework::VariableNameMap& outputs,
          const framework::AttributeMap& attrs)
       : OperatorBase(type, inputs, outputs, attrs) {
-    index_.resize(2);
-    sub_net_op_.resize(2);
+    sub_net_op_.resize(BRANCH_NUM);
   }
 
   CondOp(const CondOp& o)
@@ -51,42 +50,44 @@ class CondOp : public framework::OperatorBase {
     PADDLE_THROW("Not implemented");
   }
 
-  void CreateScope(const framework::Scope& scope) const;
+  framework::Scope& AddSubScope(const framework::Scope& scope) const;
+  std::vector<framework::Scope*>& GetSubScopes(
+      const framework::Scope& scope) const;
 
-  void CreateIndexTensor(const framework::Scope& scope) const;
+  framework::LoDTensor& AddIndexTensor(const framework::Scope& scope) const;
+  std::vector<framework::LoDTensor>& GetIndexTensors(
+      const framework::Scope& scope) const;
 
-  /*
-   * InferShape must be called before Run.
-   * FIXME(yuyang18): Since InferShape has been removed, this implementation
-   * could be wrong.
-   */
-  void InferShape(const framework::Scope& scope) const;
+  void PrepareDataForSubnet(const framework::Scope& scope,
+                            const platform::DeviceContext& dev_ctx) const;
+  void MergeDataFromSubnet(const framework::Scope& scope,
+                           const platform::DeviceContext& dev_ctx) const;
 
   /*
    * Set True Block
    */
   void set_truenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[0] = std::move(net);
+    sub_net_op_[TRUE_BRANCH] = std::move(net);
   }
 
   /*
    * Set False Block
    */
   void set_falsenet(std::unique_ptr<OperatorBase>&& net) {
-    sub_net_op_[1] = std::move(net);
+    sub_net_op_[FALSE_BRANCH] = std::move(net);
   }
 
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const override;
 
  private:
+  const int TRUE_BRANCH = 0;
+  const int FALSE_BRANCH = 1;
+  const int BRANCH_NUM = 2;
+
   // sub_net_op_[0]: subnet_t
   // sub_net_op_[1]: subnet_f
   std::vector<std::unique_ptr<framework::OperatorBase>> sub_net_op_;
-
-  // index_[0]: True_index;
-  // index_[1]: False_index;
-  mutable std::vector<std::vector<int>> index_;
 };
 
 }  // namespace operators
diff --git a/paddle/operators/conv2d_op.cc b/paddle/operators/conv2d_op.cc
index 5cc82944bb6b9a4fc5cd94cf2233ab84fc105fe7..1acb8415d0691df77047806d3c81b51cbb8c59f3 100644
--- a/paddle/operators/conv2d_op.cc
+++ b/paddle/operators/conv2d_op.cc
@@ -12,111 +12,91 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/gemm_conv2d_op.h"
+#include "paddle/operators/conv2d_op.h"
 
 namespace paddle {
 namespace operators {
 
-int outputSize(int input_size, int filter_size, int padding, int stride) {
-  int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
-  return output_size;
+void Conv2DOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of Conv2DOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of Conv2DOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of Conv2DOp should not be null.");
+
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  // Check the ranks before any dimension is indexed, so that a wrongly-ranked
+  // tensor is reported by these checks rather than by the indexing below.
+  PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
+  PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  int groups = ctx->Attrs().Get<int>("groups");
+  int input_channels = in_dims[1];
+  int output_channels = filter_dims[0];
+  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
+                    "The number of input channels should be equal to filter "
+                    "channels * groups.");
+  PADDLE_ENFORCE_EQ(
+      output_channels % groups, 0,
+      "The number of output channels should be divided by groups.");
+  // Output dims: {batch, filter dim 0, output height, output width}.
+  auto output_height =
+      OutputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
+  auto output_width =
+      OutputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
+  ctx->SetOutputDim("Output",
+                    {in_dims[0], filter_dims[0], output_height, output_width});
 }
 
-class Conv2DOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Input"),
-                   "Input(Input) of Conv2DOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Filter"),
-                   "Input(Filter) of Conv2DOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Output"),
-                   "Output(Output) of Conv2DOp should not be null.");
-
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    int groups = ctx->Attrs().Get<int>("groups");
-    int input_channels = in_dims[1];
-    int output_channels = filter_dims[0];
-
-    PADDLE_ENFORCE_EQ(in_dims.size(), 4, "Conv2DOp input should be 4-D.");
-    PADDLE_ENFORCE_EQ(filter_dims.size(), 4, "Conv2DOp filter should be 4-D.");
-    PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
-                      "The number of input channels should be equal to filter "
-                      "channels * groups.");
-    PADDLE_ENFORCE_EQ(
-        output_channels % groups, 0,
-        "The number of output channels should be divided by groups.");
-
-    auto output_height =
-        outputSize(in_dims[2], filter_dims[2], paddings[0], strides[0]);
-    auto output_width =
-        outputSize(in_dims[3], filter_dims[3], paddings[1], strides[1]);
-    ctx->SetOutputDim(
-        "Output", {in_dims[0], filter_dims[0], output_height, output_width});
-  }
-};
-
-class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  Conv2DOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput(
-        "Input",
-        "The input tensor of convolution operator. "
-        "The format of input tensor is NCHW. Where N is batch size, C is the "
-        "number of channels, H and W is the height and width of image.");
-    AddInput(
-        "Filter",
-        "The filter tensor of convolution operator."
-        "The format of the filter tensor is MCHW, where M is the number of "
-        "output image channels, C is the number of input image channels, "
-        "H and W is height and width of filter. "
-        "If the groups attribute is greater than 1, C equal the number of "
-        "input image channels divided by the groups.");
-    AddOutput("Output",
-              "The output tensor of convolution operator."
-              "The format of output tensor is also NCHW.");
-    AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
-        .SetDefault({1, 1});
-    AddAttr<std::vector<int>>("paddings", "paddings of convolution operator.")
-        .SetDefault({0, 0});
-    AddAttr<int>(
-        "groups",
-        "group size of convolution operator. "
-        "Refer to grouped convolution in Alex Krizhevsky's paper: "
-        "when group=2, the first half of the filters are only connected to the "
-        "first half of the input channels, and the second half only connected "
-        "to the second half.")
-        .SetDefault(1);
-    AddComment(R"DOC(
+Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
+                             framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "The input tensor of convolution operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of channels, H and W is the height and width of image.");
+  AddInput("Filter",
+           "The filter tensor of convolution operator."
+           "The format of the filter tensor is MCHW, where M is the number of "
+           "output image channels, C is the number of input image channels, "
+           "H and W is height and width of filter. "
+           "If the groups attribute is greater than 1, C equal the number of "
+           "input image channels divided by the groups.");
+  AddOutput("Output",
+            "The output tensor of convolution operator."
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>("strides", "strides of convolution operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>("paddings", "paddings of convolution operator.")
+      .SetDefault({0, 0});
+  AddAttr<int>(
+      "groups",
+      "group size of convolution operator. "
+      "Refer to grouped convolution in Alex Krizhevsky's paper: "
+      "when group=2, the first half of the filters are only connected to the "
+      "first half of the input channels, and the second half only connected "
+      "to the second half.")
+      .SetDefault(1);
+  AddComment(R"DOC(
 The convolution operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
 )DOC");
-  }
-};
-
-class Conv2DOpGrad : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+}
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    auto in_dims = ctx->GetInputDim("Input");
-    auto filter_dims = ctx->GetInputDim("Filter");
-    if (ctx->HasOutput(framework::GradVarName("Input"))) {
-      ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
-    }
-    if (ctx->HasOutput(framework::GradVarName("Filter"))) {
-      ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
-    }
+void Conv2DOpGrad::InferShape(framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
   }
-};
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
 
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/conv2d_op.cu b/paddle/operators/conv2d_op.cu
index 5df818ba0496a65502dde37fd1397ec56f8c1101..c697c9466d34c29af6976f3a4d2d0a24ba778ceb 100644
--- a/paddle/operators/conv2d_op.cu
+++ b/paddle/operators/conv2d_op.cu
@@ -12,7 +12,7 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#include "paddle/operators/gemm_conv2d_op.h"
+#include "paddle/operators/conv2d_op.h"
 
 namespace ops = paddle::operators;
 
diff --git a/paddle/operators/gemm_conv2d_op.h b/paddle/operators/conv2d_op.h
similarity index 76%
rename from paddle/operators/gemm_conv2d_op.h
rename to paddle/operators/conv2d_op.h
index 5c9e81732aa72211c2021382cf9a907880c53c17..0621389a79eee6b5e75b1eab309b49f8aa4a97ca 100644
--- a/paddle/operators/gemm_conv2d_op.h
+++ b/paddle/operators/conv2d_op.h
@@ -24,8 +24,38 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
+// Base convolution operator definitions for other conv-like
+// operators to reuse the implementation.
+inline int OutputSize(int input_size, int filter_size, int padding,
+                      int stride) {
+  // Output extent along one axis; the division is integer division.
+  return (input_size - filter_size + 2 * padding) / stride + 1;
+}
+
+// Define Op classes in .h file so that other conv
+// operator implementations can reuse the code.
+class Conv2DOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class Conv2DOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class Conv2DOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
 template <typename Place, typename T>
-class GemmConv2DKernel : public framework::OpKernel {
+class GemmConv2DKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -74,22 +104,21 @@ class GemmConv2DKernel : public framework::OpKernel {
 
     framework::DDim output_matrix_shape = {output_channels,
                                            output_height * output_width};
-
     // convolution operator: im2col + gemm
     int in_step = input_channels / groups;
     int out_step = output_channels / groups;
     for (int i = 0; i < batch_size; i++) {
-      Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
-      Tensor out_batch = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
+      Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
+      Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
       for (int g = 0; g < groups; g++) {
         // im2col
-        Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+        Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
         im2col(context.device_context(), in_slice, col, strides[0], strides[1],
-               paddings[0], paddings[1]);
+               paddings[0], paddings[0], paddings[1], paddings[1]);
 
         // gemm
-        Tensor out_slice = out_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-        Tensor filter_slice = filter.Slice<T>(g * out_step, (g + 1) * out_step);
+        Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
+        Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
         math::matmul<Place, T>(context.device_context(), filter_slice, false,
                                col_matrix, false, T(1.0), &out_slice, T(0.0));
       }
@@ -98,7 +127,7 @@ class GemmConv2DKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class GemmConvGrad2DKernel : public framework::OpKernel {
+class GemmConvGrad2DKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* input = context.Input<Tensor>("Input");
@@ -169,24 +198,23 @@ class GemmConvGrad2DKernel : public framework::OpKernel {
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
-            output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_grad_batch =
-            input_grad->Slice<T>(i, i + 1).Resize(input_shape);
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_grad_batch = input_grad->Slice(i, i + 1).Resize(input_shape);
         for (int g = 0; g < groups; g++) {
           // gemm
           Tensor out_grad_slice =
-              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-          Tensor filter_slice =
-              filter.Slice<T>(g * out_step, (g + 1) * out_step);
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
           math::matmul<Place, T>(context.device_context(), filter_slice, true,
                                  out_grad_slice, false, T(1.0), &col_matrix,
                                  T(0.0));
 
           // col2im
           Tensor in_grad_slice =
-              in_grad_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+              in_grad_batch.Slice(g * in_step, (g + 1) * in_step);
           col2im(context.device_context(), in_grad_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[1]);
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
         }
       }
     }
@@ -200,19 +228,20 @@ class GemmConvGrad2DKernel : public framework::OpKernel {
 
       for (int i = 0; i < batch_size; i++) {
         Tensor out_grad_batch =
-            output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
-        Tensor in_batch = input->Slice<T>(i, i + 1).Resize(input_shape);
+            output_grad->Slice(i, i + 1).Resize(output_matrix_shape);
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
         for (int g = 0; g < groups; g++) {
           // im2col
           Tensor out_grad_slice =
-              out_grad_batch.Slice<T>(g * out_step, (g + 1) * out_step);
-          Tensor in_slice = in_batch.Slice<T>(g * in_step, (g + 1) * in_step);
+              out_grad_batch.Slice(g * out_step, (g + 1) * out_step);
+          Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
           im2col(context.device_context(), in_slice, col, strides[0],
-                 strides[1], paddings[0], paddings[1]);
+                 strides[1], paddings[0], paddings[0], paddings[1],
+                 paddings[1]);
 
           // gemm
           Tensor filter_grad_slice =
-              filter_grad_.Slice<T>(g * out_step, (g + 1) * out_step);
+              filter_grad_.Slice(g * out_step, (g + 1) * out_step);
           math::matmul<Place, T>(context.device_context(), out_grad_slice,
                                  false, col_matrix, true, T(1.0),
                                  &filter_grad_slice, T(1.0));
diff --git a/paddle/operators/conv2dtranspose_op.cc b/paddle/operators/conv2dtranspose_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c1b231906e2f172b6f9cee55f850d1a5ec6c3221
--- /dev/null
+++ b/paddle/operators/conv2dtranspose_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv2dtranspose_op.h"
+
+namespace paddle {
+namespace operators {
+
+void Conv2DTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("Input"),
+                 "Input(Input) of Conv2DTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Filter"),
+                 "Input(Filter) of Conv2DTransposeOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Output"),
+                 "Output(Output) of Conv2DTransposeOp should not be null.");
+  // Output dims are derived below from the Input/Filter dims and the strides.
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+  // Reject any non-zero padding entry.
+  for (size_t i = 0; i < paddings.size(); ++i) {
+    PADDLE_ENFORCE_EQ(paddings[i], 0,
+                      "No Padding allowed in conv transpose op.");
+  }
+  // Both dims must be 4-D, and Input dim 1 must equal Filter dim 0.
+  PADDLE_ENFORCE_EQ(in_dims.size(), 4,
+                    "Conv2DTransposeOp input should be 4-D tensor.");
+  PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
+                    "Conv2DTransposeOp filter should be 4-D tensor.");
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
+                    "input and kernel input dimension should be equal.");
+  // Spatial size: (in - 1) * stride + filter; channels come from Filter dim 1.
+  auto output_height = (in_dims[2] - 1) * strides[0] + filter_dims[2];
+  auto output_width = (in_dims[3] - 1) * strides[1] + filter_dims[3];
+  ctx->SetOutputDim("Output",
+                    {in_dims[0], filter_dims[1], output_height, output_width});
+}
+
+Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
+    framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "Input",
+      "(Tensor) The input tensor of convolution transpose operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of input channels, H and W is the height and width of image.");
+  // Filter dim 0 is matched against the Input channels in InferShape (MCHW).
+  AddInput("Filter",
+           "(Tensor) The filter tensor of convolution transpose operator. "
+           "The format of the filter tensor is MCHW, where M is the number of "
+           "input image channels, C is the number of output image channels, "
+           "H and W is height and width of filter. We enforce groups "
+           "number == 1 and padding == 0 in convolution transpose Scenario.");
+  AddOutput("Output",
+            "(Tensor) The output tensor of convolution transpose operator. "
+            "The format of output tensor is also NCHW.");
+  AddAttr<std::vector<int>>("strides",
+                            "strides of convolution transpose operator.")
+      .SetDefault({1, 1});
+  AddAttr<std::vector<int>>("paddings",
+                            "paddings of convolution transpose operator.")
+      .SetDefault({0, 0});
+  AddComment(R"DOC(
+The convolution transpose operation calculates the output based on the input, filter
+and the strides and paddings parameters. The size of each dimension of the
+parameters is checked in the infer-shape.
+)DOC");
+}
+
+void Conv2DTransposeOpGrad::InferShape(
+    framework::InferShapeContext* ctx) const {
+  auto in_dims = ctx->GetInputDim("Input");
+  auto filter_dims = ctx->GetInputDim("Filter");
+  if (ctx->HasOutput(framework::GradVarName("Input"))) {
+    ctx->SetOutputDim(framework::GradVarName("Input"), in_dims);
+  }
+  if (ctx->HasOutput(framework::GradVarName("Filter"))) {
+    ctx->SetOutputDim(framework::GradVarName("Filter"), filter_dims);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(conv2dtranspose, ops::Conv2DTransposeOp,
+            ops::Conv2DTransposeOpMaker, conv2dtranspose_grad,
+            ops::Conv2DTransposeOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    conv2dtranspose,
+    ops::GemmConv2DTransposeKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv2dtranspose_grad,
+    ops::GemmConv2DTransposeGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv2dtranspose_op.cu b/paddle/operators/conv2dtranspose_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..761bc1959e69be94f43571728e6b92a322558b99
--- /dev/null
+++ b/paddle/operators/conv2dtranspose_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv2dtranspose_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(
+    conv2dtranspose,
+    ops::GemmConv2DTransposeKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv2dtranspose_grad,
+    ops::GemmConv2DTransposeGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/conv2dtranspose_op.h b/paddle/operators/conv2dtranspose_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8c70b3dcec1e26ab3d8a42d88040764c643b5ae6
--- /dev/null
+++ b/paddle/operators/conv2dtranspose_op.h
@@ -0,0 +1,254 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/im2col.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+
+// Define Op classes in .h file so that other conv transpose
+// operator implementations can reuse the code.
+class Conv2DTransposeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Conv2DTransposeOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker);
+};
+
+class Conv2DTransposeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class Conv2DTransposeOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+template <typename Place, typename T>
+class GemmConv2DTransposeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    // Work on a non-const Tensor object because the filter is Resize()d below.
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    Tensor* output = context.Output<Tensor>("Output");
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+
+    // TODO(Zhuoyuan): Paddings can be added in future.
+    // groups will always be disabled in conv2dtranspose.
+
+    const int batch_size = input->dims()[0];
+    const int m = input->dims()[1];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+
+    const int k_h = filter.dims()[2];
+    const int k_w = filter.dims()[3];
+
+    const int c = output->dims()[1];  // output channels
+    const int o_h = output->dims()[2];
+    const int o_w = output->dims()[3];
+
+    paddle::operators::math::Col2ImFunctor<
+        paddle::operators::math::ColFormat::kCFO, Place, T>
+        col2im;
+
+    // col is allocated with col_shape and handed to col2im below
+    DDim col_shape = {c, k_h, k_w, h, w};
+
+    // use col_matrix_shape in the gemm calculation
+    DDim col_matrix_shape = {c * k_h * k_w, h * w};
+
+    Tensor col;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+    // col_matrix shares the same piece of data with col,
+    // but will be reshaped into a two-dimensional matrix shape
+    // to call the matrix multiplication interface.
+    Tensor col_matrix;
+    col_matrix.ShareDataWith(col);
+    col_matrix.Resize(col_matrix_shape);
+
+    DDim output_shape = {c, o_h, o_w};
+    DDim input_matrix_shape = {m, h * w};
+
+    DDim filter_matrix_shape = {m, c * k_h * k_w};
+    filter.Resize(filter_matrix_shape);
+
+    // convolution transpose: gemm + col2im (similar to conv-backward on input)
+    // Allocate the output and fill it with zeros before the batch loop.
+    output->mutable_data<T>(context.GetPlace());
+    auto t = framework::EigenVector<T>::Flatten(*output);
+    t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+
+    for (int i = 0; i < batch_size; i++) {
+      // batch with size (m, h * w)
+      Tensor input_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+      // filter size: (m, c * k_h * k_w)
+
+      // output size: (c, o_h, o_w)
+      Tensor output_batch = output->Slice(i, i + 1).Resize(output_shape);
+
+      // col_matrix = filter^T * input_batch (filter is passed transposed),
+      // of shape (c * k_h * k_w, h * w)
+      math::matmul<Place, T>(context.device_context(), filter, true,
+                             input_batch, false, T(1.0), &col_matrix, T(0.0));
+      col2im(context.device_context(), output_batch, col, strides[0],
+             strides[1], 0, 0, 0, 0);
+    }
+  }
+};
+
+// Gradient kernel of conv2dtranspose: computes the Input and/or Filter
+// gradients from the Output gradient with im2col + gemm. A gradient is only
+// computed when its output tensor is present.
+template <typename Place, typename T>
+class GemmConv2DTransposeGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("Input");
+    const Tensor* output_grad =
+        context.Input<Tensor>(framework::GradVarName("Output"));
+
+    // For filter, we do not use const pointer b/c we will do reshape,
+    // but we should avoid modifying its value.
+    Tensor filter = *context.Input<Tensor>("Filter");
+
+    Tensor* input_grad =
+        context.Output<Tensor>(framework::GradVarName("Input"));
+    Tensor* filter_grad =
+        context.Output<Tensor>(framework::GradVarName("Filter"));
+
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    // Actually, no paddings and groups allowed in conv transpose.
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    const int batch_size = input->dims()[0];
+    const int m = input->dims()[1];
+    const int h = input->dims()[2];
+    const int w = input->dims()[3];
+
+    const int k_h = filter.dims()[2];
+    const int k_w = filter.dims()[3];
+
+    const int c = output_grad->dims()[1];  // output channels
+    const int o_h = output_grad->dims()[2];
+    const int o_w = output_grad->dims()[3];
+
+    // Only im2col functor required for bp to get to the right shape
+    paddle::operators::math::Im2ColFunctor<
+        paddle::operators::math::ColFormat::kCFO, Place, T>
+        im2col;
+
+    // col is allocated with col_shape and handed to im2col below
+    DDim col_shape = {c, k_h, k_w, h, w};
+
+    // col is the im2col scratch buffer; each branch below views the same data
+    // as a two-dimensional matrix to call the matrix multiplication interface.
+    Tensor col;
+    col.mutable_data<T>(col_shape, context.GetPlace());
+
+    DDim output_shape = {c, o_h, o_w};
+    DDim input_matrix_shape = {m, h * w};
+
+    DDim filter_matrix_shape = {m, c * k_h * k_w};
+    filter.Resize(filter_matrix_shape);
+
+    // convolution transpose grad on input:
+    // im2col + gemm (similar to conv-forward)
+    // only computed when the input gradient is requested
+    if (input_grad) {
+      Tensor col_matrix;
+      col_matrix.ShareDataWith(col);
+      DDim col_matrix_shape = {c * k_h * k_w, h * w};
+      col_matrix.Resize(col_matrix_shape);
+
+      input_grad->mutable_data<T>(context.GetPlace());
+      auto t = framework::EigenVector<T>::Flatten(*input_grad);
+      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+
+      for (int i = 0; i < batch_size; i++) {
+        // batch with size (c, o_h, o_w)
+        Tensor output_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_shape);
+        // filter of size (m, c * k_h * k_w)
+
+        // batch with size (m, h * w)
+        Tensor input_grad_batch =
+            input_grad->Slice(i, i + 1).Resize(input_matrix_shape);
+
+        // im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
+        im2col(context.device_context(), output_grad_batch, col, strides[0],
+               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
+
+        // gemm: dx = filter * dy
+        // (m, c * k_h * k_w) * (c * k_h * k_w, h * w) -> (m, h * w)
+        math::matmul<Place, T>(context.device_context(), filter, false,
+                               col_matrix, false, T(1.0), &input_grad_batch,
+                               T(0.0));
+      }
+    }
+
+    // filter gradient required
+    if (filter_grad) {
+      Tensor col_matrix_f;
+      col_matrix_f.ShareDataWith(col);
+      // col is laid out as col_shape, i.e. (c * k_h * k_w, h * w) in 2-D.
+      DDim col_matrix_shape_f = {c * k_h * k_w, h * w};
+      col_matrix_f.Resize(col_matrix_shape_f);
+
+      filter_grad->mutable_data<T>(context.GetPlace());
+      Tensor filter_grad_ = *filter_grad;
+      filter_grad_.Resize(filter_matrix_shape);
+      auto t = framework::EigenVector<T>::Flatten(filter_grad_);
+      t.device(context.GetEigenDevice<Place>()) = t.constant(static_cast<T>(0));
+
+      for (int i = 0; i < batch_size; ++i) {
+        // batch with size (c, o_h, o_w)
+        Tensor output_grad_batch =
+            output_grad->Slice(i, i + 1).Resize(output_shape);
+        // input batch with size (m, h * w)
+        Tensor in_batch = input->Slice(i, i + 1).Resize(input_matrix_shape);
+
+        // im2col: dy from (c, o_h, o_w) -> (c * k_h * k_w, h * w)
+        im2col(context.device_context(), output_grad_batch, col, strides[0],
+               strides[1], paddings[0], paddings[0], paddings[1], paddings[1]);
+
+        // gemm: d_filter += x * col^T (accumulated over the batch, beta = 1)
+        // (m, h * w) * (h * w, c * k_h * k_w) -> (m, c * k_h * k_w)
+        math::matmul<Place, T>(context.device_context(), in_batch, false,
+                               col_matrix_f, true, T(1.0), &filter_grad_,
+                               T(1.0));
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4288f300dd5b0464f2b4394cdb0b44f93060ae74
--- /dev/null
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv2d_op.h"
+
+namespace paddle {
+namespace operators {
+
+class CudnnConvOpMaker : public Conv2DOpMaker {  // conv2d attrs + cuDNN extras
+ public:
+  CudnnConvOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
+      : Conv2DOpMaker(proto, op_checker) {
+    AddAttr<std::vector<int>>("dilations", "dilations of convolution operator.")
+        .SetDefault(std::vector<int>{1, 1});
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be carefully set.")
+        .SetDefault(4096);  // in MB; 4096 * 2^20 bytes does not fit in an int
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // CPU path reuses the GEMM conv kernels
+REGISTER_OP(conv_cudnn, ops::Conv2DOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
+            ops::Conv2DOpGrad);
+REGISTER_OP_CPU_KERNEL(
+    conv_cudnn, ops::GemmConv2DKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv_cudnn_grad,
+    ops::GemmConvGrad2DKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv_cudnn_op.cu b/paddle/operators/conv_cudnn_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..366d0323b840c338dd6ba5b28bdb29fd135fe91a
--- /dev/null
+++ b/paddle/operators/conv_cudnn_op.cu
@@ -0,0 +1,277 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/memory/memory.h"
+#include "paddle/operators/conv2d_op.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cudnn_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
+using ScopedFilterDescriptor = platform::ScopedFilterDescriptor;
+using ScopedConvolutionDescriptor = platform::ScopedConvolutionDescriptor;
+using DataLayout = platform::DataLayout;
+using CUDADeviceContext = platform::CUDADeviceContext;
+// Workspace cap in bytes (1 GiB) used unless workspace_size_MB is positive.
+static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES = 1024 * 1024 * 1024;
+
+// NOTE: framework::vectorize yields int64_t elements, which does not fit
+//       cudnn inputs, so the dims are copied into a vector of int instead.
+std::vector<int> Dims2Vector(const framework::DDim& dims) {
+  const int rank = dims.size();
+  std::vector<int> as_int;
+  as_int.reserve(rank);
+  for (int axis = 0; axis < rank; ++axis) as_int.push_back(dims[axis]);
+  return as_int;
+}
+
+template <typename T>
+class CudnnConvOpKernel : public framework::OpKernel<T> {  // cuDNN forward conv
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto* input = ctx.Input<Tensor>("Input");
+    auto* filter = ctx.Input<Tensor>("Filter");
+    auto* output = ctx.Output<Tensor>("Output");
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
+    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+
+    const T* input_data = input->data<T>();
+    const T* filter_data = filter->data<T>();
+    T* output_data = output->mutable_data<T>(ctx.GetPlace());
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_desc;
+    ScopedFilterDescriptor filter_desc;
+    ScopedConvolutionDescriptor conv_desc;
+    DataLayout layout = DataLayout::kNCHW;
+
+    cudnnTensorDescriptor_t cudnn_input_desc =
+        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_output_desc =
+        output_desc.descriptor<T>(layout, Dims2Vector(output->dims()), groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc =
+        filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+    int input_channels = input->dims()[1];
+    int input_height = input->dims()[2];
+    int input_width = input->dims()[3];
+    int output_channels = output->dims()[1];
+    int output_height = output->dims()[2];
+    int output_width = output->dims()[3];
+    // Element offsets between consecutive groups in each buffer.
+    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_out =
+        output_channels / groups * output_height * output_width;
+    int group_offset_filter = filter->numel() / groups;
+    // ------------------- cudnn conv workspace ---------------------
+    void* cudnn_workspace = nullptr;
+    size_t workspace_size_in_bytes;  // final workspace to allocate.
+    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
+    if (user_workspace_size > 0) {  // widen first: 4096 MB overflows an int
+      workspace_size_limit = static_cast<size_t>(user_workspace_size) << 20;
+    }
+    // ------------------- cudnn conv algorithm ---------------------
+    cudnnConvolutionFwdAlgo_t algo;
+    auto handle = ctx.cuda_device_context().cudnn_handle();
+
+    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
+        workspace_size_limit, &algo));
+    // Query the workspace size required by the chosen algorithm.
+    PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
+        handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
+        cudnn_output_desc, algo, &workspace_size_in_bytes));
+    // Allocate on GPU memory
+    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    // ------------------- cudnn conv forward ---------------------
+    T alpha = 1.0f, beta = 0.0f;
+    for (int i = 0; i < groups; i++) {  // one cuDNN call per group slice
+      PADDLE_ENFORCE(platform::dynload::cudnnConvolutionForward(
+          handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
+          cudnn_filter_desc, filter_data + i * group_offset_filter,
+          cudnn_conv_desc, algo, cudnn_workspace, workspace_size_in_bytes,
+          &beta, cudnn_output_desc, output_data + i * group_offset_out));
+    }
+    // Release the cudnn workspace
+    paddle::memory::Free(gpu, cudnn_workspace);
+  }
+};
+
+template <typename T>
+class CudnnConvGradOpKernel : public framework::OpKernel<T> {  // cuDNN backward
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "It must use GPUPlace.");
+    auto input = ctx.Input<Tensor>("Input");
+    auto filter = ctx.Input<Tensor>("Filter");
+    auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
+    auto input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
+    auto filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
+
+    const T* input_data = input->data<T>();
+    const T* output_grad_data = output_grad->data<T>();
+    const T* filter_data = filter->data<T>();
+
+    std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
+    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
+    int groups = ctx.Attr<int>("groups");
+    int user_workspace_size = ctx.Attr<int>("workspace_size_MB");
+
+    // ------------------- cudnn descriptors ---------------------
+    ScopedTensorDescriptor input_desc;
+    ScopedTensorDescriptor output_grad_desc;
+    ScopedTensorDescriptor input_grad_desc;
+
+    ScopedFilterDescriptor filter_desc;
+    ScopedFilterDescriptor filter_grad_desc;
+    ScopedConvolutionDescriptor conv_desc;
+    DataLayout layout = DataLayout::kNCHW;
+
+    cudnnTensorDescriptor_t cudnn_input_desc =
+        input_desc.descriptor<T>(layout, Dims2Vector(input->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_output_grad_desc =
+        output_grad_desc.descriptor<T>(layout, Dims2Vector(output_grad->dims()),
+                                       groups);
+    cudnnFilterDescriptor_t cudnn_filter_desc =
+        filter_desc.descriptor<T>(layout, Dims2Vector(filter->dims()), groups);
+    cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr;
+    cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr;
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+    int input_channels = input->dims()[1];
+    int input_height = input->dims()[2];
+    int input_width = input->dims()[3];
+    int output_grad_channels = filter->dims()[0];
+    int output_grad_height = output_grad->dims()[2];
+    int output_grad_width = output_grad->dims()[3];
+    // Element offsets between consecutive groups in each buffer.
+    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_out =
+        output_grad_channels / groups * output_grad_height * output_grad_width;
+    int group_offset_filter = filter->numel() / groups;
+    // ------------------- cudnn backward algorithm ---------------------
+    cudnnConvolutionBwdDataAlgo_t data_algo;
+    cudnnConvolutionBwdFilterAlgo_t filter_algo;
+    size_t workspace_size_in_bytes = 0, tmp_size = 0;
+    size_t workspace_size_limit = kCONV_CUDNN_WORKSPACE_LIMIT_BYTES;
+    if (user_workspace_size > 0) {  // widen first: 4096 MB overflows an int
+      workspace_size_limit = static_cast<size_t>(user_workspace_size) << 20;
+    }
+
+    auto handle = ctx.cuda_device_context().cudnn_handle();
+    if (input_grad) {
+      cudnn_input_grad_desc = input_grad_desc.descriptor<T>(
+          layout, Dims2Vector(input_grad->dims()), groups);
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm(
+              handle, cudnn_filter_desc,
+              // dyDesc: Handle to the previously initialized input differential
+              // tensor descriptor.
+              cudnn_output_grad_desc, cudnn_conv_desc,
+              // dxDesc: Handle to the previously initialized output tensor
+              // descriptor.
+              cudnn_input_grad_desc,
+              CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &data_algo));
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
+              handle, cudnn_filter_desc, cudnn_output_grad_desc,
+              cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+    }
+
+    if (filter_grad) {
+      cudnn_filter_grad_desc = filter_grad_desc.descriptor<T>(
+          layout, Dims2Vector(filter_grad->dims()), groups);
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm(
+              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
+              cudnn_filter_desc,
+              CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT,
+              workspace_size_limit, &filter_algo));
+
+      PADDLE_ENFORCE(
+          platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
+              handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc,
+              cudnn_filter_desc, filter_algo, &tmp_size));
+      workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size);
+    }
+    // ------------------- cudnn conv workspace ---------------------
+    // One buffer, sized for the larger of the two passes, is shared by both.
+    void* cudnn_workspace = nullptr;
+    platform::GPUPlace gpu = boost::get<platform::GPUPlace>(ctx.GetPlace());
+    cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes);
+    // ------------------- cudnn conv backward data ---------------------
+    // FIXME(typhoonzero): template type T may not be the same as cudnn call.
+    T alpha = 1.0f, beta = 0.0f;
+    if (input_grad) {
+      T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
+      auto t = framework::EigenVector<T>::Flatten(*input_grad);
+      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
+          t.constant(static_cast<T>(0));
+      for (int i = 0; i < groups; i++) {  // one cuDNN call per group slice
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
+            handle, &alpha, cudnn_filter_desc,
+            filter_data + i * group_offset_filter, cudnn_output_grad_desc,
+            output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo,
+            cudnn_workspace, workspace_size_in_bytes, &beta,
+            cudnn_input_grad_desc, input_grad_data + i * group_offset_in));
+      }
+    }
+    // ------------------- cudnn conv backward filter ---------------------
+    if (filter_grad) {
+      T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
+      auto t = framework::EigenVector<T>::Flatten(*filter_grad);
+      t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
+          t.constant(static_cast<T>(0));
+      for (int i = 0; i < groups; i++) {  // one cuDNN call per group slice
+        PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
+            handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
+            cudnn_output_grad_desc, output_grad_data + i * group_offset_out,
+            cudnn_conv_desc, filter_algo, cudnn_workspace,
+            workspace_size_in_bytes, &beta, cudnn_filter_grad_desc,
+            filter_grad_data + i * group_offset_filter));
+      }
+    }
+    // Release the cudnn workspace
+    paddle::memory::Free(gpu, cudnn_workspace);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>);
+REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,  // only float kernels are registered
+                       paddle::operators::CudnnConvGradOpKernel<float>);
diff --git a/paddle/operators/conv_shift_op.cc b/paddle/operators/conv_shift_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6156a2d6af9a010240449a7c944ec0caffc85189
--- /dev/null
+++ b/paddle/operators/conv_shift_op.cc
@@ -0,0 +1,204 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_shift_op.h"
+#include "paddle/framework/eigen.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+class ConvShiftOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+    // Shape contract: X is B x M, Y is B x N with N odd and N <= M.
+    auto x_shape = ctx->GetInputDim("X");
+    auto y_shape = ctx->GetInputDim("Y");
+    PADDLE_ENFORCE_EQ(x_shape.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(y_shape.size(), 2, "Input(Y)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_shape[0], y_shape[0],
+                      "The 1st dimension of Input(X) and Input(Y) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(y_shape[1] % 2, 1,
+                      "The 2nd dimension of Input(Y) should be odd.");
+    PADDLE_ENFORCE_LE(y_shape[1], x_shape[1],
+                      "The 2nd dimension of Input(Y) should be less than or "
+                      "equal to the 2nd dimension of Input(X).");
+    ctx->SetOutputDim("Out", x_shape);  // Out has the same shape as X
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class ConvShiftGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+
+    // Each gradient output is optional; when present it takes the shape of
+    // the forward input it belongs to.
+    const auto dx_name = framework::GradVarName("X");
+    if (ctx->HasOutput(dx_name)) {
+      ctx->SetOutputDim(dx_name, ctx->GetInputDim("X"));
+    }
+
+    const auto dy_name = framework::GradVarName("Y");
+    if (ctx->HasOutput(dy_name)) {
+      ctx->SetOutputDim(dy_name, ctx->GetInputDim("Y"));
+    }
+  }
+};
+
+class ConvShiftOpMaker : public framework::OpProtoAndCheckerMaker {  // op proto
+ public:
+  ConvShiftOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
+             "where B is the batch size and M is the data dimension.");
+    AddInput("Y",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape B x N, "
+             "where B is the batch size and N is the data dimension. N must "
+             "be odd.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape B x M, "
+              "i.e., the same shape as X.");
+    AddComment(R"DOC(
+ConvShift Operator.
+
+A layer for circular convolution of two vectors,
+as used in the Neural Turing Machine: https://arxiv.org/abs/1410.5401
+
+The equation is:
+
+  \f[
+      Out[i] = \sum_{j=-(N-1)/2}^{(N-1)/2} X_{i+j} * Y_{j}
+  \f]
+
+where X's index is computed modulo M, and Y's index is computed modulo N.
+
+Both of the input `X` and `Y` can carry LoD (Level of Details) information.
+However, the output only shares the LoD information with input `X`.
+)DOC");
+  }
+};
+
+template <typename T>
+class ConvShiftKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<Tensor>("X");
+    auto *Y = context.Input<Tensor>("Y");
+    auto *Out = context.Output<Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+
+    auto x_mat = EigenMatrix<T>::From(*X);
+    auto y_mat = EigenMatrix<T>::From(*Y);
+    auto out_mat = EigenMatrix<T>::From(*Out);
+    out_mat.setZero();  // every entry is accumulated into below
+
+    const size_t rows = X->dims()[0];
+    const size_t x_cols = X->dims()[1];
+    const size_t y_cols = Y->dims()[1];
+    const size_t y_half = (y_cols - 1) / 2;
+    // Row by row: slide y over x circularly, centred on output column c.
+    for (size_t r = 0; r < rows; ++r) {
+      for (size_t c = 0; c < x_cols; ++c) {
+        for (size_t t = 0; t < y_cols; ++t) {
+          int src = (c + t - y_half + x_cols) % x_cols;
+          out_mat(r, c) += x_mat(r, src) * y_mat(r, t);
+        }
+      }
+    }
+  }
+};
+
+template <typename T>
+class ConvShiftGradKernel<platform::CPUPlace, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    auto *X = context.Input<Tensor>("X");
+    auto *Y = context.Input<Tensor>("Y");
+    auto *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto *dX = context.Output<Tensor>(framework::GradVarName("X"));
+    auto *dY = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    auto x_mat = EigenMatrix<T>::From(*X);
+    auto y_mat = EigenMatrix<T>::From(*Y);
+    auto dout_mat = EigenMatrix<T>::From(*dOut);
+
+    // Loop bounds: X is (rows x x_cols), Y is (rows x y_cols); y_half is the
+    // offset of the centre tap of y.
+    const size_t rows = X->dims()[0];
+    const size_t x_cols = X->dims()[1];
+    const size_t y_cols = Y->dims()[1];
+    const size_t y_half = (y_cols - 1) / 2;
+
+    // The two loop nests are deliberately duplicated so that the test for
+    // which gradient is requested stays outside of the loops.
+    if (dX) {
+      dX->mutable_data<T>(context.GetPlace());
+      auto dx_mat = EigenMatrix<T>::From(*dX);
+      dx_mat.setZero();
+      for (size_t r = 0; r < rows; ++r) {
+        for (size_t c = 0; c < x_cols; ++c) {
+          for (size_t t = 0; t < y_cols; ++t) {
+            int src = (c + t - y_half + x_cols) % x_cols;
+            dx_mat(r, src) += dout_mat(r, c) * y_mat(r, t);
+          }
+        }
+      }
+    }
+
+    if (dY) {
+      dY->mutable_data<T>(context.GetPlace());
+      auto dy_mat = EigenMatrix<T>::From(*dY);
+      dy_mat.setZero();
+      for (size_t r = 0; r < rows; ++r) {
+        for (size_t c = 0; c < x_cols; ++c) {
+          for (size_t t = 0; t < y_cols; ++t) {
+            int src = (c + t - y_half + x_cols) % x_cols;
+            dy_mat(r, t) += x_mat(r, src) * dout_mat(r, c);
+          }
+        }
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // CPU kernels are registered for float
+REGISTER_OP(conv_shift, ops::ConvShiftOp, ops::ConvShiftOpMaker,
+            conv_shift_grad, ops::ConvShiftGradOp);
+REGISTER_OP_CPU_KERNEL(conv_shift,
+                       ops::ConvShiftKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/conv_shift_op.cu b/paddle/operators/conv_shift_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..145e966fe9caa68f7485bb258fa78fd34bfd4c04
--- /dev/null
+++ b/paddle/operators/conv_shift_op.cu
@@ -0,0 +1,194 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/conv_shift_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+namespace {
+// Rounds x / y up to the next integer (intended for x >= 0 and y > 0).
+inline int div_up(int x, int y) { return (x + y - 1) / y; }
+
+// Forward kernel: out[k, i] = sum_j x[k, (i + j - y_half_width) mod x_width] *
+// y[k, j]. Launch with grid (ceil(x_width / blockDim.x), batch_size) and
+// (blockDim.x + 2 * y_width) * sizeof(T) bytes of dynamic shared memory.
+//
+// Each thread is responsible for computing a single output out[k, i].
+// Thread blocks are based on tiles of x with height 1 in the batch dimension.
+// The design assumes the typical case of a fairly small filter y; for large
+// y, it would probably be more efficient to also tile across y.
+template <typename T>
+__global__ void conv_shift_forward(const T *x, const T *y, T *out, int x_width,
+                                   int y_width, int y_half_width,
+                                   int batch_size) {
+  extern __shared__ T mem[];
+
+  int tx = threadIdx.x;
+  int i = blockIdx.x * blockDim.x + tx;  // global x index
+  int k = blockIdx.y;                    // batch index
+
+  // Number of x's this block handles: blockDim.x, except for a trailing
+  // partial block. (x_width % blockDim.x would be 0 for an exact multiple.)
+  int num_x = min(static_cast<int>(blockDim.x),
+                  x_width - static_cast<int>(blockIdx.x * blockDim.x));
+
+  T *sx = mem;
+  T *sx_pad = &mem[num_x];
+  T *sy = &mem[blockDim.x + y_width];
+
+  // Collaboratively load y[k, :] and length-y padding of x into shared memory.
+  int pad_start = blockIdx.x * blockDim.x + num_x + x_width - y_half_width;
+  for (int j = tx; j < y_width; j += blockDim.x) {
+    sy[j] = y[k * y_width + j];
+    sx_pad[j] = x[k * x_width + (pad_start + j) % x_width];
+  }
+
+  // Load a cyclically shifted slice of x into shared memory. Threads past
+  // num_x skip the load but still reach the barrier below.
+  if (tx < num_x) {
+    int load_i = (i - y_half_width + x_width) % x_width;
+    sx[tx] = x[k * x_width + load_i];
+  }
+  __syncthreads();
+  if (tx >= num_x) return;
+
+  // Compute dot product of sx[tx:tx + y_width] and sy.
+  T sum = 0;
+  for (int j = 0; j < y_width; ++j) {
+    sum += sx[tx + j] * sy[j];
+  }
+
+  // Save to out[k, i].
+  out[k * x_width + i] = sum;
+}
+
+// Gradient w.r.t. x: naive version, one atomic add per (batch, y, x) triple.
+template <typename T>
+__global__ void conv_shift_dx(const T *dout, const T *y, T *dx, int x_width,
+                              int y_width, int y_half_width, int batch_size) {
+  const int col = blockIdx.x * blockDim.x + threadIdx.x;  // x index
+  const int tap = blockIdx.y;                             // y index
+  const int row = blockIdx.z;                             // batch index
+  if (col >= x_width) return;  // thread lies past the end of the row
+
+  // Scatter this product to the x position that fed out[row, col] via tap.
+  const int src = (col + tap - y_half_width + x_width) % x_width;
+  const T contribution = dout[row * x_width + col] * y[row * y_width + tap];
+  atomicAdd(&dx[row * x_width + src], contribution);
+}
+
+// Gradient w.r.t. y: naive version, one atomic add per (batch, y, x) triple.
+template <typename T>
+__global__ void conv_shift_dy(const T *x, const T *dout, T *dy, int x_width,
+                              int y_width, int y_half_width, int batch_size) {
+  const int col = blockIdx.x * blockDim.x + threadIdx.x;  // x index
+  const int tap = blockIdx.y;                             // y index
+  const int row = blockIdx.z;                             // batch index
+  if (col >= x_width) return;  // thread lies past the end of the row
+
+  // All columns of a row accumulate into the same dy[row, tap] entry.
+  const int src = (col + tap - y_half_width + x_width) % x_width;
+  const T contribution = x[row * x_width + src] * dout[row * x_width + col];
+  atomicAdd(&dy[row * y_width + tap], contribution);
+}
+}  // namespace
+
+template <typename T>
+class ConvShiftKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Y = context.Input<Tensor>("Y");
+    Tensor *Out = context.Output<Tensor>("Out");
+    const T *x_data = X->data<T>();
+    const T *y_data = Y->data<T>();
+    T *out_data = Out->mutable_data<T>(context.GetPlace());
+
+    const int batch_size = X->dims()[0];
+    const int x_width = X->dims()[1];
+    const int y_width = Y->dims()[1];
+    const int y_half_width = (y_width - 1) / 2;
+
+    // Each block covers up to `threads` columns of one row of X; its shared
+    // memory holds that tile plus two y_width-long regions (padding and y).
+    const int threads = 256;
+    const int shared_bytes = (threads + 2 * y_width) * sizeof(T);
+    const dim3 blocks(div_up(x_width, threads), batch_size);
+
+    const auto &dev_ctx = reinterpret_cast<const platform::CUDADeviceContext &>(
+        context.device_context());
+    auto stream = dev_ctx.stream();
+
+    conv_shift_forward<T><<<blocks, threads, shared_bytes, stream>>>(
+        x_data, y_data, out_data, x_width, y_width, y_half_width, batch_size);
+  }
+};
+
+template <typename T>
+class ConvShiftGradKernel<platform::GPUPlace, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const Tensor *X = context.Input<Tensor>("X");
+    const Tensor *Y = context.Input<Tensor>("Y");
+    const Tensor *dOut = context.Input<Tensor>(framework::GradVarName("Out"));
+    const T *x_data = X->data<T>();
+    const T *y_data = Y->data<T>();
+    const T *dout_data = dOut->data<T>();
+
+    Tensor *dX = context.Output<Tensor>(framework::GradVarName("X"));
+    Tensor *dY = context.Output<Tensor>(framework::GradVarName("Y"));
+
+    const int batch_size = X->dims()[0];
+    const int x_width = X->dims()[1];
+    const int y_width = Y->dims()[1];
+    const int y_half_width = (y_width - 1) / 2;
+
+    const auto &dev_ctx = reinterpret_cast<const platform::CUDADeviceContext &>(
+        context.device_context());
+    auto stream = dev_ctx.stream();
+
+    // Outputs are zeroed first: the kernels accumulate into them via atomicAdd.
+    const int threads = 256;
+    const dim3 blocks(div_up(x_width, threads), y_width, batch_size);
+
+    if (dX) {
+      T *dx_data = dX->mutable_data<T>(context.GetPlace());
+      cudaMemsetAsync(dx_data, 0, dX->numel() * sizeof(T), stream);
+      conv_shift_dx<T><<<blocks, threads, 0, stream>>>(
+          dout_data, y_data, dx_data, x_width, y_width, y_half_width,
+          batch_size);
+    }
+    if (dY) {
+      T *dy_data = dY->mutable_data<T>(context.GetPlace());
+      cudaMemsetAsync(dy_data, 0, dY->numel() * sizeof(T), stream);
+      conv_shift_dy<T><<<blocks, threads, 0, stream>>>(
+          x_data, dout_data, dy_data, x_width, y_width, y_half_width,
+          batch_size);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // GPU kernels are registered for float
+REGISTER_OP_GPU_KERNEL(conv_shift,
+                       ops::ConvShiftKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    conv_shift_grad,
+    ops::ConvShiftGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/conv_shift_op.h b/paddle/operators/conv_shift_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a160b0f1696c70868fc48d219b38cde2018e8a3
--- /dev/null
+++ b/paddle/operators/conv_shift_op.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Forward kernel; Compute is only declared here and is defined per Place.
+template <typename Place, typename T>
+class ConvShiftKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+// Gradient kernel; Compute is only declared here and is defined per Place.
+template <typename Place, typename T>
+class ConvShiftGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override;
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/cos_sim_op.cc b/paddle/operators/cos_sim_op.cc
index 040546f1a6fe1af6d17a5e363a11d27de88d03c2..55f69fb03ad69c94dc4ebb8edd651d84e06a5f46 100644
--- a/paddle/operators/cos_sim_op.cc
+++ b/paddle/operators/cos_sim_op.cc
@@ -23,8 +23,7 @@ class CosSimOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     // notnull check
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of CosSimOp should not be null.");
@@ -97,8 +96,7 @@ class CosSimOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     // notnull check
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) must not be null.");
diff --git a/paddle/operators/cos_sim_op.h b/paddle/operators/cos_sim_op.h
index bcf6f758cae561a2e22f5be6c7a242647ef1c144..68c56f531f941e1b8f66ac7ba6bf318881642c4f 100644
--- a/paddle/operators/cos_sim_op.h
+++ b/paddle/operators/cos_sim_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class CosSimKernel : public framework::OpKernel {
+class CosSimKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     // get Tensor
@@ -67,7 +67,7 @@ class CosSimKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class CosSimGradKernel : public framework::OpKernel {
+class CosSimGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     // get Tensor
diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
index 9b2305e90e85a6f39d4c584a3251b25f67e81aca..ed78e9e3a3a49b7ff0990b8d13cfe2dae594b722 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -24,8 +24,7 @@ class CropOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of CropOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -60,7 +59,8 @@ class CropOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input should be a k-D tensor(k > 0 and k < 7)");
     AddInput("Y",
              "The input used as reference for cropping"
-             " with the same dimension as X. ");
+             " with the same dimension as X. ")
+        .AsDispensable();
     AddOutput("Out",
               "The output of crop op "
               "with the same dimension as X.");
@@ -114,8 +114,7 @@ class CropOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index ac3aeaf41e206c1deb74c7022c36f02c4777a84b..2e72583d68d0acf0e2f5044637dba55de3b57209 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -27,7 +27,7 @@ using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
 using framework::Tensor;
 
 template <typename T>
-class CropKernel : public framework::OpKernel {
+class CropKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -69,7 +69,7 @@ void CropGradFunction(const framework::ExecutionContext& context) {
 }
 
 template <typename Place, typename T>
-class CropGradKernel : public framework::OpKernel {
+class CropGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     size_t rank =
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
index 26fc9b51c44d21d92851030449e116538f937846..a865991db3111d2a7cec9f7731b3c34876864299 100644
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -21,8 +21,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
     PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null.");
@@ -34,27 +33,33 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
                       "The 1st dimension of Input(X) and Input(Label) should "
                       "be equal.");
-    if (ctx->Attrs().Get<bool>("softLabel")) {
+    if (ctx->Attrs().Get<bool>("soft_label")) {
       PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
-                        "If Attr(softLabel) == true, the 2nd dimension of "
+                        "If Attr(soft_label) == true, the 2nd dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "If Attr(soft_label) == false, the 2nd dimension of "
                         "Input(Label) should be 1.");
     }
 
     ctx->SetOutputDim("Y", {x_dims[0], 1});
     ctx->ShareLoD("X", /*->*/ "Y");
   }
+
+ protected:
+  // CrossEntropy's data type is determined solely by Input(X).
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
@@ -76,17 +81,24 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                       "be equal.");
     PADDLE_ENFORCE_EQ(dy_dims[1], 1,
                       "The 2nd dimension of Input(Y@Grad) should be 1.");
-    if (ctx->Attrs().Get<bool>("softLabel")) {
+    if (ctx->Attrs().Get<bool>("soft_label")) {
       PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
-                        "When Attr(softLabel) == true, the 2nd dimension of "
+                        "When Attr(soft_label) == true, the 2nd dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(label_dims[1], 1,
-                        "When Attr(softLabel) == false, the 2nd dimension of "
+                        "When Attr(soft_label) == false, the 2nd dimension of "
                         "Input(Label) should be 1.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
   }
+
+ protected:
+  // CrossEntropy's data type is determined solely by Input(X).
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -103,15 +115,15 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
         "Label",
         "(Tensor, default Tensor<int>), the ground truth which is "
         "a 2-D tensor. "
-        "When softLabel is set to false, `Label` is a Tensor<int> with shape "
+        "When soft_label is set to false, `Label` is a Tensor<int> with shape "
         "[N x 1]. "
-        "When softLabel is set to true, `Label` is a Tensor<float/double> "
+        "When soft_label is set to true, `Label` is a Tensor<float/double> "
         "with shape [N x K].");
     AddOutput("Y",
               "(Tensor, default Tensor<float>), a 2-D tensor "
               "with shape [N x 1]. The cross entropy loss.");
     AddAttr<bool>(
-        "softLabel",
+        "soft_label",
         "(bool, default false), a flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
@@ -121,12 +133,12 @@ CrossEntropy Operator.
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:
-    softLabel = false, Label[i, 0] indicates the class index for sample i:
+    soft_label = false, Label[i, 0] indicates the class index for sample i:
 
                 Y[i] = -log(X[i, Label[i]])
 
 2) Soft-label cross-entropy:
-    softLabel = true, Label[i, j] indicates the soft label of class j
+    soft_label = true, Label[i, j] indicates the soft label of class j
     for sample i:
 
                 Y[i] = \sum_j{-Label[i, j] * log(X[i, j])}
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
index 1cfeb7a53b047541322ac53c5b7249e660039d5c..c492dddb09a41e3731a211b4fa083e57ad780f42 100644
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -18,14 +18,6 @@ namespace paddle {
 namespace operators {
 
 namespace {
-// TODO(qingqing): make zero setting a common function.
-template <typename T>
-__global__ void Zero(T* X, const int N) {
-  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
-       i += blockDim.x * gridDim.x) {
-    X[i] = 0.0;
-  }
-}
 
 template <typename T>
 __global__ void CrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
@@ -53,7 +45,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* dX, const T* dY, const T* X,
 }  // namespace
 
 template <typename T>
-class CrossEntropyOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -64,12 +56,12 @@ class CrossEntropyOpCUDAKernel : public framework::OpKernel {
     y->mutable_data<T>(ctx.GetPlace());
 
     math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        ctx, y, x, label, ctx.Attr<bool>("softLabel"));
+        ctx.device_context(), y, x, label, ctx.Attr<bool>("soft_label"));
   }
 };
 
 template <typename T>
-class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
+class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -91,7 +83,7 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
     int block = 512;
     int grid = (batch_size * class_num + block - 1) / block;
 
-    if (ctx.Attr<bool>("softLabel")) {
+    if (ctx.Attr<bool>("soft_label")) {
       auto* label_data = label->data<T>();
       SoftCrossEntropyGradientKernel<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
@@ -99,11 +91,8 @@ class CrossEntropyGradientOpCUDAKernel : public framework::OpKernel {
                               .stream()>>>(dx_data, dy_data, x_data, label_data,
                                            batch_size, class_num);
     } else {
-      Zero<T><<<grid, block, 0,
-                reinterpret_cast<const platform::CUDADeviceContext&>(
-                    ctx.device_context())
-                    .stream()>>>(dx_data, batch_size * class_num);
-
+      math::SetConstant<platform::GPUPlace, T> functor;
+      functor(ctx.device_context(), dx, 0);
       auto* label_data = label->data<int>();
       grid = (batch_size + block - 1) / block;
       CrossEntropyGradientKernel<T><<<
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
index 1f67461d3fadb1a979832ad049d4e0098256b834..42f282103b5609e3c987fc4a83113f86532f74d6 100644
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/math/cross_entropy.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -26,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-class CrossEntropyOpKernel : public framework::OpKernel {
+class CrossEntropyOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -37,12 +38,12 @@ class CrossEntropyOpKernel : public framework::OpKernel {
     y->mutable_data<T>(ctx.GetPlace());
 
     math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        ctx, y, x, labels, ctx.Attr<bool>("softLabel"));
+        ctx.device_context(), y, x, labels, ctx.Attr<bool>("soft_label"));
   }
 };
 
 template <typename T>
-class CrossEntropyGradientOpKernel : public framework::OpKernel {
+class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -54,7 +55,7 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
     T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
 
     int class_num = x->dims()[1];
-    if (ctx.Attr<bool>("softLabel")) {
+    if (ctx.Attr<bool>("soft_label")) {
       auto x_mat = EigenMatrix<T>::From(*x);
       auto dy_mat = EigenMatrix<T>::From(*dy);
       auto lbl_mat = EigenMatrix<T>::From(*label);
@@ -69,8 +70,9 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel {
       const T* x_data = x->data<T>();
       const int* label_data = label->data<int>();
 
-      // TODO(qingqing): make zero setting a common function.
-      memset(dx_data, 0, sizeof(T) * batch_size * class_num);
+      math::SetConstant<platform::CPUPlace, T> functor;
+      functor(ctx.device_context(), dx, 0);
 
       for (int i = 0; i < batch_size; ++i) {
-        PADDLE_ASSERT(label_data[i] >= 0 || label_data[i] < class_num);
+        // Both bounds must hold for label_data[i] to be a valid class index.
+        PADDLE_ASSERT(label_data[i] >= 0 && label_data[i] < class_num);
diff --git a/paddle/operators/decayed_adagrad_op.cc b/paddle/operators/decayed_adagrad_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..17b394aa07cb0c7ca6e085b61590ff052221b22c
--- /dev/null
+++ b/paddle/operators/decayed_adagrad_op.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/decayed_adagrad_op.h"
+
+namespace paddle {
+namespace operators {
+
+class DecayedAdagradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput("LearningRate"),
+        "Input(LearningRate) of DecayedAdagradOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of DecayedAdagradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of DecayedAdagradOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "LearningRate should have one element");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Grad"),
+                      "Param and Grad input of DecayedAdagradOp should have "
+                      "the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dims, ctx->GetInputDim("Moment"),
+                      "Param and Moment input of DecayedAdagradOp should have "
+                      "the same dimension.");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+    ctx->SetOutputDim("MomentOut", param_dims);
+  }
+};
+
+class DecayedAdagradOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  DecayedAdagradOpMaker(framework::OpProto *proto,
+                        framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("Moment", "(Tensor) Second moment");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+    // Updated counterparts of the "Param" and "Moment" inputs.
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+    AddOutput("MomentOut", "(Tensor) Output second moment");
+    // Both attributes are float, so their defaults use float literals.
+    AddAttr<float>("decay",
+                   "(float, default 0.95) "
+                   "Discounting factor for coming gradient")
+        .SetDefault(0.95f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) "
+                   "Constant for numerical stability")
+        .SetDefault(1.0e-6f);
+    AddComment(R"DOC(
+
+Decayed Adagrad
+
+moment_out = decay * moment + (1 - decay) * grad * grad
+param_out = param - learning_rate * grad / (sqrt(moment_out) + epsilon)
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(decayed_adagrad, ops::DecayedAdagradOp,
+                             ops::DecayedAdagradOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    decayed_adagrad,
+    ops::DecayedAdagradOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/decayed_adagrad_op.cu b/paddle/operators/decayed_adagrad_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6fce77fe4ec6b76cb7b0259aab6a3d55d2edb36c
--- /dev/null
+++ b/paddle/operators/decayed_adagrad_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/decayed_adagrad_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    decayed_adagrad,
+    ops::DecayedAdagradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/decayed_adagrad_op.h b/paddle/operators/decayed_adagrad_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0fe0fc5acd66c9824a864618b69097c5c063ea3f
--- /dev/null
+++ b/paddle/operators/decayed_adagrad_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class DecayedAdagradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out_tensor = ctx.Output<framework::Tensor>("ParamOut");
+    auto moment_out_tensor = ctx.Output<framework::Tensor>("MomentOut");
+
+    param_out_tensor->mutable_data<T>(ctx.GetPlace());
+    moment_out_tensor->mutable_data<T>(ctx.GetPlace());
+
+    float decay = ctx.Attr<float>("decay");
+    float epsilon = ctx.Attr<float>("epsilon");
+
+    auto param = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Param"));
+    auto grad = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Grad"));
+    auto moment = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("Moment"));
+    auto lr = framework::EigenVector<T>::Flatten(
+        *ctx.Input<framework::Tensor>("LearningRate"));
+
+    auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
+    auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    moment_out.device(place) = decay * moment + (1 - decay) * grad * grad;
+    Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
+    param_out.device(place) =
+        param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/strided_memcpy.h b/paddle/operators/detail/strided_memcpy.h
index b165224b37fb091c094a823179256c3dd40a37c9..068c82f399316a1587d7322d8dab75823656800e 100644
--- a/paddle/operators/detail/strided_memcpy.h
+++ b/paddle/operators/detail/strided_memcpy.h
@@ -34,7 +34,7 @@ struct StridedMemcpyFunctor<T, 1> {
       auto& cpu_place = boost::get<platform::CPUPlace>(place);
       memory::Copy(cpu_place, dst, cpu_place, src, sizeof(T) * dst_dim.head);
     } else {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
       auto& gpu_place = boost::get<platform::GPUPlace>(place);
       auto& cuda_ctx =
           reinterpret_cast<const platform::CUDADeviceContext&>(dev_ctx);
diff --git a/paddle/operators/dropout_op.cc b/paddle/operators/dropout_op.cc
index a669b5cf00f1f4ad351486e2977bf8a76aa5bf62..29858c90832bf116d07e43825eda5775a94beafb 100644
--- a/paddle/operators/dropout_op.cc
+++ b/paddle/operators/dropout_op.cc
@@ -23,8 +23,7 @@ class DropoutOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE_GE(ctx->Attrs().Get<float>("dropout_prob"), 0);
     PADDLE_ENFORCE_LE(ctx->Attrs().Get<float>("dropout_prob"), 1);
@@ -69,8 +68,7 @@ class DropoutOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE_EQ(ctx->Attrs().Get<bool>("is_training"), 1,
                       "GradOp is only callable when is_training is true");
 
diff --git a/paddle/operators/dropout_op.cu b/paddle/operators/dropout_op.cu
index a04e4a22cc09d4e8106a528e490ccf8e90681c08..30c769000f2b98c69eaa78a4c139630dd0956386 100644
--- a/paddle/operators/dropout_op.cu
+++ b/paddle/operators/dropout_op.cu
@@ -47,7 +47,7 @@ struct MaskGenerator {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename Place, typename T, typename AttrType>
-class GPUDropoutKernel : public framework::OpKernel {
+class GPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
diff --git a/paddle/operators/dropout_op.h b/paddle/operators/dropout_op.h
index d57f64afcb3558aeea6aed23fae06866e9af874a..745525fe81dadb22cbb64d66203f5a75608d3718 100644
--- a/paddle/operators/dropout_op.h
+++ b/paddle/operators/dropout_op.h
@@ -26,7 +26,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T, typename AttrType>
-class CPUDropoutKernel : public framework::OpKernel {
+class CPUDropoutKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -62,7 +62,7 @@ class CPUDropoutKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class DropoutGradKernel : public framework::OpKernel {
+class DropoutGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(context.Attr<bool>("is_training"),
diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0b06ac1dc305bc899f9abaafcc980a6150ecda9
--- /dev/null
+++ b/paddle/operators/dynamic_recurrent_op.cc
@@ -0,0 +1,412 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/dynamic_recurrent_op.h"
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Scope;
+using framework::TensorArray;
+using framework::LoDTensor;
+using framework::Variable;
+using framework::OperatorBase;
+using framework::DySeqMetaBatch;
+
+namespace detail {
+
+// Calls scope.Var(name) for each entry of `var_names`, in order; the returned
+// pointers are discarded.
+inline void CreateVariables(Scope& scope,
+                            const std::vector<std::string>& var_names) {
+  for (size_t i = 0; i < var_names.size(); ++i) scope.Var(var_names[i]);
+}
+
+/*
+ * Copy row `metas[i].ori_idx` of `boot_state` into row `i` of `tensor`, for
+ * every entry of `metas`. The sequence inputs are reordered when they are
+ * split, so the boot states have to be put into the same order.
+ *
+ * NOTE The rows are copied rather than shared: `boot_state` is an input of the
+ * RNN op, and the op should not change the content of an input variable. The
+ * copy always goes through a CPUDeviceContext (see the TODO below).
+ */
+inline void ReorderInitialState(const DySeqMetaBatch& metas,
+                                const LoDTensor& boot_state, LoDTensor* tensor,
+                                const platform::Place& dst_place) {
+  for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) {
+    auto slice = tensor->Slice(seq_id, seq_id + 1);
+    auto boot_slice =
+        boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1);
+    // TODO(superjom) pass in device context as an argument
+    slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext());
+  }
+}
+
+inline void RestoreInitialState(const DySeqMetaBatch& metas,
+                                const LoDTensor& tensor, LoDTensor* boot_state,
+                                const platform::Place& dst_place) {
+  // Row i of `tensor` is copied into row metas[i].ori_idx of `boot_state`.
+  for (size_t i = 0; i < metas.size(); ++i) {
+    auto src_row = tensor.Slice(i, i + 1);
+    auto dst_row = boot_state->Slice(metas[i].ori_idx, metas[i].ori_idx + 1);
+    dst_row.CopyFrom(src_row, dst_place, platform::CPUDeviceContext());
+  }
+}
+
+}  // namespace detail
+
+// Forward pass; uses the argument names in kArgNames[kForward].
+template <>
+void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kForward>(
+    const framework::Scope& scope, const framework::OperatorBase& op,
+    const platform::DeviceContext& dev_ctx) {
+  SetComputeMode(ComputeMode::kForward);
+  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
+  SplitInputs();       // inputs -> per-step TensorArrays, sets num_steps
+  CreateScopes();      // ensure one scope per step holding the step variables
+  WriteStepInputs();   // share each step's input slice into its scope
+  InitStates();        // allocate states and link them to their pre-states
+  WriteStepOutputs();  // register a TensorArray per output
+  RunSteps();          // run the step unit on every step
+  ConcatOutputs();     // pack the per-step outputs into the op outputs
+}
+
+// Backward pass; uses the "@GRAD" argument names in kArgNames[kBackward].
+template <>
+void RNNAlgorithm::Run<RNNAlgorithm::ComputeMode::kBackward>(
+    const framework::Scope& scope, const framework::OperatorBase& op,
+    const platform::DeviceContext& dev_ctx) {
+  SetComputeMode(ComputeMode::kBackward);
+  cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_);
+  SplitInputs();  // note: unlike forward, CreateScopes() is not called here
+  WriteStepInputs();
+  InitStates();
+  WriteStepOutputs();
+  RunSteps();  // visits the steps in reverse order in this mode
+  // copy boot-states' gradients back.
+  for (const auto& state : arg_.states) {
+    ExportInitialStateGradient(state);
+  }
+
+  ConcatOutputs();
+}
+
+void RNNAlgorithm::SplitInputs() {
+  // TODO(superjom) make level a config
+  // TODO(superjom) check all the inputs has the same LoD
+  int level = 0;
+  // The cache outlives a single Run(); forget the previous run's step count so
+  // that a batch with a different number of steps is not rejected below.
+  cache_.num_steps = 0;
+  for (const auto& item : cache_.inputs) {
+    const auto& tensor = item.second->Get<LoDTensor>();
+    TensorArray& ta = step_inputs_[item.first];
+    dy_seq_metas_[item.first] =
+        ta.Unpack(tensor, level, true /*length_descend*/);
+    if (cache_.num_steps) {
+      PADDLE_ENFORCE_EQ(ta.size(), cache_.num_steps,
+                        "inputs should have the same steps");
+    } else {
+      cache_.num_steps = ta.size();
+    }
+  }
+}
+
+void RNNAlgorithm::WriteStepInputs() {
+  // Make the same-named variable of every step scope share the data of that
+  // step's slice of each cached input.
+  for (const auto& input : cache_.inputs) {
+    auto found = step_inputs_.find(input.first);
+    PADDLE_ENFORCE(found != step_inputs_.end(),
+                   "step_inputs_ not compatible with memory set");
+    TensorArray& steps = found->second;
+    for (size_t step = 0; step < steps.size(); step++) {
+      auto slice = steps.Read(step);
+      auto& step_scope = cache_.GetScope(step);
+      Variable* var = step_scope.FindVar(input.first);
+      if (var == nullptr) var = step_scope.Var(input.first);
+      var->GetMutable<LoDTensor>()->ShareDataWith(slice);
+    }
+  }
+}
+
+void RNNAlgorithm::WriteStepOutputs() {
+  // add an empty TensorArray for every cached output that has none yet
+  for (const auto& item : cache_.outputs) {
+    step_outputs_.emplace(item.first, TensorArray());
+  }
+  PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL);
+}
+
+void RNNAlgorithm::CreateScopes() {
+  PADDLE_ENFORCE_GT(cache_.num_steps, 0UL);
+  // Grow the step-scope list up to num_steps. The list is stored in a scope
+  // variable and may already be longer than this run needs, so compare sizes:
+  // subtracting them as size_t would wrap around and never stop creating.
+  while (cache_.scopes->size() < cache_.num_steps) {
+    cache_.scopes->emplace_back(&cache_.scope->NewScope());
+  }
+
+  // collect the names of the variables every step scope has to contain
+  PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first");
+  std::vector<std::string> states;
+  std::vector<std::string> ex_states;
+  std::vector<std::string> step_unit_outputs;
+  std::transform(arg_.states.begin(), arg_.states.end(),
+                 std::back_inserter(states),
+                 [](const rnn::StateAttr& m) { return m.var; });
+  std::transform(arg_.states.begin(), arg_.states.end(),
+                 std::back_inserter(ex_states),
+                 [](const rnn::StateAttr& m) { return m.pre_var; });
+  for (const auto& item : step_unit_->Outputs()) {
+    step_unit_outputs.insert(step_unit_outputs.end(), item.second.begin(),
+                             item.second.end());
+  }
+
+  for (size_t step = 0; step < cache_.num_steps; step++) {
+    auto& scope = cache_.GetScope(step);
+    detail::CreateVariables(scope, arg_.inlinks);
+    detail::CreateVariables(scope, arg_.outlinks);
+    detail::CreateVariables(scope, states);
+    detail::CreateVariables(scope, ex_states);
+    detail::CreateVariables(scope, step_unit_outputs);
+  }
+}
+
+void RNNAlgorithm::ConcatOutputs() {
+  // TODO(superjom) transform this to a config
+  int level = 0;
+  for (size_t step = 0; step < cache_.num_steps; step++) {
+    auto& scope = cache_.GetScope(step);
+    for (auto& item : step_outputs_) {
+      auto* var = scope.FindVar(item.first);
+      PADDLE_ENFORCE_NOT_NULL(var);
+      auto* tensor = var->GetMutable<LoDTensor>();
+      tensor->mutable_data<value_type>(platform::CPUPlace());
+      item.second.WriteShared(step, *tensor);
+    }
+  }
+  // the inputs' lods should be the same, so randomly get one lod.
+  const auto& some_lod =
+      cache_.scope->FindVar(arg_.inlinks.front())->Get<LoDTensor>().lod();
+  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
+  // pack each output's steps; GetMutable is already non-const, so no cast
+  for (auto& item : step_outputs_) {
+    auto tensor = item.second.Pack(level, some_meta, some_lod);
+    cache_.outputs[item.first]->GetMutable<LoDTensor>()->ShareDataWith(tensor);
+  }
+}
+
+void RNNAlgorithm::RunSteps() {
+  // Forward mode runs the step unit on steps 0..num_steps-1; backward mode
+  // runs it on the same steps in reverse order. Each step uses its own scope.
+  if (!IsBackward()) {
+    for (size_t step = 0; step < cache_.num_steps; step++) {
+      step_unit_->Run(cache_.GetScope(step), *cache_.dev_ctx);
+    }
+    return;
+  }
+  // call stepnet in all the time steps reversely
+  for (int step = cache_.num_steps - 1; step >= 0; step--) {
+    step_unit_->Run(cache_.GetScope(step), *cache_.dev_ctx);
+  }
+}
+
+void RNNAlgorithm::InitStates() {
+  for (size_t step = 0; step < cache_.num_steps; step++) {  // ascending steps
+    for (const auto& state : arg_.states) {
+      CreateState(state, step);  // allocate this step's state tensor
+      LinkState(state, step);    // reads the state of step - 1 when step > 0
+    }
+  }
+}
+
+void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) {
+  // The step's state takes the boot state's shape, except that dimension 0
+  // becomes dimension 0 of this step's slice of the first inlink.
+  auto& step_scope = cache_.GetScope(step);
+  LoDTensor& step_state = *cache_.GetTensor(step_scope, state_attr.var);
+  LoDTensor& boot = *cache_.GetTensor(*cache_.scope, state_attr.boot_var);
+  auto& first_input = step_inputs_[arg_.inlinks.front()];
+  size_t batch_rows = first_input.Read(step).dims()[0];
+  auto shape = boot.dims();
+  shape[0] = batch_rows;
+  step_state.Resize(shape);
+  step_state.mutable_data<value_type>(platform::CPUPlace());
+  states_[state_attr.var].WriteShared(step, step_state);
+}
+
+void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) {
+  auto& scope = cache_.GetScope(step);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+
+  // Step 0 in forward mode has no previous step: its pre-state is initialized
+  // from the boot-state instead. Every other case shares memory with the state
+  // of step - 1.
+  // NOTE(review): in backward mode step 0 also takes the else branch, where
+  // `step - 1` wraps around (size_t) before it reaches GetScope -- confirm.
+  if (step == 0 && IsForward()) {
+    LinkInitialState(state);
+  } else {
+    size_t num_instances =
+        step_inputs_[arg_.inlinks.front()].Read(step).dims()[0];
+    auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var);
+    // shrink and share from previous state
+    auto shrinked_pre_state = pre_state->Slice(0, num_instances);
+    state_pre.ShareDataWith(shrinked_pre_state);
+  }
+}
+
+void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) {
+  // all step inputs are expected to share one dyseq meta; take the first's.
+  const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()];
+  auto& scope = cache_.GetScope(0);
+  auto& state_pre = *cache_.GetTensor(scope, state.pre_var);
+  auto* boot_state = cache_.GetTensor(*cache_.scope, state.boot_var);
+  // value_type (not a literal `float`), like every other allocation here
+  boot_state->mutable_data<value_type>(platform::CPUPlace());
+  // allocate the step-0 pre-state and fill it with the reordered boot-state
+  state_pre.Resize(boot_state->dims());
+  state_pre.mutable_data<value_type>(platform::CPUPlace());
+  detail::ReorderInitialState(some_meta, *boot_state, &state_pre,
+                              boot_state->place());
+}
+
+void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) {
+  // Copy the step-0 pre-state back into the parent scope's boot variable,
+  // undoing the reordering with the dyseq meta of the first inlink (all step
+  // inputs are expected to share the same meta).
+  const auto& meta = dy_seq_metas_[arg_.inlinks.front()];
+  auto& first_scope = cache_.GetScope(0);
+
+  auto& src = *cache_.GetTensor(first_scope, state.pre_var);
+  auto& dst = *cache_.GetTensor(*cache_.scope, state.boot_var);
+  dst.Resize(src.dims());
+  detail::RestoreInitialState(meta, src, &dst, dst.place());
+}
+
+void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name,
+                                  const paddle::framework::OperatorBase& op,
+                                  const paddle::framework::Scope& scope,
+                                  platform::DeviceContext const* dev_ctx,
+                                  rnn::Argument* arg) {
+  this->scope = &scope;
+  InitArgument(name, op, arg);          // delegates to rnn::InitArgument
+  CacheScopes(scope, *arg);             // locate the step-scope vector
+  CacheInlinks(scope, arg->inlinks);    // name -> input variable
+  CacheOutlinks(scope, arg->outlinks);  // name -> output variable
+  this->dev_ctx = dev_ctx;  // note: num_steps is left untouched here
+}
+
+// Thin wrapper around rnn::InitArgument; is_grad is always passed as false.
+void RNNAlgorithm::ArgCache::InitArgument(
+    const rnn::ArgumentName& name, const OperatorBase& op, rnn::Argument* arg) {
+  rnn::InitArgument(name, arg, op, false /*is_grad*/);
+}
+
+void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope,
+                                         const rnn::Argument& arg) {
+  // the step-scope vector is held by the variable named arg.step_scopes
+  Variable* holder = scope.FindVar(arg.step_scopes);
+  PADDLE_ENFORCE(holder != nullptr,
+                 "the step_scopes output argument [%s] should be created first "
+                 "by framework.", arg.step_scopes);
+  this->scopes = holder->GetMutable<std::vector<Scope*>>();
+}
+
+void RNNAlgorithm::ArgCache::CacheInlinks(
+    const Scope& scope, const std::vector<std::string>& names) {
+  // name -> variable (GetVariable enforces it exists); no per-name copy
+  for (const auto& name : names) {
+    inputs[name] = GetVariable(scope, name);
+  }
+}
+
+void RNNAlgorithm::ArgCache::CacheOutlinks(
+    const Scope& scope, const std::vector<std::string>& names) {
+  // name -> variable (GetVariable enforces it exists); no per-name copy
+  for (const auto& name : names) {
+    outputs[name] = GetVariable(scope, name);
+  }
+}
+
+Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope,
+                                              const std::string& name) {
+  Variable* found = scope.FindVar(name);  // enforced to be non-null below
+  PADDLE_ENFORCE_NOT_NULL(found, "variable [%s] not exist in scope", name);
+  return found;
+}
+
+LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope,
+                                             const std::string& name) {
+  // GetVariable() enforces that `name` can be found from `scope`.
+  return GetVariable(scope, name)->GetMutable<LoDTensor>();
+}
+
+const std::array<rnn::ArgumentName, 2> RNNAlgorithm::kArgNames{
+    {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs",
+                       "states", "ex_states", "initial_states"},  // kForward
+     rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD",
+                       "inputs@GRAD", "states", "ex_states",
+                       "initial_states@GRAD"}}};  // kBackward
+
+void DynamicRecurrentOp::Run(const framework::Scope& scope,
+                             const platform::DeviceContext& dev_ctx) const {
+  // *this binds to const OperatorBase& implicitly; no dynamic_cast needed.
+  rnn.Run<RNNAlgorithm::ComputeMode::kForward>(scope, *this, dev_ctx);
+}
+
+void DynamicRecurrentGradientOp::Run(
+    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
+  // *this binds to const OperatorBase& implicitly; no dynamic_cast needed.
+  rnn.Run<RNNAlgorithm::ComputeMode::kBackward>(scope, *this, dev_ctx);
+}
+
+class DynamicRecurrentOpProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto,
+                                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    const auto& name =
+        RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward];
+    // inputs and outputs stored in proto, named by the forward ArgumentName
+    AddInput(name.inlinks,
+             "the inputs that need to be segmented for each step.")
+        .AsDuplicable();
+    AddInput(name.initial_states, "variables to initialize states.")
+        .AsDuplicable();
+
+    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
+        .AsDuplicable();
+    AddOutput(name.step_scopes, "step scopes");
+
+    // Attributes stored in AttributeMap; both are lists of variable names
+    AddAttr<std::vector<std::string>>(name.ex_states, "names of ex_states");
+    AddAttr<std::vector<std::string>>(name.states, "names of states");
+
+    AddComment("This is a RNN operator for varience-length sequences.");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp,
+            paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker,
+            dynamic_recurrent_grad,  // presumably the gradient op's type name
+            paddle::operators::DynamicRecurrentGradientOp);
diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5b0548c3a44c9f58838ecc567ee41a587883c26a
--- /dev/null
+++ b/paddle/operators/dynamic_recurrent_op.h
@@ -0,0 +1,233 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#ifdef PADDLE_WITH_TESTING
+#include "gtest/gtest.h"
+#endif
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor_array.h"
+#include "paddle/framework/variable.h"
+#include "paddle/operators/rnn/recurrent_op_utils.h"
+
+namespace paddle {
+namespace operators {
+
+class RNNAlgorithm {
+ public:
+  enum ComputeMode { kForward = 0, kBackward = 1 };
+  static const std::array<rnn::ArgumentName, 2> kArgNames;  // by ComputeMode
+  using value_type = float;
+
+  /*
+   * Different `Run` method for forward and backward, `_` is just for template
+   * specialization.
+   */
+  template <ComputeMode _>
+  void Run(const framework::Scope& scope, const framework::OperatorBase& op,
+           const platform::DeviceContext& dev_ctx);
+  /*
+   * Split the inputs(LoDTensors) to segments for each time step.
+   */
+  void SplitInputs();
+
+  /*
+   * Create step-scopes to store temporary outputs in each time steps.
+   */
+  void CreateScopes();
+
+  /*
+   * Link TensorArray steps to the corresponding variables located in
+   * step-scopes.
+   */
+  void WriteStepInputs();
+
+  /*
+   * Create a TensorArray for each output; ConcatOutputs() fills them later.
+   */
+  void WriteStepOutputs();
+
+  /*
+   * Initialize the states. Each state has a corresponding pre-state, which
+   * shares memory with the state of the previous time step. In forward mode
+   * the pre-state of the first time step is initialized from the boot-state
+   * tensor in the parent scope.
+   */
+  void InitStates();
+
+  /*
+   * Create state variables for each time step.
+   */
+  void CreateState(const rnn::StateAttr& state, size_t step);
+
+  /*
+   * Link pre-state variable in current scope to the state variable in the
+   * previous time step (scope) by reference.
+   */
+  void LinkState(const rnn::StateAttr& state, size_t step);
+
+  /*
+   * Initialize the pre-state of the first time step with a copy of the
+   * `boot-state` in parent's scope, reordered to match the split inputs.
+   */
+  void LinkInitialState(const rnn::StateAttr& state);
+
+  /*
+   * Copy the gradient from `pre-state` in the first step-scope to the
+   * `boot-state` in parent's scope.
+   */
+  void ExportInitialStateGradient(const rnn::StateAttr& state);
+
+  /*
+   * Run the step unit on every time step (in reverse order when backward).
+   */
+  void RunSteps();
+
+  /*
+   * Concatenate outputs in each time step and generate a LoDTensor.
+   */
+  void ConcatOutputs();
+
+  void SetComputeMode(ComputeMode mode) { mode_ = mode; }
+  bool IsForward() const { return mode_ == ComputeMode::kForward; }
+  bool IsBackward() const { return mode_ == ComputeMode::kBackward; }
+
+  /*
+   * Set a step unit that is created according to a RecurrentOp's step unit.
+   */
+  void SetStepUnit(std::unique_ptr<framework::OperatorBase> step_unit) {
+    PADDLE_ENFORCE_NOT_NULL(step_unit);
+    step_unit_ = std::move(step_unit);
+  }
+  const framework::OperatorBase& GetStepUnit() const { return *step_unit_; }
+
+  const framework::TensorArray& state(const std::string& name) const {
+    auto it = states_.find(name);
+    PADDLE_ENFORCE(it != states_.end());
+    return it->second;
+  }
+  const framework::TensorArray& step_input(const std::string& name) const {
+    auto it = step_inputs_.find(name);
+    PADDLE_ENFORCE(it != step_inputs_.end());
+    return it->second;
+  }
+  const framework::TensorArray& step_output(const std::string& name) const {
+    auto it = step_outputs_.find(name);
+    PADDLE_ENFORCE(it != step_outputs_.end());
+    return it->second;
+  }
+
+ protected:
+  struct ArgCache {
+    framework::Scope const* scope;
+    std::vector<framework::Scope*>* scopes;
+    std::map<std::string, framework::Variable*> inputs;
+    std::map<std::string, framework::Variable*> outputs;
+    platform::DeviceContext const* dev_ctx;
+
+    size_t num_steps{0};
+
+    void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op,
+              const framework::Scope& scope,
+              platform::DeviceContext const* dev_ctx, rnn::Argument* arg);
+
+    framework::Scope& GetScope(size_t index) {
+      PADDLE_ENFORCE_LT(index, num_steps);
+      return *scopes->at(index);
+    }
+
+    framework::LoDTensor* GetTensor(const framework::Scope& scope,
+                                    const std::string& name);
+
+   private:
+    void InitArgument(const rnn::ArgumentName& name,
+                      const framework::OperatorBase& op, rnn::Argument* arg);
+    void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg);
+    void CacheInlinks(const framework::Scope& scope,
+                      const std::vector<std::string>& names);
+    void CacheOutlinks(const framework::Scope& scope,
+                       const std::vector<std::string>& names);
+    framework::Variable* GetVariable(const framework::Scope& scope,
+                                     const std::string& name);
+  };
+
+ private:
+  std::unique_ptr<framework::OperatorBase> step_unit_;
+  std::map<std::string, framework::TensorArray> states_;
+  std::map<std::string, framework::TensorArray> step_inputs_;
+  std::map<std::string, framework::TensorArray> step_outputs_;
+  std::map<std::string, std::vector<framework::DySeqMeta>> dy_seq_metas_;
+  rnn::Argument arg_;
+  ArgCache cache_;
+  ComputeMode mode_{ComputeMode::kForward};
+
+#ifdef PADDLE_WITH_TESTING
+  // test forward
+  friend class RNNAlgorithmTestHelper;
+  FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache);
+  FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes);
+  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs);
+  FRIEND_TEST(RNNAlgorithmTestHelper, InitStates);
+  FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs);
+// TODO(superjom) test backward
+#endif
+};
+
+class DynamicRecurrentOp : public framework::OperatorBase {
+ public:
+  DynamicRecurrentOp(const std::string& type,
+                     const framework::VariableNameMap& inputs,
+                     const framework::VariableNameMap& outputs,
+                     const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  DynamicRecurrentOp(const DynamicRecurrentOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");  // copying this op is not supported
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override;
+
+  mutable RNNAlgorithm rnn;  // mutable so that the const Run() can drive it
+};
+
+class DynamicRecurrentGradientOp : public framework::OperatorBase {
+ public:
+  DynamicRecurrentGradientOp(const std::string& type,
+                             const framework::VariableNameMap& inputs,
+                             const framework::VariableNameMap& outputs,
+                             const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");  // copying this op is not supported
+  }
+
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override;
+
+  mutable RNNAlgorithm rnn;  // mutable so that the const Run() can drive it
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fff63efb24c70b7e864e2d5b011a22883c13dede
--- /dev/null
+++ b/paddle/operators/dynamic_recurrent_op_test.cc
@@ -0,0 +1,217 @@
+#include "paddle/operators/dynamic_recurrent_op.h"
+
+#include <gtest/gtest.h>
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_desc.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/net_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Scope;
+using framework::TensorArray;
+using framework::LoDTensor;
+using framework::Variable;
+
+class TestOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+  DEFINE_OP_CLONE_METHOD(TestOp);
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {}  // no-op
+};
+
+// Sets the parameter name of `var` to `param_name`, then appends every entry
+// of `arguments` to it in order.
+void OpDescNewVar(const std::string& param_name,
+                  std::initializer_list<const char*> arguments,
+                  paddle::framework::OpDesc::Var* var) {
+  var->set_parameter(param_name);
+  for (const char* argument : arguments) var->add_arguments(argument);
+}
+
+// Calls scope.Var(name), resizes its LoDTensor to `dims` and allocates float
+// storage for it on `place`; returns the tensor.
+LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims,
+                     const platform::Place& place) {
+  LoDTensor* created = scope.Var(name)->GetMutable<LoDTensor>();
+  created->Resize(dims);
+  created->mutable_data<float>(place);
+  return created;
+}
+
+class RNNAlgorithmTestHelper : public ::testing::Test {
+ protected:
+  const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0];
+
+  virtual void SetUp() override {
+    CreateGlobalVariables();
+
+    auto op_desc = CreateOpDesc();
+    op = paddle::framework::OpRegistry::CreateOp(op_desc, nullptr);
+    dop = &(dynamic_cast<DynamicRecurrentOp*>(op.get())->rnn);
+    InitCacheManually();
+    InitStepNet();
+  }
+
+  framework::OpDesc CreateOpDesc() {
+    // create op
+    paddle::framework::OpDesc op_desc;
+    op_desc.set_type("dynamic_recurrent");
+
+    OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs());
+    OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs());
+    OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs());
+    OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs());
+
+    // set pre-states
+    auto pre_memories = op_desc.mutable_attrs()->Add();
+    pre_memories->set_name(argname.ex_states);
+    pre_memories->set_type(paddle::framework::AttrType::STRINGS);
+    auto pre_memories_item = pre_memories->add_strings();
+    *pre_memories_item = "mem@pre";
+
+    // set states
+    auto memories = op_desc.mutable_attrs()->Add();
+    memories->set_name(argname.states);
+    memories->set_type(paddle::framework::AttrType::STRINGS);
+    auto memories_item = memories->add_strings();
+    *memories_item = "mem";
+    return op_desc;
+  }
+
+  void CreateGlobalVariables() {
+    platform::CPUPlace place;
+    scope.Var("step_scopes");
+    CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place);
+    CreateVar(scope, "out0", framework::make_ddim({10, 20}), place);
+    auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place);
+    // 10 instances with 4 sentences, length is 4, 3, 2, 1 respectively.
+    framework::LoD in0_lod(1);
+    for (int x : std::vector<int>{0, 4, 7, 9, 10}) {
+      in0_lod[0].push_back(x);
+    }
+    in0->set_lod(in0_lod);
+    in0->Resize(framework::make_ddim({10, 8}));
+    // set the content, each sentence content is seqid.batchid (seqid starts
+    // from 0); iterate the offsets of level 0, not the number of LoD levels
+    int start = 0;
+    for (size_t seqid = 0; seqid < in0_lod[0].size() - 1; seqid++) {
+      for (size_t batchid = 0;
+           batchid < in0_lod[0][seqid + 1] - in0_lod[0][seqid]; batchid++) {
+        float v = seqid + batchid * 0.1;
+
+        for (size_t dim = 0; dim < 8; dim++) {
+          in0->data<float>()[start * 8 + dim] = v;
+        }
+        start++;
+      }
+    }
+  }
+
+  void InitCacheManually() {
+    dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context,
+                     &dop->arg_);
+  }
+
+  void InitStepNet() {
+    std::unique_ptr<framework::OperatorBase> stepnet{new NetOp};
+    dynamic_cast<NetOp*>(stepnet.get())
+        ->AppendOp(std::unique_ptr<TestOp>(new TestOp(
+            "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}},
+            {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {})));
+    dop->SetStepUnit(std::move(stepnet));
+  }
+
+ protected:
+  RNNAlgorithm* dop;  // points at op's `rnn` member; set in SetUp()
+  std::unique_ptr<framework::OperatorBase> op;
+  paddle::platform::CPUDeviceContext device_context;
+  paddle::framework::Scope scope;
+};
+
+TEST_F(RNNAlgorithmTestHelper, CreateCache) {
+  // SetUp() initialized the cache from an op with one inlink and one outlink.
+  ASSERT_EQ(dop->arg_.inlinks.size(), 1UL);
+  ASSERT_EQ(dop->arg_.outlinks.size(), 1UL);
+}
+
+TEST_F(RNNAlgorithmTestHelper, SplitInputs) {
+  dop->SplitInputs();
+  auto& in0_ta = dop->step_inputs_["in0"];
+  ASSERT_EQ(in0_ta.size(), 4UL);
+
+  // The fixture's LoD describes sequences of length 4, 3, 2 and 1, so the
+  // batches of steps 0..3 are expected to have 4, 3, 2 and 1 rows
+  // respectively.
+  const int expected_rows[] = {4, 3, 2, 1};
+  for (int step = 0; step < 4; step++) {
+    const auto& batch = in0_ta.Read(step);
+    EXPECT_EQ(batch.dims()[0], expected_rows[step]);
+  }
+}
+
+TEST_F(RNNAlgorithmTestHelper, CreateScopes) {
+  dop->SplitInputs();
+  dop->CreateScopes();
+  ASSERT_EQ(dop->cache_.num_steps, 4UL);  // longest sequence has 4 items
+  ASSERT_EQ(dop->cache_.scopes->size(), 4UL);  // one scope per step
+}
+
+TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) {
+  dop->SplitInputs();
+  dop->CreateScopes();
+  dop->WriteStepInputs();
+
+  // every step scope must be able to find the inlink "in0"
+  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
+    auto& step_scope = dop->cache_.GetScope(step);
+    auto* in0 = step_scope.FindVar("in0");
+    ASSERT_TRUE(in0 != nullptr);
+  }
+}
+
+TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) {
+  dop->SplitInputs();
+  dop->CreateScopes();
+  dop->WriteStepInputs();
+  dop->WriteStepOutputs();
+
+  // every step scope must be able to find the outlink "out0"
+  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
+    auto& step_scope = dop->cache_.GetScope(step);
+    auto* out0 = step_scope.FindVar("out0");
+    ASSERT_TRUE(out0 != nullptr);
+  }
+}
+
+TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) {
+  // Intentionally empty: this check is left to the Python unit tests.
+}
+
+TEST_F(RNNAlgorithmTestHelper, InitStates) {
+  dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward);
+  dop->SplitInputs();
+  dop->CreateScopes();
+  dop->WriteStepInputs();
+  dop->WriteStepOutputs();
+  dop->InitStates();
+
+  // After InitStates() the following three variables must be reachable from
+  // every step scope.
+  for (size_t step = 0; step < dop->cache_.num_steps; step++) {
+    auto& step_scope = dop->cache_.GetScope(step);
+    // the state of this step
+    ASSERT_TRUE(step_scope.FindVar("mem") != nullptr);
+    // its pre-state
+    ASSERT_TRUE(step_scope.FindVar("mem@pre") != nullptr);
+    // the boot state created by the fixture in the outer scope
+    ASSERT_TRUE(step_scope.FindVar("boot_mem") != nullptr);
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_add_op.h b/paddle/operators/elementwise_add_op.h
index e9f78ef26e05878053d968c35f17b456c128827a..f04fe3ec6069ab1bf227be6a3a5c10ee908e4824 100644
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseAddKernel : public framework::OpKernel {
+class ElementwiseAddKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
@@ -101,7 +101,7 @@ struct ElementwiseAddBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseAddGradKernel : public framework::OpKernel {
+class ElementwiseAddGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseAddGradFunctor<T>,
diff --git a/paddle/operators/elementwise_div_op.h b/paddle/operators/elementwise_div_op.h
index 99b6d9c1991edfb0018f8a459dfa373948cec434..8946ff3d25c2aff3dc3aa69368f0083371cd2fef 100644
--- a/paddle/operators/elementwise_div_op.h
+++ b/paddle/operators/elementwise_div_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseDivKernel : public framework::OpKernel {
+class ElementwiseDivKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenDivFunctor, Place, T>(ctx);
@@ -103,7 +103,7 @@ struct ElementwiseDivBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseDivGradKernel : public framework::OpKernel {
+class ElementwiseDivGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseDivGradFunctor<T>,
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index bda5dfe03e974740fe4a07191ae6b68ebfcd5d3a..da7765aa6a7a81c9e0b4f462022cad54c16aec47 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -36,7 +36,9 @@ REGISTER_OP(elementwise_mul, ops::ElementwiseOp, ops::ElementwiseMulOpMaker,
             elementwise_mul_grad, ops::ElementwiseOpGrad);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseMulKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/elementwise_mul_op.cu b/paddle/operators/elementwise_mul_op.cu
index da08a75596c4d3b89dc8892bd4405464fec96389..056f081d3e6ac349978ff00689700c035bed8e39 100644
--- a/paddle/operators/elementwise_mul_op.cu
+++ b/paddle/operators/elementwise_mul_op.cu
@@ -19,7 +19,9 @@ namespace ops = paddle::operators;
 
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul,
-    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseMulKernel<paddle::platform::GPUPlace, double>);
 REGISTER_OP_GPU_KERNEL(
     elementwise_mul_grad,
-    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>);
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, float>,
+    ops::ElementwiseMulGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/elementwise_mul_op.h b/paddle/operators/elementwise_mul_op.h
index 6ab642378bb0af8593ca0677014aede3c03cff8e..4469b07eaa08a3b011a88e58f1d645dd30b10ced 100644
--- a/paddle/operators/elementwise_mul_op.h
+++ b/paddle/operators/elementwise_mul_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseMulKernel : public framework::OpKernel {
+class ElementwiseMulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenMulFunctor, Place, T>(ctx);
@@ -102,7 +102,7 @@ struct ElementwiseMulBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseMulGradKernel : public framework::OpKernel {
+class ElementwiseMulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseMulGradFunctor<T>,
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index 3082f37422faa990bbf03c8a1a87b025d481a290..fce4b24a22f40c9cc57738273a758d0d48ff5e91 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -23,9 +23,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
   using Tensor = framework::Tensor;
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of elementwise op should not be null");
     PADDLE_ENFORCE(ctx->HasInput("Y"),
@@ -105,8 +104,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   using Tensor = framework::Tensor;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h
index 3eb97f60b59848d23bcd15ea1e3d2f21b721f6a4..488a35aafc8600bb8bb252fc3a5161c72a2f6df1 100644
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -108,7 +108,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
   PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
                     "Rank of first input must >= rank of second input.")
 
-  if (x_dims == y_dims || product(y_dims) == 1) {
+  if (x_dims == y_dims) {
     functor f;
     f.template Run<Place, T>(x, y, z, ctx);
     return;
@@ -174,12 +174,6 @@ void ElementwiseGradCompute(const framework::ExecutionContext& ctx) {
     return;
   }
 
-  if (product(y_dims) == 1) {
-    functor1 f;
-    f(place, x, y, out, dx, dy, dout);
-    return;
-  }
-
   int axis = ctx.Attr<int>("axis");
   axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
 
diff --git a/paddle/operators/elementwise_sub_op.h b/paddle/operators/elementwise_sub_op.h
index 3ca1376c73b3332b76a5973e201f9e4fba77cd21..3f40c1c5bcea5e8473765b039de4ee2a16054f0c 100644
--- a/paddle/operators/elementwise_sub_op.h
+++ b/paddle/operators/elementwise_sub_op.h
@@ -19,7 +19,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ElementwiseSubKernel : public framework::OpKernel {
+class ElementwiseSubKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseCompute<EigenSubFunctor, Place, T>(ctx);
@@ -102,7 +102,7 @@ struct ElementwiseSubBroadCast2GradFunctor {
 };
 
 template <typename Place, typename T>
-class ElementwiseSubGradKernel : public framework::OpKernel {
+class ElementwiseSubGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     ElementwiseGradCompute<Place, T, ElementwiseSubGradFunctor<T>,
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
deleted file mode 100644
index 5ac0e8cc45f007d42f1b6d7f86333f5cbedb3ea8..0000000000000000000000000000000000000000
--- a/paddle/operators/fc_op.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"
-
-namespace paddle {
-namespace operators {
-
-class FCOp : public NetOp {
- public:
-  FCOp(const std::string &type, const framework::VariableNameMap &inputs,
-       const framework::VariableNameMap &outputs,
-       const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    PADDLE_ENFORCE(!Inputs("X").empty(),
-                   "Inputs(X) of FCOp should not be null.");
-    PADDLE_ENFORCE(!Inputs("W").empty(),
-                   "Inputs(W) of FCOp should not be null.");
-    PADDLE_ENFORCE(!Outputs("MulOut").empty(),
-                   "Outputs(MulOut) of FCOp should not be null.");
-    PADDLE_ENFORCE_NE(Output("Out"), framework::kEmptyVarName,
-                      "Output(Out) of FCOp should not be null.");
-
-    auto x = Inputs("X");
-    auto w = Inputs("W");
-    auto mul_out = Outputs("MulOut");
-    PADDLE_ENFORCE_EQ(
-        x.size(), w.size(),
-        "The size of inputs X(%d) should be the same as that of weights W(%d).",
-        x.size(), w.size());
-    PADDLE_ENFORCE_EQ(mul_out.size(), x.size(),
-                      "The size of intermediate mul_out(%d) should be the same "
-                      "as that of inputs X(%d).",
-                      mul_out.size(), x.size());
-
-    size_t n = x.size();
-    PADDLE_ENFORCE_GE(n, static_cast<size_t>(1),
-                      "The size of inputs X(%d) should be no less than 1.", n);
-
-    auto x_num_col_dims = Attr<std::vector<int>>("xNumColDims");
-
-    // Set all values or set no values (use the default value)
-    if (!x_num_col_dims.empty()) {
-      PADDLE_ENFORCE_EQ(x_num_col_dims.size(), n,
-                        "The size of attribute xNumColDims(%d) should be the "
-                        "same as that of inputs X(%d).",
-                        x_num_col_dims.size(), n);
-    } else {
-      x_num_col_dims.resize(n);
-      for (size_t i = 0; i < n; i++) {
-        x_num_col_dims[i] = 1;
-      }
-    }
-
-    // mul_out[i] = X[i] * W[i]
-    for (size_t i = 0; i < n; i++) {
-      framework::AttributeMap mul_attr;
-      mul_attr["x_num_col_dims"] = static_cast<int>(x_num_col_dims[i]);
-      mul_attr["y_num_col_dims"] = static_cast<int>(1);
-      AppendOp(
-          framework::OpRegistry::CreateOp("mul", {{"X", {x[i]}}, {"Y", {w[i]}}},
-                                          {{"Out", {mul_out[i]}}}, mul_attr));
-    }
-
-    // sum_out = X[0] * W[0] + ... + X[n-1] * W[n-1]
-    auto sum_out = mul_out[0];
-    if (n > 1) {
-      PADDLE_ENFORCE_NE(Output("SumOut"), framework::kEmptyVarName,
-                        "Output(SumOut) of FCOp should not be null when the "
-                        "size of Inputs(X) > 1.");
-
-      sum_out = Output("SumOut");
-      AppendOp(framework::OpRegistry::CreateOp("sum", {{"X", {mul_out}}},
-                                               {{"Out", {sum_out}}}, {}));
-    } else {
-      if (Output("SumOut") != framework::kEmptyVarName) {
-        this->Rename(Output("SumOut"), framework::kEmptyVarName);
-      }
-    }
-
-    // add_out = sum_out + b
-    auto b = Input("B");
-    auto add_out = sum_out;
-    if (b != framework::kEmptyVarName) {
-      PADDLE_ENFORCE_NE(
-          Output("AddOut"), framework::kEmptyVarName,
-          "Output(AddOut) of FCOp should not be null when Input(B) is set.");
-
-      add_out = Output("AddOut");
-      AppendOp(framework::OpRegistry::CreateOp(
-          "rowwise_add", {{"X", {sum_out}}, {"b", {Input("B")}}},
-          {{"Out", {add_out}}}, {}));
-    } else {
-      if (Output("AddOut") != framework::kEmptyVarName) {
-        this->Rename(Output("AddOut"), framework::kEmptyVarName);
-      }
-    }
-
-    auto activation = Attr<std::string>("activation");
-    AppendOp(framework::OpRegistry::CreateOp(activation, {{"X", {add_out}}},
-                                             {{"Y", {Output("Out")}}}, {}));
-    CompleteAddOp(false);
-  }
-};
-
-class FCOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  FCOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "(A vector of Tensors) each input Tensor can be of arbitrary "
-             "dimension, and will be reshaped to a 2-D matrix of size "
-             "(minibatch, number_of_input_features) according to attribute "
-             "xNumColDims.")
-        .AsDuplicable();
-    AddInput("W",
-             "(A vector of Tensors) the weights of FC operator, a "
-             "vector of 2-D matrix of size "
-             "(number_of_input_features, number_of_neurons).")
-        .AsDuplicable();
-    AddInput("B",
-             "(Tensor) the bias of FC operator, a 1-D vector of size "
-             "number_of_neurons.");
-
-    AddOutput("Out",
-              "(Tensor) the activated output matrix of FC operator, a 2-D "
-              "matrix of size (minibatch, number_of_neurons).");
-    AddOutput("MulOut",
-              "(A vector of Tensors) the intermediate outputs of FC operator, "
-              "each Tensor saving the product of X_i * W_i.")
-        .AsIntermediate()
-        .AsDuplicable();
-    AddOutput(
-        "SumOut",
-        "(Tensor) the intermediate output of FC operator, "
-        "saving the sum of the products of X and W, that is sum{X_i * W_i}.")
-        .AsIntermediate();
-    AddOutput("AddOut",
-              "(Tensor) the non-actived output of FC operator, "
-              "saving sum{X_i * W_i} + B.")
-        .AsIntermediate();
-    AddAttr<std::string>(
-        "activation",
-        "(string, default identity) the activation type of FC operator.")
-        .SetDefault("identity")
-        .InEnum({"identity", "sigmoid", "softmax"});
-    AddAttr<std::vector<int>>(
-        "xNumColDims",
-        "(std::vector<int>) The inputs Tensors of FC operator can be of "
-        "more than 2 dimensions. In that case, each input Tensor `X_i` will be "
-        "reshaped to a 2-D matrix. The matrix's first dimension "
-        "(the length of column) will be the product of `X_i`'s last "
-        "`xNumColDims_i` dimensions, that is "
-        "`X_i.dims[0] x ... x X_i.dims[xNumColDims_i - 1]`. "
-        "The matrix's second dimension (the length of row) will be the product "
-        "of `X_i`'s first `rank - xNumColDims_i` dimensions, that is "
-        "`X_i.dims[xNumColDims_i] x ... x X_i.dims[rank - 1]`)")
-        .SetDefault(std::vector<int>{});
-
-    AddComment(R"DOC(
-Fully Connected Operator, known as Fully Connected Layer or Inner Product Layer
-in Convolutional Neural Networks. Neurons in a fully connected layer have
-full connections to all activations in the previous layer.
-It computes an inner product of a set of
-learned weights with a matrix multiplication followed by a bias offset
-(optionally).
-
-Equation:
-  Out = Act(sum_n{X_i * W_i} + B)
-
-where X_i is Tensor that will be reshaped to a 2-D matrix of size (M x K),
-usually M is the minibatch size and K is the number of input features.
-W_i is a 2-D matrix of size (K x N), where N means the number of neurons
-in the fully connected layer. B is a 1-D vector of size N.
-Thus, the output Out is a 2-D matrix of size (M x N).
-Activation type can be set to `identity` (default), `sigmoid` or `softmax`.
-
-All the inputs can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD with first input (`X[0]`).
-)DOC");
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(fc, ops::FCOp, ops::FCOpMaker);
diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0f1722a5383c80ff2ede0801d34f22a80fbc6e52
--- /dev/null
+++ b/paddle/operators/feed_op.cc
@@ -0,0 +1,72 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+// Copies column `col` of the FeedFetchList variable named by Input("X") into
+// the variable named by Output("Out"): data via CopyFrom, then the LoD.
+class FeedOp : public framework::OperatorBase {
+ public:
+  FeedOp(const std::string &type, const framework::VariableNameMap &inputs,
+         const framework::VariableNameMap &outputs,
+         const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto feed_var_name = Input("X");
+    auto *feed_var = scope.FindVar(feed_var_name);
+    PADDLE_ENFORCE(feed_var != nullptr,
+                   "Cannot find feed_var in scope, feed_var_name is %s",
+                   feed_var_name);
+
+    auto out_name = this->Output("Out");
+    auto *out_var = scope.FindVar(out_name);
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot find out_var in scope, out_var_name is %s",
+                   out_name);
+
+    auto col = Attr<int>("col");
+    VLOG(3) << "Feed Var " << feed_var_name << "'s " << col << " column to var "
+            << out_name;
+
+    auto &feed_list = feed_var->Get<framework::FeedFetchList>();
+    auto &feed_item = feed_list.at(static_cast<size_t>(col));
+    auto *out_item = out_var->GetMutable<framework::FeedFetchType>();
+    out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx);
+    out_item->set_lod(feed_item.lod());
+  }
+};
+
+class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FeedOpInfoMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of feed op");
+    AddOutput("Out", "The output of feed op");
+    AddComment("feed op, it should not be configured by users directly");
+    AddAttr<int>("col", "column of feed");  // list index used by FeedOp::Run
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(feed, paddle::operators::FeedOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::FeedOpInfoMaker);
diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c35d7d49e31f6ca11e2b37a455af430aac50a232
--- /dev/null
+++ b/paddle/operators/fetch_op.cc
@@ -0,0 +1,77 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/feed_fetch_type.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Copies the variable named by Input("X") into slot `col` of the
+// FeedFetchList variable named by Output("Out"), LoD included.
+class FetchOp : public framework::OperatorBase {
+ public:
+  FetchOp(const std::string &type, const framework::VariableNameMap &inputs,
+          const framework::VariableNameMap &outputs,
+          const framework::AttributeMap &attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+  void Run(const framework::Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {
+    auto fetch_var_name = Input("X");
+    auto *fetch_var = scope.FindVar(fetch_var_name);
+    PADDLE_ENFORCE(fetch_var != nullptr,
+                   "Cannot find fetch variable in scope, fetch_var_name is %s",
+                   fetch_var_name);
+
+    auto out_name = this->Output("Out");
+    auto *out_var = scope.FindVar(out_name);
+    PADDLE_ENFORCE(out_var != nullptr,
+                   "Cannot find out_var in scope, out_var_name is %s",
+                   out_name);
+
+    auto col = static_cast<size_t>(Attr<int>("col"));
+    auto *fetch_list = out_var->GetMutable<framework::FeedFetchList>();
+    auto &src_item = fetch_var->Get<framework::FeedFetchType>();
+    // Grow the list on demand so that slot `col` exists.
+    if (col >= fetch_list->size()) {
+      fetch_list->resize(col + 1);
+    }
+    auto &dst_item = fetch_list->at(col);
+
+    // FIXME(yuyang18): Should we assume the fetch operator always generate
+    // CPU outputs?
+    dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx);
+    dst_item.set_lod(src_item.lod());
+
+    VLOG(3) << "Fetch variable " << fetch_var_name << " to " << out_name;
+  }
+};
+
+class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FetchOpInfoMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of fetch op");
+    AddOutput("Out", "The output of fetch op");
+    AddComment("fetch op, it should not be configured by users directly");
+    AddAttr<int>("col", "column of fetch");  // slot index used by FetchOp::Run
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(fetch, paddle::operators::FetchOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::FetchOpInfoMaker);
diff --git a/paddle/operators/fill_constant_op.cc b/paddle/operators/fill_constant_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0438d4d085f81d463253605b3aeca640a433a3b3
--- /dev/null
+++ b/paddle/operators/fill_constant_op.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_constant_op.h"
+
+namespace paddle {
+namespace operators {
+
+// Operator whose output "Out" takes its shape from the "shape" attribute;
+// IndicateDataType reports the type stored in the "data_type" attribute.
+class FillConstantOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of FillConstantOp should not be null.");
+    auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    // Widen each int extent to int64_t, the element type handed to make_ddim.
+    std::vector<int64_t> shape_int64(shape.begin(), shape.end());
+    ctx->SetOutputDim("Out", framework::make_ddim(shape_int64));
+  }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext &ctx) const override {
+    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+  }
+};
+
+class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FillConstantOpMaker(framework::OpProto *proto,
+                      framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<int>("data_type",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::DataType::FP32);  // FillConstantOp casts it back
+    AddAttr<std::vector<int>>("shape", "(vector<int>) The shape of the output");
+    AddAttr<float>("value", "(float, default 0) The value to be filled")
+        .SetDefault(0.0f);  // always a float attribute, whatever the kernel's T
+    AddOutput("Out",
+              "(Tensor) Tensor of specified shape will be filled "
+              "with the specified value");
+    AddComment(R"DOC(Fill up a variable with specified constant value.)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(fill_constant, ops::FillConstantOp,
+                             ops::FillConstantOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_constant,
+    ops::FillConstantOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/fill_constant_op.cu
similarity index 75%
rename from paddle/operators/rowwise_add_op.cu
rename to paddle/operators/fill_constant_op.cu
index 4a57f64c890ce99d6060faec6a4a01b107403344..eef8fcbd7f65a9891126e039c4d46a106a6daa60 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/fill_constant_op.cu
@@ -13,11 +13,10 @@
    limitations under the License. */
 
 #define EIGEN_USE_GPU
-#include "paddle/operators/rowwise_add_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/fill_constant_op.h"
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
-    rowwise_add, ops::RowwiseAddKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(
-    rowwise_add_grad,
-    ops::RowwiseAddGradKernel<paddle::platform::GPUPlace, float>);
+    fill_constant,
+    ops::FillConstantOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/fill_constant_op.h b/paddle/operators/fill_constant_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..53b8b548eca6dfe035c326d95f91d3e279f63318
--- /dev/null
+++ b/paddle/operators/fill_constant_op.h
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Fills every element of Output("Out") with the "value" attribute cast to T.
+template <typename Place, typename T>
+class FillConstantOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    auto value = static_cast<T>(ctx.Attr<float>("value"));  // attr is a float
+    auto out_eigen = framework::EigenVector<T>::Flatten(*out);
+    auto place = ctx.GetEigenDevice<Place>();
+    out_eigen.device(place) = out_eigen.constant(value);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
index e164de6584e7350283781019cc74118c2d13646e..ed529ac40aaf179b35a9ab32e11ed7dbbe9289ba 100644
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -21,8 +21,7 @@ class FillZerosLikeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of FillZerosLikeOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Y"),
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
index 4474581784531faee1741f0b143743e31cc3788f..cdf56a723b117fe7b08ef2749aa2c2978c923d44 100644
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class FillZerosLikeKernel : public framework::OpKernel {
+class FillZerosLikeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* output = context.Output<framework::Tensor>("Y");
diff --git a/paddle/operators/gather.cu.h b/paddle/operators/gather.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d04ecd284226c7b4c6cdd5531915fee2d94ce61
--- /dev/null
+++ b/paddle/operators/gather.cu.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using platform::Place;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void GatherCUDAKernel(const T* params, const int* indices, T* output,
+                                 size_t index_size, size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {  // i: flat output offset
+    int indices_i = i / slice_size;                  // which entry of indices
+    int slice_i = i - indices_i * slice_size;        // offset inside the slice
+    int gather_i = indices[indices_i];               // source slice to read
+    int params_i = gather_i * slice_size + slice_i;  // flat offset in params
+    *(output + i) = *(params + params_i);
+  }
+}
+
+/**
+ * A thin wrapper for gathering on gpu tensor: fills output with the slices
+ * of src (along dim 0) picked by index. ctx must be a CUDA device context
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * output: receives the gathered slices (accessed via data<T>(), not resized)
+ */
+template <typename T>
+void GPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
+  // ctx is reinterpret_cast to a CUDADeviceContext below, so reject others
+  PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  // slice size
+  auto src_dims = src.dims();
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  int block = 512;
+  int n = slice_size * index_size;
+  // nothing to gather; a launch with zero blocks is an invalid configuration
+  if (n == 0) return;
+  int grid = (n + block - 1) / block;
+
+  GatherCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/gather.h b/paddle/operators/gather.h
index 92fb51ec17709bc6f8abb2f516a9240fb5dc3a77..052db49cb3c2594eca8b9a5e3716689480089703 100644
--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
@@ -24,49 +24,40 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-// Implementation of CPU copy
-template <typename T>
-void CPUGather(const T* src, const int* indices, const int slice_size,
-               const int index_size, T* output) {
-  const size_t slice_bytes = slice_size * sizeof(T);
-
-  for (int i = 0; i < index_size; ++i) {
-    int index_ = indices[i];
-    memcpy(output + i * slice_size, src + index_ * slice_size, slice_bytes);
-  }
-}
-
-// Implementation of GPU copy:
-template <typename T>
-void GPUGather(const T* src, const int* index, const int slice_size,
-               const int index_size, T* output);
+using framework::Tensor;
 
 /**
+ * A thin wrapper for gathering on cpu tensor
  * Return a new tensor from source tensor, gathered according to index
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
-            const paddle::framework::Tensor* index,
-            paddle::framework::Tensor* output) {
+void CPUGather(const platform::DeviceContext& ctx, const Tensor& src,
+               const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   framework::DDim output_dims(src_dims);
   output_dims[0] = index_size;
 
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
   // slice size
   int slice_size = 1;
   for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
 
-  // Gathering
-  if (platform::is_cpu_place(place)) {
-    CPUGather<T>(src->data<T>(), index->data<int>(), slice_size, index_size,
-                 output->data<T>());
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + i * slice_size, p_src + index_ * slice_size, slice_bytes);
   }
 }
 
diff --git a/paddle/operators/gather_op.cc b/paddle/operators/gather_op.cc
index 0e3cd174adee1e50d0a63861286a26d325484efb..f6c7f472da24a1a60c0d2538ae643bdc8e55b10f 100644
--- a/paddle/operators/gather_op.cc
+++ b/paddle/operators/gather_op.cc
@@ -22,8 +22,7 @@ class GatherOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of GatherOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Index"),
@@ -31,22 +30,35 @@ class GatherOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of GatherOp should not be null.");
 
+    auto index_dims = ctx->GetInputDim("Index");
+    PADDLE_ENFORCE(index_dims.size() == 1);
     int batch_size = ctx->GetInputDim("Index")[0];
     PADDLE_ENFORCE_GE(batch_size, 0, "Batch size must be >0");
     framework::DDim output_dims(ctx->GetInputDim("X"));
     output_dims[0] = batch_size;
     ctx->SetOutputDim("Out", output_dims);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class GatherGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("X")->type());
+  }
 };
 
 class GatherOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -69,8 +81,5 @@ Out = X[Index]
 namespace ops = paddle::operators;
 REGISTER_OP(gather, ops::GatherOp, ops::GatherOpMaker, gather_grad,
             ops::GatherGradOp);
-REGISTER_OP_CPU_KERNEL(gather,
-                       ops::GatherOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    gather_grad,
-    ops::GatherGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(gather, ops::GatherOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel<float>);
diff --git a/paddle/operators/gather_op.cu b/paddle/operators/gather_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..92219d6a433e6db0bb9886ed8670cbafaa843ff8
--- /dev/null
+++ b/paddle/operators/gather_op.cu
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GatherOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  // Forward gather on GPU: allocates Out, then fills it from X using Index.
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    const auto *src = ctx.Input<Tensor>("X");
+    const auto *idx = ctx.Input<Tensor>("Index");
+    auto *out = ctx.Output<Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+
+    GPUGather<T>(ctx.device_context(), *src, *idx, out);
+  }
+};
+
+template <typename T>
+class GatherGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
+    auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // Allocate dX and zero-fill it before dO is scattered into it by Index.
+    dX->mutable_data<T>(ctx.GetPlace());
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto place = ctx.GetEigenDevice<platform::GPUPlace>();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
+    GPUScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gather, ops::GatherOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel<float>);
diff --git a/paddle/operators/gather_op.h b/paddle/operators/gather_op.h
index 381854f301870beadb72d9e9b4eb17ff199960fb..8276ed0d3d8b676aafab45fae70942e78b72b8e6 100644
--- a/paddle/operators/gather_op.h
+++ b/paddle/operators/gather_op.h
@@ -23,29 +23,40 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
-class GatherOpKernel : public framework::OpKernel {
+template <typename T>
+class GatherOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
-    auto *X = ctx.Input<Tensor>("X");
-    auto *Index = ctx.Input<Tensor>("Index");
-    auto *Y = ctx.Output<Tensor>("Out");
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
+    auto *x = ctx.Input<Tensor>("X");
+    auto *index = ctx.Input<Tensor>("Index");
+    auto *output = ctx.Output<Tensor>("Out");
+
+    output->mutable_data<T>(ctx.GetPlace());
 
-    Y->mutable_data<T>(ctx.GetPlace());
-    Gather<T>(ctx.GetPlace(), X, Index, Y);
+    CPUGather<T>(ctx.device_context(), *x, *index, output);
   }
 };
 
-template <typename Place, typename T>
-class GatherGradientOpKernel : public framework::OpKernel {
+template <typename T>
+class GatherGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
+
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dX = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto *dO = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     dX->mutable_data<T>(ctx.GetPlace());
-    ScatterUpdate<T>(ctx.GetPlace(), dO, Index, dX);
+    auto dxt = framework::EigenVector<T>::Flatten(*dX);
+    auto place = ctx.GetEigenDevice<platform::CPUPlace>();
+    dxt.device(place) = dxt.constant(static_cast<T>(0));
+
+    ScatterAssign<T>(ctx.device_context(), *dO, *Index, dX);
   }
 };
 
diff --git a/paddle/operators/gather_test.cc b/paddle/operators/gather_test.cc
index 0ae1e99452973feb6d085dd6ef51e2afca988f59..cbd86b87961ee24aa889e208de5ac38e03a33135 100644
--- a/paddle/operators/gather_test.cc
+++ b/paddle/operators/gather_test.cc
@@ -41,7 +41,9 @@ TEST(Gather, GatherData) {
 
   int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
 
-  Gather<int>(CPUPlace(), src, index, output);
+  auto* cpu_place = new paddle::platform::CPUPlace();
+  paddle::platform::CPUDeviceContext ctx(*cpu_place);
+  CPUGather<int>(ctx, *src, *index, output);
 
   for (int i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
   for (int i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
diff --git a/paddle/operators/gaussian_random_op.cc b/paddle/operators/gaussian_random_op.cc
index 05120a6e7bcfdb8641c722731f462c89e4223339..04dfdf7c48381240108cf924979764966599151f 100644
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
@@ -16,7 +16,7 @@ namespace paddle {
 namespace operators {
 
 template <typename T>
-class CPUGaussianRandomKernel : public framework::OpKernel {
+class CPUGaussianRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     float mean = context.Attr<float>("mean");
@@ -42,8 +42,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of GaussianRandomOp should not be null.");
     auto dims = ctx->Attrs().Get<std::vector<int>>("dims");
@@ -56,6 +55,12 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
                    "dims can be one int or array. dims must be set.");
     ctx->SetOutputDim("Out", framework::make_ddim(temp));
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+  }
 };
 
 class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -76,6 +81,8 @@ Use to initialize tensor with gaussian random generator.
                  "Random seed of generator."
                  "0 means use system wide seed")
         .SetDefault(0);
+    AddAttr<int>("data_type", "output data type")
+        .SetDefault(framework::DataType::FP32);
   }
 };
 
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index 2d63b3049988cfc3135a87a57dad56b970df3eab..315560bf1ba8a66b9a3b7d79510d202885e845d6 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -37,7 +37,7 @@ struct GaussianGenerator {
 };
 
 template <typename T>
-class GPUGaussianRandomKernel : public framework::OpKernel {
+class GPUGaussianRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
diff --git a/paddle/operators/gru_unit_op.cc b/paddle/operators/gru_unit_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a596f93769780419d27b7c0b40631d3da78e6700
--- /dev/null
+++ b/paddle/operators/gru_unit_op.cc
@@ -0,0 +1,208 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/gru_unit_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class GRUUnitOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks required inputs/outputs and their shapes, then sets output dims.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUUnitOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
+                   "Input(%s) of GRUUnitOp should not be null.", "HiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUUnitOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasOutput("Gate"),
+                   "Output(%s) of GRUUnitOp should not be null.", "Gate");
+    PADDLE_ENFORCE(ctx->HasOutput("ResetHiddenPrev"),
+                   "Output(%s) of GRUUnitOp should not be null.",
+                   "ResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(%s) of GRUUnitOp should not be null.", "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int batch_size = input_dims[0];
+    int input_size = input_dims[1];
+    int frame_size = hidden_prev_dims[1];  // width of one hidden-state row
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(
+        input_size, frame_size * 3,
+        "The input_size must be 3 times of frame_size in GRUUnitOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    if (ctx->HasInput("Bias")) {  // Bias is only validated when supplied
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+    }
+    ctx->SetOutputDim("Gate", {batch_size, frame_size * 3});
+    ctx->SetOutputDim("ResetHiddenPrev", {batch_size, frame_size});
+    ctx->SetOutputDim("Hidden", {batch_size, frame_size});
+  }
+};
+
+class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GRUUnitOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
+             "input.");
+    AddInput("HiddenPrev",
+             "(Tensor) Matrix with shape [batch_size, frame_size] for the "
+             "states of previous time step.");
+    AddInput("Weight",
+             "(Tensor) Weight matrix with shape [frame_size, frame_size * 3]. "
+             "The elements continuous in memory can be divided into two parts. "
+             "The first part are weights of the update gate and reset gate "
+             "with shape [frame_size, frame_size * 2], and the second part are "
+             "weights of output candidate with shape [frame_size, frame_size]");
+    AddInput("Bias",
+             "(Tensor) Bias vector with shape [1, frame_size * 3] concating "
+             "bias of the update gate, reset gate and output candidate.")
+        .AsDispensable();  // optional: the op also runs without a bias
+    AddOutput("Gate",
+              "(Tensor) Matrix with shape [batch_size, frame_size * 3] for the "
+              "output of update gate, reset gate and output candidate")
+        .AsIntermediate();
+    AddOutput("ResetHiddenPrev",
+              "(Tensor) Matrix with shape [batch_size, frame_size] for the "
+              "reseted hidden state of previous time step.")
+        .AsIntermediate();
+    AddOutput("Hidden",
+              "(Tensor) The GRU hidden state of the current time step "
+              "with shape [batch_size, frame_size].");
+    AddAttr<int>("activation",
+                 "(enum int, default tanh) "
+                 "The activation type used for output candidate {h}_t.")
+        .SetDefault(tanh)  // GRUActivationType value from gru_unit_op.h
+        .InEnum({identity, sigmoid, tanh, relu});
+    AddAttr<int>("gate_activation",
+                 "(enum int, default sigmoid) "
+                 "The activation type used in update gate and reset gate.")
+        .SetDefault(sigmoid)  // GRUActivationType value from gru_unit_op.h
+        .InEnum({identity, sigmoid, tanh, relu});
+    AddComment(R"DOC(
+GRUUnitOp implements part calculations of the GRU unit as following:
+
+\f[
+update \ gate: u_t = actGate(xu_t + W_u * hidden_prev + bias_u) \\
+reset \ gate: r_t = actGate(xr_t + W_r * hidden_prev + bias_r)  \\
+output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, hidden_prev) + bias_c) \\
+output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_prev)
+\f]
+
+The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
+)DOC");
+  }
+};
+
+class GRUUnitGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Validates forward tensors and incoming grads, then sizes requested grads.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Input");
+    PADDLE_ENFORCE(ctx->HasInput("HiddenPrev"),
+                   "Input(%s) of GRUUnitGradOp should not be null.",
+                   "HiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Weight");
+    PADDLE_ENFORCE(ctx->HasInput("Gate"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Gate");
+    PADDLE_ENFORCE(ctx->HasInput("ResetHiddenPrev"),
+                   "Input(%s) of GRUUnitGradOp should not be null.",
+                   "ResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput("Hidden"),
+                   "Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
+                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
+                   "Gate");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
+                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
+                   "ResetHiddenPrev");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(%s@GRAD) of GRUUnitGradOp should not be null.",
+                   "Hidden");
+    auto input_dims = ctx->GetInputDim("Input");
+    auto hidden_prev_dims = ctx->GetInputDim("HiddenPrev");
+    auto weight_dims = ctx->GetInputDim("Weight");
+    int input_size = input_dims[1];
+    int frame_size = hidden_prev_dims[1];
+    int weight_height = weight_dims[0];
+    int weight_width = weight_dims[1];
+    PADDLE_ENFORCE_EQ(
+        input_size, frame_size * 3,
+        "The input_size must be 3 times of frame_size in GRUUnitOp.");
+    PADDLE_ENFORCE_EQ(
+        weight_height, frame_size,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    PADDLE_ENFORCE_EQ(
+        weight_width, frame_size * 3,
+        "The shape of Weight matrix must be [frame_size, frame_size * 3].");
+    // Bias is dispensable; use the same presence check as the forward op.
+    if (ctx->HasInput("Bias")) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      int bias_height = bias_dims[0];
+      int bias_width = bias_dims[1];
+      PADDLE_ENFORCE_EQ(bias_height, 1,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      PADDLE_ENFORCE_EQ(bias_width, frame_size * 3,
+                        "The shape of Bias must be [1, frame_size * 3].");
+      auto bias_grad_name = framework::GradVarName("Bias");
+      if (ctx->HasOutput(bias_grad_name))
+        ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+    auto input_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(input_grad_name))
+      ctx->SetOutputDim(input_grad_name, input_dims);
+    auto hidden_prev_grad_name = framework::GradVarName("HiddenPrev");
+    if (ctx->HasOutput(hidden_prev_grad_name))
+      ctx->SetOutputDim(hidden_prev_grad_name, hidden_prev_dims);
+    auto weight_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(weight_grad_name))
+      ctx->SetOutputDim(weight_grad_name, weight_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gru_unit, ops::GRUUnitOp, ops::GRUUnitOpMaker, gru_unit_grad,
+            ops::GRUUnitGradOp);
+REGISTER_OP_CPU_KERNEL(gru_unit,
+                       ops::GRUUnitKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/gru_unit_op.cu b/paddle/operators/gru_unit_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..365f656523ddfb7ec8e2a5b885de74674823325a
--- /dev/null
+++ b/paddle/operators/gru_unit_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/gru_unit_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gru_unit,
+                       ops::GRUUnitKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    gru_unit_grad, ops::GRUUnitGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c53e7d9827e0395e6ce613302e732b2797f83cdd
--- /dev/null
+++ b/paddle/operators/gru_unit_op.h
@@ -0,0 +1,230 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/activation_op.h"
+#include "paddle/operators/math/math_function.h"
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
+enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 };
+
+template <typename Place, typename T>
+class GRUUnitKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y>
+  void ActCompute(const int act_type, const Device& d, X x, Y y) const {
+    if (act_type == identity)
+      y.device(d) = x;
+    else if (act_type == sigmoid)
+      SigmoidFunctor<T>()(d, x, y);
+    else if (act_type == tanh)
+      TanhFunctor<T>()(d, x, y);
+    else if (act_type == relu)
+      ReluFunctor<T>()(d, x, y);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("Input");
+    auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
+    auto* weight = context.Input<Tensor>("Weight");
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* gate = context.Output<Tensor>("Gate");
+    gate->mutable_data<T>(context.GetPlace());
+    auto* reset_hidden_prev = context.Output<Tensor>("ResetHiddenPrev");
+    reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<Tensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
+    auto x = EigenMatrix<T>::From(*input);
+    auto h_p = EigenMatrix<T>::From(*hidden_prev);
+    auto g = EigenMatrix<T>::From(*gate);
+    auto r_h_p = EigenMatrix<T>::From(*reset_hidden_prev);
+    auto h = EigenMatrix<T>::From(*hidden);
+    auto place = context.GetEigenDevice<Place>();
+
+    // calculate unactivated gate outputs: g = x (+ bias broadcast over rows)
+    if (bias) {
+      auto b = EigenMatrix<T>::From(*bias);
+      g.device(place) = x +
+                        b.reshape(Eigen::array<int, 2>({{1, frame_size * 3}}))
+                            .broadcast(Eigen::array<int, 2>({{batch_size, 1}}));
+    } else {
+      g.device(place) = x;
+    }
+    const T* hidden_prev_data = hidden_prev->data<T>();
+    const T* weight_data = weight->data<T>();
+    T* gate_data = gate->data<T>();
+    T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
+    math::gemm<Place, T>(context.device_context(), false, false, batch_size,
+                         2 * frame_size, frame_size, 1, hidden_prev_data,
+                         frame_size, weight_data, frame_size * 2, 1, gate_data,
+                         frame_size * 3);  // presumably adds h_p * W_{u,r}
+
+    // calculate activated gates (each activation runs in place on a g slice)
+    Eigen::array<int, 2> extents({{batch_size, frame_size}});
+    Eigen::array<int, 2> u_offsets({{0, 0}});
+    ActCompute(context.Attr<int>("gate_activation"), place,
+               g.slice(u_offsets, extents), g.slice(u_offsets, extents));
+    auto u = g.slice(u_offsets, extents);  // update gate
+    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    ActCompute(context.Attr<int>("gate_activation"), place,
+               g.slice(r_offsets, extents), g.slice(r_offsets, extents));
+    auto r = g.slice(r_offsets, extents);  // reset gate
+    r_h_p.device(place) = r * h_p;         // reset previous hidden state
+    math::gemm<Place, T>(context.device_context(), false, false, batch_size,
+                         frame_size, frame_size, 1, reset_hidden_prev_data,
+                         frame_size, weight_data + frame_size * frame_size * 2,
+                         frame_size, 1, gate_data + frame_size * 2,
+                         frame_size * 3);  // presumably adds (r * h_p) * W_c
+
+    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    ActCompute(context.Attr<int>("activation"), place,
+               g.slice(c_offsets, extents), g.slice(c_offsets, extents));
+    auto c = g.slice(c_offsets, extents);  // output candidate
+
+    // calculate final output: algebraically h = u * h_p + (1 - u) * c
+    h.device(place) = u * (h_p - c) + c;
+  }
+};
+
+template <typename Place, typename T>
+class GRUUnitGradKernel : public framework::OpKernel<T> {
+ public:
+  template <typename Device, typename X, typename Y, typename DX, typename DY>
+  void ActGradCompute(const int act_type, const Device& d, X x, Y y, DX dx,
+                      DY dy) const {
+    // x is dummy and won't be used even in Relu(use y instead)
+    if (act_type == identity)
+      dx.device(d) = dy;
+    else if (act_type == sigmoid)
+      SigmoidGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == tanh)
+      TanhGradFunctor<T>()(d, x, y, dy, dx);
+    else if (act_type == relu)
+      ReluGradFunctor<T>()(d, x, y, dy, dx);
+    else
+      PADDLE_THROW("unsupported activation type");
+  }
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* input = context.Input<Tensor>("Input");
+    auto* hidden_prev = context.Input<Tensor>("HiddenPrev");
+    auto* weight = context.Input<Tensor>("Weight");
+    auto* gate = context.Input<Tensor>("Gate");
+    auto* reset_hidden_prev = context.Input<Tensor>("ResetHiddenPrev");
+    auto* hidden_grad = context.Input<Tensor>(framework::GradVarName("Hidden"));
+    auto* input_grad = context.Output<Tensor>(framework::GradVarName("Input"));
+    auto* hidden_prev_grad =
+        context.Output<Tensor>(framework::GradVarName("HiddenPrev"));
+    auto* weight_grad =
+        context.Output<Tensor>(framework::GradVarName("Weight"));
+    auto* bias_grad = context.Output<Tensor>(framework::GradVarName("Bias"));
+    input_grad->mutable_data<T>(context.GetPlace());
+    hidden_prev_grad->mutable_data<T>(context.GetPlace());
+    weight_grad->mutable_data<T>(context.GetPlace());
+    Tensor gate_grad;  // scratch: gradient w.r.t. the unactivated gates
+    gate_grad.mutable_data<T>(input->dims(), context.GetPlace());
+    Tensor reset_hidden_prev_grad;  // scratch: gradient w.r.t. r * h_p
+    reset_hidden_prev_grad.mutable_data<T>(reset_hidden_prev->dims(),
+                                           context.GetPlace());
+    // NOTE(review): input/hidden_prev/weight grads are used without null checks.
+    int batch_size = input->dims()[0];
+    int frame_size = hidden_prev->dims()[1];
+
+    const T* hidden_prev_data = hidden_prev->data<T>();
+    T* hidden_prev_grad_data = hidden_prev_grad->data<T>();
+    const T* weight_data = weight->data<T>();
+    T* weight_grad_data = weight_grad->data<T>();
+    T* gate_grad_data = gate_grad.data<T>();
+    const T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
+    T* reset_hidden_prev_grad_data = reset_hidden_prev_grad.data<T>();
+
+    auto h_p = EigenMatrix<T>::From(*hidden_prev);
+    auto g = EigenMatrix<T>::From(*gate);
+    auto d_h = EigenMatrix<T>::From(*hidden_grad);
+    auto d_x = EigenMatrix<T>::From(*input_grad);
+    auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
+    auto d_g = EigenMatrix<T>::From(gate_grad);
+    auto d_r_h_p = EigenMatrix<T>::From(reset_hidden_prev_grad);
+    auto place = context.GetEigenDevice<Place>();
+
+    Eigen::array<int, 2> extents({{batch_size, frame_size}});
+    Eigen::array<int, 2> u_offsets({{0, 0}});
+    auto u = g.slice(u_offsets, extents);  // update gate
+    Eigen::array<int, 2> r_offsets({{0, frame_size}});
+    auto r = g.slice(r_offsets, extents);  // reset gate
+    Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
+    auto c = g.slice(c_offsets, extents);  // output candidate
+
+    // backward for unactivated update gate
+    ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
+                   d_g.slice(u_offsets, extents), d_h * (h_p - c));
+    // backward for unactivated output candidate
+    ActGradCompute(context.Attr<int>("activation"), place, c, c,
+                   d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
+    // backward for reset_hidden_prev
+    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
+                         frame_size, frame_size, 1,
+                         gate_grad_data + frame_size * 2, frame_size * 3,
+                         weight_data + frame_size * frame_size * 2, frame_size,
+                         0, reset_hidden_prev_grad_data, frame_size);
+    // backward for state_weight
+    math::gemm<Place, T>(
+        context.device_context(), true, false, frame_size, frame_size,
+        batch_size, 1, reset_hidden_prev_data, frame_size,
+        gate_grad_data + frame_size * 2, frame_size * 3, 0,
+        weight_grad_data + frame_size * frame_size * 2, frame_size);
+    // backward for unactivated reset gate
+    ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
+                   d_g.slice(r_offsets, extents), d_r_h_p * h_p);
+    // backward for update_gate_weight and reset_gate_weight
+    math::gemm<Place, T>(context.device_context(), true, false, frame_size,
+                         frame_size * 2, batch_size, 1, hidden_prev_data,
+                         frame_size, gate_grad_data, frame_size * 3, 0,
+                         weight_grad_data, frame_size * 2);
+    // backward for hidden_prev
+    d_h_p.device(place) = d_r_h_p * r + d_h * u;
+    math::gemm<Place, T>(context.device_context(), false, true, batch_size,
+                         frame_size, frame_size * 2, 1, gate_grad_data,
+                         frame_size * 3, weight_data, frame_size * 2, 1,
+                         hidden_prev_grad_data, frame_size);
+    // backward for input
+    d_x.device(place) = d_g;
+    // backward for bias
+    if (bias_grad) {
+      bias_grad->mutable_data<T>(context.GetPlace());
+      auto d_b = EigenMatrix<T>::From(*bias_grad);
+      d_b.device(place) = d_g.sum(Eigen::array<int, 1>({{0}}));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/identity_op.cc b/paddle/operators/identity_op.cc
deleted file mode 100644
index 2cc632205e63abbe412b09af4b894420ac512ec5..0000000000000000000000000000000000000000
--- a/paddle/operators/identity_op.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/net_op.h"
-#include "paddle/operators/scale_op.h"
-
-namespace paddle {
-namespace operators {
-
-// The identity operator is an alias of the scale operator. This is also an
-// example for creating an alias for an existing operator.
-template <typename AttrType>
-class IdentityOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  IdentityOpMaker(framework::OpProto *proto,
-                  framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of identity operator.");
-    AddOutput("Y", "The output tensor of identity operator.");
-    AddComment(R"DOC(
-The identity operator is an alias of the scale operator
-with the attribute scale fixed to 1.0.
-)DOC");
-  }
-};
-
-template <typename AttrType>
-class IdentityOp : public NetOp {
- public:
-  IdentityOp(const std::string &type, const framework::VariableNameMap &inputs,
-             const framework::VariableNameMap &outputs,
-             const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    PADDLE_ENFORCE_NE(Input("X"), framework::kEmptyVarName,
-                      "Input(X) of IdentityOp should not be null.");
-    PADDLE_ENFORCE_NE(Output("Y"), framework::kEmptyVarName,
-                      "Output(Y) of IdentityOp should not be null.");
-
-    AppendOp(framework::OpRegistry::CreateOp(
-        "scale", {{"X", {Input("X")}}}, {{"Out", {Output("Y")}}},
-        {{"scale", static_cast<AttrType>(1)}}));
-    CompleteAddOp(false);
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_WITHOUT_GRADIENT(identity, ops::IdentityOp<float>,
-                             ops::IdentityOpMaker<float>);
diff --git a/paddle/operators/images/batch_norm_fork.dot b/paddle/operators/images/batch_norm_fork.dot
new file mode 100644
index 0000000000000000000000000000000000000000..4bc47713cba2cb23f1b34fffe6426ef10ac3a9df
--- /dev/null
+++ b/paddle/operators/images/batch_norm_fork.dot
@@ -0,0 +1,25 @@
+digraph ImageBatchNormForkGraph {  // batch_norm_op before and after forking
+  subgraph cluster_before {  // one batch_norm_op between rnn_op and fc_op
+    Prev [label="...", shape=plaintext];
+    Rnn [label="rnn_op", shape=box];
+    BatchNorm [label="batch_norm_op", shape=box];
+    Fc [label="fc_op", shape=box];
+    After [label="...", shape=plaintext];
+    Prev -> Rnn -> BatchNorm -> Fc -> After;
+    label="original";
+  }
+
+  subgraph cluster_after {  // rnn_op feeds separate train and infer branches
+    Prev2 [label="...", shape=plaintext];
+    Rnn2 [label="rnn_op", shape=box];
+    BatchNorm2_1 [label="train_batch_norm_op", shape=box];
+    BatchNorm2_2 [label="infer_batch_norm_op", shape=box];
+    Fc2_1 [label="fc_op", shape=box];
+    Fc2_2 [label="fc_op", shape=box];
+    After2_1 [label="...", shape=plaintext];
+    After2_2 [label="...", shape=plaintext];
+    Prev2 -> Rnn2 -> BatchNorm2_1 -> Fc2_1 -> After2_1;
+    Rnn2 -> BatchNorm2_2 -> Fc2_2 -> After2_2;
+    label="forked";
+  }
+}
diff --git a/paddle/operators/images/batch_norm_fork.png b/paddle/operators/images/batch_norm_fork.png
new file mode 100644
index 0000000000000000000000000000000000000000..aded62bce5bc268b7a3ef4dc96c89fe21d6ea955
Binary files /dev/null and b/paddle/operators/images/batch_norm_fork.png differ
diff --git a/paddle/operators/images/batch_norm_op_kernel.png b/paddle/operators/images/batch_norm_op_kernel.png
new file mode 100644
index 0000000000000000000000000000000000000000..a99ce81ff3bf42880ebbd6a1297de3bf038e09b2
Binary files /dev/null and b/paddle/operators/images/batch_norm_op_kernel.png differ
diff --git a/paddle/operators/increment_op.cc b/paddle/operators/increment_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..139392c691e00b2a94f46801f1cfc2018ce139f5
--- /dev/null
+++ b/paddle/operators/increment_op.cc
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/increment_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IncrementOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires X and Out to exist; Out gets X's dimensions and shares X's LoD.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of IncrementOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of IncrementOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+// Declares the proto of the increment op: input X, output Out, attribute step.
+template <typename AttrType>
+class IncrementOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  IncrementOpMaker(framework::OpProto *proto,
+                   framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "(Tensor) The input tensor of increment operator");
+    AddOutput("Out", "(Tensor) The output tensor of increment operator.");
+    AddComment(R"DOC(Increment operator
+
+The equation is: Out = X + step
+)DOC");
+    AddAttr<AttrType>(
+        "step", "The step size by which the input tensor will be incremented.")
+        .SetDefault(1.0);
+  }
+};
+
+class IncrementGradOpMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+  // Builds X@GRAD = scale(Out@GRAD, 1); the desc is owned from creation on.
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("scale");
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", 1.0f);
+    return grad_op;  // no raw pointer exists, so a throwing setter cannot leak
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(increment, ops::IncrementOp, ops::IncrementOpMaker<float>,
+                  ops::IncrementGradOpMaker);
+REGISTER_OP_CPU_KERNEL(increment,
+                       ops::IncrementKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/increment_op.cu b/paddle/operators/increment_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..659c380d147a36650452bea23b30cbcf1ff516ee
--- /dev/null
+++ b/paddle/operators/increment_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/increment_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    increment,
+    paddle::operators::IncrementKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/increment_op.h b/paddle/operators/increment_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..342e254fc453555c70923efbca02fdfd014af015
--- /dev/null
+++ b/paddle/operators/increment_op.h
@@ -0,0 +1,40 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+// Element-wise kernel: Out = X + step, with step read from the "step" attr.
+template <typename Place, typename T, typename AttrType = T>
+class IncrementKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* in = context.Input<framework::Tensor>("X");
+    auto* out = context.Output<framework::Tensor>("Out");
+    // Out's buffer is requested on the same place that X reports.
+    out->mutable_data<T>(in->place());
+    auto step = static_cast<T>(context.Attr<AttrType>("step"));
+    auto eigen_in = framework::EigenVector<T>::Flatten(*in);
+    auto eigen_out = framework::EigenVector<T>::Flatten(*out);
+    auto& place = context.GetEigenDevice<Place>();
+    eigen_out.device(place) = eigen_in + step;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lookup_table_op.cc b/paddle/operators/lookup_table_op.cc
index 9b1314bfbade8551d98b0fbabb7c2968d7600db5..ad86a2e5bc23b2b0ea853971cf79dec745e9706a 100644
--- a/paddle/operators/lookup_table_op.cc
+++ b/paddle/operators/lookup_table_op.cc
@@ -21,8 +21,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("W"),
                    "Input(W) of LookupTableOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Ids"),
@@ -33,9 +32,18 @@ class LookupTableOp : public framework::OperatorWithKernel {
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
 
+    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+
     ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
     ctx->ShareLoD("Ids", /*->*/ "Out");
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("W")->type());
+  }
 };
 
 class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -48,7 +56,9 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
              " which is a learnable parameter.");
     AddInput("Ids",
              "An input with type int32 or int64"
-             "contains the ids to be looked up in W.");
+             " contains the ids to be looked up in W. "
+             "Ids must be a column vector with rank = 2. "
+             "The 2nd dimension size must be 1.");
     AddOutput("Out", "The lookup results, which have the same type with W.");
     AddComment(R"DOC(
 This operator is used to perform lookups on the parameter W,
@@ -64,11 +74,16 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     auto table_dims = ctx->GetInputDim("W");
     ctx->SetOutputDim(framework::GradVarName("W"), table_dims);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("W")->type());
+  }
 };
 
 }  // namespace operators
diff --git a/paddle/operators/lookup_table_op.cu b/paddle/operators/lookup_table_op.cu
index 62f63b4f3c876e084e2468001e8bcb9310d16a82..c3808fa9a8de031fcae3ac0417e8c4330b2f5aad 100644
--- a/paddle/operators/lookup_table_op.cu
+++ b/paddle/operators/lookup_table_op.cu
@@ -61,7 +61,7 @@ __global__ void LookupTableGrad(T* table, const T* output, const int32_t* ids,
 }
 
 template <typename T>
-class LookupTableCUDAKernel : public framework::OpKernel {
+class LookupTableCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto table_t = context.Input<Tensor>("W");
@@ -85,7 +85,7 @@ class LookupTableCUDAKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class LookupTableGradCUDAKernel : public framework::OpKernel {
+class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto ids_t = context.Input<Tensor>("Ids");
diff --git a/paddle/operators/lookup_table_op.h b/paddle/operators/lookup_table_op.h
index a1298906dd4b4209644fe06584f70169519de01c..dfead2fc5b25b9be26bb19cd74a3a94daf62cca6 100644
--- a/paddle/operators/lookup_table_op.h
+++ b/paddle/operators/lookup_table_op.h
@@ -23,7 +23,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename T>
-class LookupTableKernel : public framework::OpKernel {
+class LookupTableKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto table_t = context.Input<Tensor>("W");      // float tensor
@@ -44,7 +44,7 @@ class LookupTableKernel : public framework::OpKernel {
 };
 
 template <typename T>
-class LookupTableGradKernel : public framework::OpKernel {
+class LookupTableGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto ids_t = context.Input<Tensor>("Ids");
diff --git a/paddle/operators/lstm_op.cc b/paddle/operators/lstm_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0a089b7c2dc1e05224525bc4fe5399ec39036d01
--- /dev/null
+++ b/paddle/operators/lstm_op.cc
@@ -0,0 +1,226 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/lstm_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LSTMOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates inputs/outputs and their shapes, then sets output dims and LoD.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"),
+                   "Input(Input) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Weight"),
+                   "Input(Weight) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Bias"),
+                   "Input(Bias) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Hidden"),
+                   "Output(Hidden) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Cell"),
+                   "Output(Cell) of LSTM should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("BatchGate"),
+                   "Output(BatchGate) of LSTM should not be null.");
+
+    auto x_dims = ctx->GetInputDim("Input");
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(Input)'s rank must be 2.");
+    if (ctx->HasInput("H0")) {
+      PADDLE_ENFORCE(ctx->HasInput("C0"),
+                     "Input(H0) and Input(C0) of LSTM should be "
+                     "provided at the same time.");
+      PADDLE_ENFORCE(ctx->GetInputDim("H0") == ctx->GetInputDim("C0"),
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+    }
+
+    int frame_size = x_dims[1] / 4;
+    auto w_dims = ctx->GetInputDim("Weight");
+    PADDLE_ENFORCE_EQ(w_dims.size(), 2,
+                      "The rank of Input(Weight) should be 2.");
+    PADDLE_ENFORCE_EQ(w_dims[0], frame_size,
+                      "The first dimension of Input(Weight) should be %d.",
+                      frame_size);
+    PADDLE_ENFORCE_EQ(w_dims[1], 4 * frame_size,
+                      "The second dimension of Input(Weight) should be 4 * %d.",
+                      frame_size);
+    auto b_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(b_dims.size(), 2, "The rank of Input(Bias) should be 2.");
+    PADDLE_ENFORCE_EQ(b_dims[0], 1,
+                      "The first dimension of Input(Bias) should be 1.");
+    if (ctx->Attrs().Get<bool>("usePeepholes")) {
+      PADDLE_ENFORCE_EQ(b_dims[1], 7 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "7 * %d if enable peepholes connection",
+                        frame_size);
+    } else {
+      PADDLE_ENFORCE_EQ(b_dims[1], 4 * frame_size,
+                        "The second dimension of Input(Bias) should be "
+                        "4 * %d if disable peepholes connection",
+                        frame_size);
+    }
+    ctx->SetOutputDim("Hidden", {x_dims[0], frame_size});
+    ctx->SetOutputDim("Cell", {x_dims[0], frame_size});
+    ctx->SetOutputDim("BatchGate", x_dims);
+    ctx->ShareLoD("Input", "Hidden");
+    ctx->ShareLoD("Input", "Cell");
+  }
+};
+
+// Declares the inputs, outputs, attributes and documentation of the lstm op.
+class LSTMOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LSTMOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input",
+             "(LoDTensor) the first input is a LoDTensor, which supports "
+             "variable-time length input sequence. The underlying tensor in "
+             "this LoDTensor is a matrix with shape (T X 4D), where, T is the "
+             "total time steps in this mini-batch, D is the hidden size.");
+    AddInput("H0",
+             "(Tensor, optional) the initial hidden state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size, D is the hidden size.");
+    AddInput("C0",
+             "(Tensor, optional) the initial cell state is an optional "
+             "input. This is a tensor with shape (N x D), where N is the "
+             "batch size. `H0` and `C0` can be NULL but only at the same time");
+    AddInput("Weight",
+             "(Tensor) the learnable hidden-hidden weights."
+             " - The shape is (D x 4D), where D is the hidden size. "
+             " - Weight = {W_ch, W_ih, W_fh, W_oh}");
+    AddInput("Bias",
+             "(Tensor) the learnable weights, which contains two parts: "
+             "input-hidden bias weight and peephole connections weight if "
+             "setting `usePeepholes` True. "
+             "1. `usePeepholes = False` "
+             " - The shape is (1 x 4D). "
+             " - Bias = {b_c, b_i, b_f, b_o}. "
+             "2. `usePeepholes = True` "
+             " - The shape is (1 x 7D). "
+             " - Bias = {b_c, b_i, b_f, b_o, W_ic, W_fc, W_oc}.");
+    AddOutput("BatchGate",
+              "(LoDTensor) This LoDTensor contains input gate, forget gate "
+              "and output gate after the nonlinear computation. This "
+              "LoDTensor has the same shape with the reorganized input, which "
+              "is also called batch input. The LoD size is 2. The first "
+              "LoD is the batch offsets and the second LoD contains the "
+              "indexes, which denote the position of reorganized sequence "
+              "in the raw input.")
+        .AsIntermediate();
+    AddOutput("Hidden",
+              "(LoDTensor) the hidden state lod tensor of LSTM operator. "
+              "The shape and lod is the same with the `Input`.");
+    AddOutput("Cell",
+              "(LoDTensor) the cell state lod tensor of LSTM operator. "
+              "The shape and lod is the same with the `Input`.");
+    AddAttr<bool>("usePeepholes",
+                  "(bool, default: True) "
+                  "whether to enable diagonal/peephole connections.")
+        .SetDefault(true);
+    AddAttr<bool>("isReverse",
+                  "(bool, default: False) whether to compute reversed LSTM.")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "gateActivation",
+        "(string, default: sigmoid) "
+        "The activation for input gate, forget gate and output "
+        "gate, `sigmoid` by default.")
+        .SetDefault("sigmoid");
+    AddAttr<std::string>("cellActivation",
+                         "(string, default: tanh) "
+                         "The activation for cell output, `tanh` by default.")
+        .SetDefault("tanh");
+    AddAttr<std::string>("candidateActivation",
+                         "(string, default: tanh) "
+                         "The activation for candidate hidden state, "
+                         "`tanh` by default.")
+        .SetDefault("tanh");
+    AddComment(R"DOC(Long-Short Term Memory (LSTM) Operator
+
+The default implementation is diagonal/peephole connection [1], the formula is
+as follows
+
+    i_t = \sigma(W_{ix}x_{t} + W_{ih}h_{t-1} + W_{ic}c_{t-1} + b_i)
+
+    f_t = \sigma(W_{fx}x_{t} + W_{fh}h_{t-1} + W_{fc}c_{t-1} + b_f)
+
+    \tilde{c_t} = act_g(W_{cx}x_t + W_{ch}h_{t-1} + b_c)
+
+    o_t = \sigma(W_{ox}x_{t} + W_{oh}h_{t-1} + W_{oc}c_t + b_o)
+
+    c_t = f_t ⊙ c_{t-1} + i_t ⊙ \tilde{c_t}
+
+    h_t = o_t ⊙ act_h(c_t)
+
+where the W terms denote weight matrices (e.g. \f$W_{ix}\f$ is the matrix
+of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
+are diagonal weight matrices for peephole connections. In our implementation,
+we use vectors to represent these diagonal weight matrices. The b terms
+denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
+is the non-linear activation, such as logistic sigmoid function, and
+\f$i, f, o\f$ and \f$c\f$ are respectively the input gate, forget gate,
+output gate and cell activation vectors, all of which are the same size as
+the cell output activation vector \f$h\f$.
+
+The ⊙ is the element-wise product of the vectors, \f$act_g\f$ and \f$act_h\f$
+are the cell input and cell output activation functions, `tanh` is usually
+used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
+which is computed based on the current input and the previous hidden state.
+
+Set `usePeepholes` False to disable peephole connection [2]. The formula
+is omitted here.
+
+@note These \f$W_{ix}x_{t}, W_{fx}x_{t}, W_{cx}x_{t}, W_{ox}x_{t}\f$
+operations on the input x_{t} were NOT included in this operator.
+Users can choose to use fully-connected operator before LSTM operator.
+
+[1] Hasim Sak, Andrew Senior, and Francoise Beaufays. Long short-term memory
+recurrent neural network architectures for large scale acoustic modeling.
+INTERSPEECH, 2014.
+
+[2] S. Hochreiter and J. Schmidhuber. Long Short-Term Memory.
+Neural Computation, 9(8):1735-1780, 1997.
+
+)DOC");
+  }
+};
+
+class LSTMGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  // Requires Hidden@GRAD and Cell@GRAD; Weight@GRAD/Bias@GRAD mirror the inputs.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
+                   "Input(Hidden@GRAD) should not be null");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cell")),
+                   "Input(Cell@GRAD) should not be null");
+    ctx->SetOutputDim(framework::GradVarName("Weight"),
+                      ctx->GetInputDim("Weight"));
+    ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(lstm, ops::LSTMOp, ops::LSTMOpMaker, lstm_grad, ops::LSTMGradOp);
+REGISTER_OP_CPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::CPUPlace, float>,
+                       ops::LSTMKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(lstm_grad,
+                       ops::LSTMGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::LSTMGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/lstm_op.cu b/paddle/operators/lstm_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9ad56941553bf19a56c25f41f76fe20dfa3a106f
--- /dev/null
+++ b/paddle/operators/lstm_op.cu
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/lstm_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(lstm, ops::LSTMKernel<paddle::platform::GPUPlace, float>,
+                       ops::LSTMKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(lstm_grad,
+                       ops::LSTMGradKernel<paddle::platform::GPUPlace, float>,
+                       ops::LSTMGradKernel<paddle::platform::GPUPlace, double>);
diff --git a/paddle/operators/lstm_op.h b/paddle/operators/lstm_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0af5694c48fcb4437e3acd422606de013bb2e145
--- /dev/null
+++ b/paddle/operators/lstm_op.h
@@ -0,0 +1,139 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::LoDTensor;
+using framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+// Forward LSTM kernel; fills the Hidden, Cell and BatchGate outputs.
+template <typename Place, typename T>
+class LSTMKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* input = ctx.Input<framework::LoDTensor>("Input");
+    auto* weight = ctx.Input<framework::Tensor>("Weight");
+    auto* bias = ctx.Input<framework::Tensor>("Bias");
+
+    auto* batch_gate = ctx.Output<framework::LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(ctx.GetPlace());
+    auto* hidden_out = ctx.Output<framework::LoDTensor>("Hidden");
+    hidden_out->mutable_data<T>(ctx.GetPlace());
+    auto* cell_out = ctx.Output<framework::LoDTensor>("Cell");
+    cell_out->mutable_data<T>(ctx.GetPlace());
+
+    // Now the function ShareLoD in InferShape is not implemented.
+    // So copy LoD here.
+    ctx.ShareLoD("Input", "Hidden");
+    ctx.ShareLoD("Input", "Cell");
+    // Copy Input into batch_gate in batch order (see LoDTensor2BatchFunctor).
+    bool is_reverse = ctx.Attr<bool>("isReverse");
+    math::LoDTensor2BatchFunctor<Place, T> to_batch;
+    to_batch(ctx.device_context(), *input, *batch_gate, is_reverse);
+
+    auto in_dims = input->dims();
+    int frame_size = static_cast<int>(in_dims[1] / 4);
+    framework::DDim dims({in_dims[0], frame_size});
+    // Add the first 4 * frame_size bias entries to every row of batch_gate.
+    if (bias) {
+      Eigen::array<int, 2> extents({{1, 4 * frame_size}});
+      Eigen::array<int, 2> offsets({{0, 0}});
+      auto b = EigenMatrix<T>::From(*bias);
+      auto gate = EigenMatrix<T>::From(*batch_gate);
+      gate.device(ctx.GetEigenDevice<Place>()) =
+          gate +
+          b.slice(offsets, extents)
+              .reshape(Eigen::array<int, 2>({{1, frame_size * 4}}))
+              .broadcast(
+                  Eigen::array<int, 2>({{static_cast<int>(in_dims[0]), 1}}));
+    }
+    // NOTE(review): bias is unguarded here; offsets past 4D assume a 7D bias.
+    math::LstmMetaValue<T> lstm_value;
+    T* bias_data = const_cast<T*>(bias->data<T>());
+    // the code style in LstmMetaValue will be updated later.
+    lstm_value.checkIg = bias_data + 4 * frame_size;
+    lstm_value.checkFg = lstm_value.checkIg + frame_size;
+    lstm_value.checkOg = lstm_value.checkFg + frame_size;
+    lstm_value.prevStateValue = nullptr;
+    // Batch-ordered work tensors of shape (in_dims[0], frame_size).
+    framework::LoDTensor batch_out, batch_cell, batch_cell_pre_act;
+    batch_out.mutable_data<T>(dims, ctx.GetPlace());
+    batch_cell.mutable_data<T>(dims, ctx.GetPlace());
+    batch_cell_pre_act.mutable_data<T>(dims, ctx.GetPlace());
+
+    auto batch_starts = batch_gate->lod()[0];
+    size_t num_batch = batch_starts.size() - 1;
+    auto gate_act = ctx.Attr<std::string>("gateActivation");
+    auto cell_act = ctx.Attr<std::string>("cellActivation");
+    auto cand_act = ctx.Attr<std::string>("candidateActivation");
+    // Each interval of batch_gate's LoD level 0 is processed as one step.
+    for (size_t n = 0; n < num_batch; n++) {
+      int bstart = static_cast<int>(batch_starts[n]);
+      int bend = static_cast<int>(batch_starts[n + 1]);
+
+      Tensor gate_t = batch_gate->Slice(bstart, bend);
+      Tensor out_t = batch_out.Slice(bstart, bend);
+      Tensor cell_t = batch_cell.Slice(bstart, bend);
+      Tensor cell_pre_act_t = batch_cell_pre_act.Slice(bstart, bend);
+
+      int cur_batch_size = bend - bstart;
+      // NOTE(review): presumably gate_t += pre_hidden_t * Weight - confirm.
+      if (n != 0) {
+        int pre_h_start = static_cast<int>(batch_starts[n - 1]);
+        int pre_h_end = pre_h_start + cur_batch_size;
+        auto pre_hidden_t = batch_out.Slice(pre_h_start, pre_h_end);
+        math::matmul<Place, T>(ctx.device_context(), pre_hidden_t, false,
+                               *weight, false, static_cast<T>(1.0), &gate_t,
+                               static_cast<T>(1.0));
+      }
+      // else if : FIXME support the initial hidden and cell
+
+      lstm_value.gateValue = gate_t.data<T>();
+      lstm_value.outputValue = out_t.data<T>();
+      lstm_value.stateValue = cell_t.data<T>();
+      lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
+      math::LstmUnitFunctor<Place, T>::compute(ctx.device_context(), lstm_value,
+                                               frame_size, cur_batch_size,
+                                               gate_act, cell_act, cand_act);
+      lstm_value.prevStateValue = lstm_value.stateValue;
+    }
+
+    math::Batch2LoDTensorFunctor<Place, T> to_seq;
+    batch_out.set_lod(batch_gate->lod());
+    // restore the output hidden in LoDTensor from the batch hidden
+    to_seq(ctx.device_context(), batch_out, *hidden_out);
+
+    batch_cell.set_lod(batch_gate->lod());
+    // restore the output cell state in LoDTensor from the batch cell
+    to_seq(ctx.device_context(), batch_cell, *cell_out);
+  }
+};
+// Registered for lstm_grad, but Compute is empty: no gradients are computed.
+template <typename Place, typename T>
+class LSTMGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/lstm_unit_op.cc b/paddle/operators/lstm_unit_op.cc
index bd75b001cb87d914f6c56ea35dcb5013d68145b2..5d63017208a55ec4bcc2e8d66f1ca2e1b84d4593 100644
--- a/paddle/operators/lstm_unit_op.cc
+++ b/paddle/operators/lstm_unit_op.cc
@@ -21,8 +21,7 @@ class LstmUnitOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of LSTM should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("C_prev"),
                    "Input(C_prev) of LSTM should not be null.");
@@ -47,7 +46,6 @@ class LstmUnitOp : public framework::OperatorWithKernel {
   }
 };
 
-template <typename AttrType>
 class LstmUnitOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   LstmUnitOpMaker(framework::OpProto* proto,
@@ -68,7 +66,7 @@ Equation:
   H = C * sigm(o)
 
 )DOC");
-    AddAttr<AttrType>("forget_bias", "The forget bias of Lstm Unit.")
+    AddAttr<float>("forget_bias", "The forget bias of Lstm Unit.")
         .SetDefault(0.0);
   }
 };
@@ -77,8 +75,7 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("C")),
                    "Input(C@GRAD) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("H")),
@@ -93,9 +90,11 @@ class LstmUnitGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker<float>,
-            lstm_unit_grad, ops::LstmUnitGradOp);
+REGISTER_OP(lstm_unit, ops::LstmUnitOp, ops::LstmUnitOpMaker, lstm_unit_grad,
+            ops::LstmUnitGradOp);
 REGISTER_OP_CPU_KERNEL(lstm_unit,
-                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>);
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, float>,
+                       ops::LstmUnitKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>);
+    lstm_unit_grad, ops::LstmUnitGradKernel<paddle::platform::CPUPlace, float>,
+    ops::LstmUnitGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/lstm_unit_op.cu b/paddle/operators/lstm_unit_op.cu
index 6e5e4978994c281416a65af5f8ffdec688768d63..49ea550b6f49a13bf31d14321d7a9eb13a834d4b 100644
--- a/paddle/operators/lstm_unit_op.cu
+++ b/paddle/operators/lstm_unit_op.cu
@@ -89,8 +89,8 @@ __global__ void LSTMUnitGradientKernel(const int nthreads, const int dim,
   }
 }
 
-template <typename T, typename AttrType = T>
-class LstmUnitOpCUDAKernel : public framework::OpKernel {
+template <typename T>
+class LstmUnitOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -101,7 +101,7 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel {
     auto* c_tensor = ctx.Output<framework::Tensor>("C");
     auto* h_tensor = ctx.Output<framework::Tensor>("H");
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     int b_size = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
@@ -120,8 +120,8 @@ class LstmUnitOpCUDAKernel : public framework::OpKernel {
   }
 };
 
-template <typename T, typename AttrType = T>
-class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
+template <typename T>
+class LstmUnitGradOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
@@ -153,7 +153,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
     int N = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     int block = 512;
     int n = N * D;
@@ -169,5 +169,7 @@ class LstmUnitGradOpCUDAKernel : public framework::OpKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>);
-REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(lstm_unit, ops::LstmUnitOpCUDAKernel<float>,
+                       ops::LstmUnitOpCUDAKernel<double>);
+REGISTER_OP_GPU_KERNEL(lstm_unit_grad, ops::LstmUnitGradOpCUDAKernel<float>,
+                       ops::LstmUnitGradOpCUDAKernel<double>);
diff --git a/paddle/operators/lstm_unit_op.h b/paddle/operators/lstm_unit_op.h
index 683034fe15df8cabfdff5e856adb5c0467055064..625b1852c2f0eb2ed435f73fea251c40c614a7dd 100644
--- a/paddle/operators/lstm_unit_op.h
+++ b/paddle/operators/lstm_unit_op.h
@@ -19,7 +19,6 @@
 namespace paddle {
 namespace operators {
 
-using framework::LoDTensor;
 using framework::Tensor;
 
 template <typename T>
@@ -32,8 +31,8 @@ inline T tanh(T x) {
   return 2. * sigmoid(2. * x) - 1.;
 }
 
-template <typename Place, typename T, typename AttrType = T>
-class LstmUnitKernel : public framework::OpKernel {
+template <typename Place, typename T>
+class LstmUnitKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -44,7 +43,7 @@ class LstmUnitKernel : public framework::OpKernel {
     auto* c_tensor = ctx.Output<framework::Tensor>("C");
     auto* h_tensor = ctx.Output<framework::Tensor>("H");
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     int b_size = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
@@ -75,8 +74,8 @@ class LstmUnitKernel : public framework::OpKernel {
   }
 };
 
-template <typename Place, typename T, typename AttrType = T>
-class LstmUnitGradKernel : public framework::OpKernel {
+template <typename Place, typename T>
+class LstmUnitGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
@@ -108,7 +107,7 @@ class LstmUnitGradKernel : public framework::OpKernel {
     int N = c_tensor->dims()[0];
     int D = c_tensor->dims()[1];
 
-    auto forget_bias = static_cast<T>(ctx.Attr<AttrType>("forget_bias"));
+    auto forget_bias = static_cast<T>(ctx.Attr<float>("forget_bias"));
 
     for (int n = 0; n < N; ++n) {
       for (int d = 0; d < D; ++d) {
diff --git a/paddle/operators/margin_rank_loss_op.cc b/paddle/operators/margin_rank_loss_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..638a99addc2119e8f44648cc54b97bd8a892d2bc
--- /dev/null
+++ b/paddle/operators/margin_rank_loss_op.cc
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/margin_rank_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MarginRankLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    // Presence checks for the three inputs and the loss output.
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) shouldn't be null.");
+    auto label_dims = ctx->GetInputDim("Label");
+    auto x1_dims = ctx->GetInputDim("X1");
+    auto x2_dims = ctx->GetInputDim("X2");
+    PADDLE_ENFORCE(
+        (label_dims == x1_dims) && (x1_dims == x2_dims) &&
+            (label_dims.size() == 2) && (label_dims[1] == 1),
+        "All inputs must be 2-D tensor with shape [batch_size x 1].");
+    ctx->SetOutputDim("Activated", label_dims);  // NOTE: never checked above
+    ctx->SetOutputDim("Out", label_dims);
+  }
+};
+
+template <typename T>
+class MarginRankLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MarginRankLossOpMaker(framework::OpProto *proto,
+                        framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X1",
+             "(2-D tensor with shape [batch_size x 1]) The score for "
+             "one item X1 to be ranked, from pairwise ranking model.");
+    AddInput("X2",
+             "(2-D tensor with shape [batch_size x 1]) The score for "
+             "another item X2 to be ranked, from pairwise ranking model.");
+    AddInput("Label",
+             "(2-D tensor with shape [batch_size x 1]) "
+             "The label indicating X1 ranked higher than X2 or not, "
+             "can only be +1 or -1.");
+    AddAttr<T>("margin", "(scalar, default 0) Margin for MarginRankLossOp.")
+        .SetDefault(static_cast<T>(0));
+    AddOutput("Activated",
+              "(2-D tensor with shape [batch_size x 1]) Intermediate tensor "
+              "to indicate whether each element of Output(Out) is activated.")
+        .AsIntermediate();  // consumed by the gradient op as Input(Activated)
+    AddOutput("Out",
+              "(2-D tensor with shape [batch_size x 1]) "
+              "The output loss of MarginRankLoss operator.");
+    AddComment(R"DOC(
+
+MarginRankLoss operator measures the loss given a pair of training samples
+{`X1`, `X2`} and the `Label` with attribute `margin`, where `Label = +1`
+indicating X1 is ranked higher than `X2`, otherwise `Label = -1`. The loss
+turns out
+
+loss(X1, X2, Label) = max(0, -Label * (X1 - X2) + margin).
+
+The attribute `margin` involved here helps make the predictions more robust.
+Denote the item ranked higher as the positive sample, otherwise the negative
+sample. If the score of the two samples satisfies
+
+positive sample - negative sample < margin,
+
+the pair of samples will contribute to the final loss, which will backpropagate
+and train the ranking model to enlarge the difference of the two scores.
+
+For batch input with size `batch_size`, `X1`, `X2` and `Label`
+all have the same shape [batch_size x 1].
+
+)DOC");
+  }
+};
+
+class MarginRankLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X1"), "Input(X1) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X2"), "Input(X2) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) shouldn't be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Activated"),
+                   "Intermediate(Activated) shouldn't be null.");
+    auto dims = ctx->GetInputDim("Label");  // both gradients take Label's dims
+    ctx->SetOutputDim(framework::GradVarName("X1"), dims);
+    ctx->SetOutputDim(framework::GradVarName("X2"), dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+// The attribute type and both CPU kernels are instantiated for float only.
+REGISTER_OP(margin_rank_loss, ops::MarginRankLossOp,
+            ops::MarginRankLossOpMaker<float>, margin_rank_loss_grad,
+            ops::MarginRankLossGradOp);
+REGISTER_OP_CPU_KERNEL(
+    margin_rank_loss,
+    ops::MarginRankLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    margin_rank_loss_grad,
+    ops::MarginRankLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/margin_rank_loss_op.cu b/paddle/operators/margin_rank_loss_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3a639f25d478a712c1030d57c57d7e55de1488b5
--- /dev/null
+++ b/paddle/operators/margin_rank_loss_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/margin_rank_loss_op.h"
+
+namespace ops = paddle::operators;
+// Forward and backward GPU kernels are instantiated for float only.
+REGISTER_OP_GPU_KERNEL(
+    margin_rank_loss,
+    ops::MarginRankLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    margin_rank_loss_grad,
+    ops::MarginRankLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/margin_rank_loss_op.h b/paddle/operators/margin_rank_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d0830147ecc465909e8988e90125929829f6f34
--- /dev/null
+++ b/paddle/operators/margin_rank_loss_op.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Rectifier functor for Eigen unaryExpr: returns the argument when it is
+// greater than zero and T(0) otherwise.
+template <typename T>
+struct ReLU {
+  HOSTDEVICE T operator()(const T& x) const { return x > 0 ? x : T(0); }
+};
+
+// Step functor for Eigen unaryExpr: returns T(1) when the argument is
+// greater than zero and T(0) otherwise.
+template <typename T>
+struct Heaviside {
+  HOSTDEVICE T operator()(const T& x) const { return x > 0 ? T(1) : T(0); }
+};
+
+template <typename Place, typename T>
+class MarginRankLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out_t = ctx.Output<framework::Tensor>("Out");
+    auto* act_t = ctx.Output<framework::Tensor>("Activated");
+
+    auto* label_t = ctx.Input<framework::Tensor>("Label");
+    auto* x1_t = ctx.Input<framework::Tensor>("X1");
+    auto* x2_t = ctx.Input<framework::Tensor>("X2");
+    // Request typed output storage on the kernel's place (mutable_data<T>).
+    out_t->mutable_data<T>(ctx.GetPlace());
+    act_t->mutable_data<T>(ctx.GetPlace());
+
+    const T margin = ctx.Attr<T>("margin");
+    auto out = framework::EigenVector<T>::Flatten(*out_t);
+    auto act = framework::EigenVector<T>::Flatten(*act_t);
+
+    auto label = framework::EigenVector<T>::Flatten(*label_t);
+    auto x1 = framework::EigenVector<T>::Flatten(*x1_t);
+    auto x2 = framework::EigenVector<T>::Flatten(*x2_t);
+    // Out = max(0, -Label * (X1 - X2) + margin); Activated = 1 where Out > 0.
+    auto& dev = ctx.GetEigenDevice<Place>();
+    out.device(dev) = (-label * (x1 - x2) + margin).unaryExpr(ReLU<T>());
+    act.device(dev) = out.unaryExpr(Heaviside<T>());
+  }
+};
+
+template <typename Place, typename T>
+class MarginRankLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* d_x1_t =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X1"));
+    auto* d_x2_t =
+        ctx.Output<framework::LoDTensor>(framework::GradVarName("X2"));
+
+    auto* act_t = ctx.Input<framework::Tensor>("Activated");
+    auto* d_out_t = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* label_t = ctx.Input<framework::Tensor>("Label");
+
+    auto d_out = framework::EigenVector<T>::Flatten(*d_out_t);
+    auto act = framework::EigenVector<T>::Flatten(*act_t);
+    auto label = framework::EigenVector<T>::Flatten(*label_t);
+    auto& dev = ctx.GetEigenDevice<Place>();
+
+    // dX1 = -dOut * Activated * Label; skipped when the output pointer is null.
+    if (d_x1_t) {
+      d_x1_t->mutable_data<T>(ctx.GetPlace());
+      auto d_x1 = framework::EigenVector<T>::Flatten(*d_x1_t);
+      d_x1.device(dev) = -d_out * act * label;
+    }
+    // dX2 = dOut * Activated * Label; skipped when the output pointer is null.
+    if (d_x2_t) {
+      d_x2_t->mutable_data<T>(ctx.GetPlace());
+      auto d_x2 = framework::EigenVector<T>::Flatten(*d_x2_t);
+      d_x2.device(dev) = d_out * act * label;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 91ae3d49f1df51d9524547f7765285bff9dbb5c5..5598669ef96535b7d47150052b3841771c37c60b 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -1,16 +1,28 @@
+add_subdirectory(detail)
+
 if(WITH_GPU)
-    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc
-      im2col.cu DEPS cblas device_context operator)
-    nv_library(softmax_function SRCS softmax.cc softmax.cu
-      DEPS operator)
-    nv_library(cross_entropy_function SRCS cross_entropy.cc cross_entropy.cu
-      DEPS operator)
+    nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context operator)
+    nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
+    nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
+    nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
+    nv_library(softmax SRCS softmax.cc softmax.cu DEPS operator)
+    nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS operator)
+    nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
+    nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context)
+    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
+    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
 else()
-    cc_library(math_function SRCS math_function.cc im2col.cc
-      DEPS cblas device_context operator)
-    cc_library(softmax_function SRCS softmax.cc DEPS operator)
-    cc_library(cross_entropy_function SRCS cross_entropy.cc DEPS operator)
+    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context operator)
+    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
+    cc_library(softmax SRCS softmax.cc DEPS operator)
+    cc_library(cross_entropy SRCS cross_entropy.cc DEPS operator)
+    cc_library(pooling SRCS pooling.cc DEPS device_context)
+    cc_library(vol2col SRCS vol2col.cc DEPS device_context)
+    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
+    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
 endif()
 
-nv_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
+cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
+cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
 cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
+cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
diff --git a/paddle/operators/math/cross_entropy.cc b/paddle/operators/math/cross_entropy.cc
index a5a426bc7b16852e67afd790df7a91d89a458c8a..150a65f2751aaeac17f9403404d2efd990a0c72b 100644
--- a/paddle/operators/math/cross_entropy.cc
+++ b/paddle/operators/math/cross_entropy.cc
@@ -26,8 +26,8 @@ using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 template <typename T>
 class CrossEntropyFunctor<platform::CPUPlace, T> {
  public:
-  void operator()(const framework::ExecutionContext& ctx,
-                  framework::Tensor* out, const framework::Tensor* prob,
+  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
+                  const framework::Tensor* prob,
                   const framework::Tensor* labels, const bool softLabel) {
     const int batch_size = prob->dims()[0];
     if (softLabel) {
@@ -35,7 +35,7 @@ class CrossEntropyFunctor<platform::CPUPlace, T> {
       auto lbl = EigenMatrix<T>::From(*labels);
       auto loss = EigenMatrix<T>::From(*out);
 
-      loss.device(ctx.GetEigenDevice<platform::CPUPlace>()) =
+      loss.device(*ctx.GetEigenDevice<platform::CPUPlace>()) =
           -((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
                 .sum(Eigen::DSizes<int, 1>(1))
                 .reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
diff --git a/paddle/operators/math/cross_entropy.cu b/paddle/operators/math/cross_entropy.cu
index d14a75a30c01deb86937a3ced43005aed4066d86..db878129d650d663e187ecabb106eea0e39db6fa 100644
--- a/paddle/operators/math/cross_entropy.cu
+++ b/paddle/operators/math/cross_entropy.cu
@@ -22,8 +22,6 @@ namespace {
 template <typename T>
 __global__ void CrossEntropyKernel(T* Y, const T* X, const int* label,
                                    const int N, const int D) {
-  // TOOD(qingqing) define CUDA_1D_KERNEL_LOOP macro in a common file.
-  // CUDA_1D_KERNEL_LOOP(i, N) {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
        i += blockDim.x * gridDim.x) {
     PADDLE_ASSERT(label[i] >= 0 && label[i] < D);
@@ -74,8 +72,8 @@ using Tensor = framework::Tensor;
 template <typename T>
 class CrossEntropyFunctor<platform::GPUPlace, T> {
  public:
-  void operator()(const framework::ExecutionContext& ctx,
-                  framework::Tensor* out, const framework::Tensor* prob,
+  void operator()(const platform::DeviceContext& ctx, framework::Tensor* out,
+                  const framework::Tensor* prob,
                   const framework::Tensor* labels, bool softLabel) {
     const T* prob_data = prob->data<T>();
     T* loss_data = out->mutable_data<T>(ctx.GetPlace());
@@ -87,20 +85,18 @@ class CrossEntropyFunctor<platform::GPUPlace, T> {
       const T* label_data = labels->data<T>();
       int block = class_num > 512 ? 512 : pow(2, int(std::log2(class_num)));
 
-      SoftCrossEntropyKernel<
-          T><<<batch_size, block, block * sizeof(T),
-               reinterpret_cast<const platform::CUDADeviceContext&>(
-                   ctx.device_context())
-                   .stream()>>>(loss_data, prob_data, label_data, class_num);
+      SoftCrossEntropyKernel<T><<<
+          batch_size, block, block * sizeof(T),
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+          loss_data, prob_data, label_data, class_num);
     } else {
       const int* label_data = labels->data<int>();
       int block = 512;
       int grid = (batch_size + block - 1) / block;
       CrossEntropyKernel<T><<<
-          grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
-                              ctx.device_context())
-                              .stream()>>>(loss_data, prob_data, label_data,
-                                           batch_size, class_num);
+          grid, block, 0,
+          reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+          loss_data, prob_data, label_data, batch_size, class_num);
     }
   }
 };
diff --git a/paddle/operators/math/cross_entropy.h b/paddle/operators/math/cross_entropy.h
index 18e637cf9186b5dc21e94f1ab15b3d858ec93c67..0ab6827ffa8f8b90b432a801607a97206e010cf4 100644
--- a/paddle/operators/math/cross_entropy.h
+++ b/paddle/operators/math/cross_entropy.h
@@ -37,9 +37,7 @@ struct TolerableValue {
 template <typename Place, typename T>
 class CrossEntropyFunctor {
  public:
-  // (TODO caoying) it is much better to use DeviceContext as the first
-  // parameter.
-  void operator()(const framework::ExecutionContext& context,
+  void operator()(const platform::DeviceContext& context,
                   framework::Tensor* out, const framework::Tensor* prob,
                   const framework::Tensor* labels, const bool softLabel);
 };
diff --git a/paddle/operators/math/detail/CMakeLists.txt b/paddle/operators/math/detail/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..49cf228de2204cb4888cf645a0cb68ed04cc3371
--- /dev/null
+++ b/paddle/operators/math/detail/CMakeLists.txt
@@ -0,0 +1,5 @@
+set(activation_functions_srcs hl_cpu_functions.cc)
+if(WITH_AVX)  # the AVX implementations are compiled only when AVX is enabled
+    list(APPEND activation_functions_srcs hl_avx_functions.cc)
+endif()
+cc_library(activation_functions SRCS ${activation_functions_srcs})
diff --git a/paddle/operators/math/detail/hl_activation_functions.h b/paddle/operators/math/detail/hl_activation_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..9d7d9914f0090bff17049038dfa2288d84f3dbda
--- /dev/null
+++ b/paddle/operators/math/detail/hl_activation_functions.h
@@ -0,0 +1,188 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_ACTIVATION_FUNCTIONS_H_
+#define HL_ACTIVATION_FUNCTIONS_H_
+
+#include "hl_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+
+/**
+ * Activation table initializers, in index order: sigmoid, relu, tanh, linear.
+ */
+#define FLOAT_ACTIVE_FUNCTION                                   \
+  {                                                             \
+    hppl::typef::sigmoid, hppl::typef::relu, hppl::typef::tanh, \
+        hppl::typef::linear                                     \
+  }
+
+#define DOUBLE_ACTIVE_FUNCTION                                  \
+  {                                                             \
+    hppl::typed::sigmoid, hppl::typed::relu, hppl::typed::tanh, \
+        hppl::typed::linear                                     \
+  }
+
+#define AVX_ACTIVE_FUNCTION \
+  { hppl::sigmoid, hppl::relu, hppl::tanh, hppl::linear }
+
+namespace hppl {
+
+using activation_mode_t = paddle::operators::math::activation_mode_t;
+
+/**
+ * Pointer types for the hppl activations (sigmoid, relu, tanh, linear)
+ * over value type T: `forward` is T(T), `backward` is T(T, T).
+ */
+template <class T>
+class Active {
+ public:
+  typedef T (*forward)(T);
+  typedef T (*backward)(T, T);
+};
+
+template <typename T>
+struct ForwardActType;  // T -> Active<T>::forward; float and double only
+
+template <>
+struct ForwardActType<float> {
+  using type = Active<float>::forward;
+};
+
+template <>
+struct ForwardActType<double> {
+  using type = Active<double>::forward;
+};
+
+template <typename T>
+struct BackwardActType;  // T -> Active<T>::backward; float and double only
+
+template <>
+struct BackwardActType<float> {
+  using type = Active<float>::backward;
+};
+
+template <>
+struct BackwardActType<double> {
+  using type = Active<double>::backward;
+};
+
+#ifdef __NVCC__
+namespace gpu {
+static __device__ Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
+static __device__ Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
+// Both tables use one initializer; the names are presumably arity overloads.
+static __device__ Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
+static __device__ Active<double>::backward backward_d[] =
+    DOUBLE_ACTIVE_FUNCTION;
+// Forward-activation lookup by `type`; defined here for float and double.
+template <typename T>
+struct ForwardAct {
+  __device__ typename ForwardActType<T>::type operator()(
+      activation_mode_t type);
+};
+
+template <>
+struct ForwardAct<float> {
+  __device__ ForwardActType<float>::type operator()(activation_mode_t type) {
+    return forward[type];
+  }
+};
+
+template <>
+struct ForwardAct<double> {
+  __device__ ForwardActType<double>::type operator()(activation_mode_t type) {
+    return forward_d[type];
+  }
+};
+// Backward-activation lookup by `type`; defined here for float and double.
+template <typename T>
+struct BackwardAct {
+  __device__ typename BackwardActType<T>::type operator()(
+      activation_mode_t type);
+};
+
+template <>
+struct BackwardAct<float> {
+  __device__ BackwardActType<float>::type operator()(activation_mode_t type) {
+    return backward[type];
+  }
+};
+
+template <>
+struct BackwardAct<double> {
+  __device__ BackwardActType<double>::type operator()(activation_mode_t type) {
+    return backward_d[type];
+  }
+};
+
+}  // namespace gpu
+#else
+namespace cpu {
+static Active<float>::forward forward[] = FLOAT_ACTIVE_FUNCTION;
+static Active<float>::backward backward[] = FLOAT_ACTIVE_FUNCTION;
+// Both tables use one initializer; the names are presumably arity overloads.
+static Active<double>::forward forward_d[] = DOUBLE_ACTIVE_FUNCTION;
+static Active<double>::backward backward_d[] = DOUBLE_ACTIVE_FUNCTION;
+// Forward-activation lookup by `type`; defined here for float and double.
+template <typename T>
+struct ForwardAct {
+  typename ForwardActType<T>::type operator()(activation_mode_t type);
+};
+
+template <>
+struct ForwardAct<float> {
+  ForwardActType<float>::type operator()(activation_mode_t type) {
+    return forward[type];
+  }
+};
+
+template <>
+struct ForwardAct<double> {
+  ForwardActType<double>::type operator()(activation_mode_t type) {
+    return forward_d[type];
+  }
+};
+// Backward-activation lookup by `type`; defined here for float and double.
+template <typename T>
+struct BackwardAct {
+  typename BackwardActType<T>::type operator()(activation_mode_t type);
+};
+
+template <>
+struct BackwardAct<float> {
+  BackwardActType<float>::type operator()(activation_mode_t type) {
+    return backward[type];
+  }
+};
+
+template <>
+struct BackwardAct<double> {
+  BackwardActType<double>::type operator()(activation_mode_t type) {
+    return backward_d[type];
+  }
+};
+
+}  // namespace cpu
+
+#ifdef __AVX__
+namespace avx {  // __m256 tables; only built for non-NVCC, AVX-enabled TUs
+static Active<__m256>::forward forward[] = AVX_ACTIVE_FUNCTION;
+static Active<__m256>::backward backward[] = AVX_ACTIVE_FUNCTION;
+}  // namespace avx
+#endif  // __AVX__
+#endif
+
+}  // namespace hppl
+
+#endif  // HL_ACTIVATION_FUNCTIONS_H_
diff --git a/paddle/operators/math/detail/hl_avx_functions.cc b/paddle/operators/math/detail/hl_avx_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..415bac5d93ee00244d072b0998c6941b14d4f8d8
--- /dev/null
+++ b/paddle/operators/math/detail/hl_avx_functions.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <immintrin.h>
+#include "hl_functions.h"
+// TODO(qingqing) refine this dependence
+#include "paddle/cuda/src/avx_mathfun.h"
+
+namespace hppl {
+
+// Vector exponential: forwards to exp256_ps from avx_mathfun.h.
+__m256 exp(__m256 a) { return exp256_ps(a); }
+
+// Forward ReLU: lane-wise max(a, 0).
+__m256 relu(const __m256 a) {
+  const __m256 zero = _mm256_set1_ps(0.0f);
+  return _mm256_max_ps(a, zero);
+}
+
+// Forward sigmoid: 1 / (1 + exp(-x)), where x is a clamped to
+// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX].
+__m256 sigmoid(const __m256 a) {
+  const __m256 upper = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
+  const __m256 lower = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
+  const __m256 clamped = _mm256_min_ps(_mm256_max_ps(a, lower), upper);
+  const __m256 negated = _mm256_sub_ps(_mm256_set1_ps(0.0f), clamped);
+  const __m256 denom = _mm256_add_ps(_mm256_set1_ps(1.0f), exp(negated));
+  return _mm256_div_ps(_mm256_set1_ps(1.0f), denom);
+}
+
+// Forward tanh: 2 / (1 + exp(-2a)) - 1, with the exponent capped at
+// EXP_MAX_INPUT before exp is applied.
+__m256 tanh(const __m256 a) {
+  const __m256 limit = _mm256_set1_ps(EXP_MAX_INPUT);
+  const __m256 scaled = _mm256_mul_ps(_mm256_set1_ps(-2.0f), a);
+  const __m256 expo = exp(_mm256_min_ps(scaled, limit));
+  const __m256 ratio = _mm256_div_ps(
+      _mm256_set1_ps(2.0f), _mm256_add_ps(_mm256_set1_ps(1.0f), expo));
+  return _mm256_sub_ps(ratio, _mm256_set1_ps(1.0f));
+}
+
+// Forward identity.
+__m256 linear(const __m256 a) { return a; }
+
+// Backward ReLU: a where b > 0, otherwise a * 0. The comparison mask is
+// ANDed with 1.0f so each lane becomes either 1.0f or 0.0f.
+__m256 relu(const __m256 a, const __m256 b) {
+  const __m256 positive =
+      _mm256_cmp_ps(b, _mm256_set1_ps(0.0f), _CMP_GT_OS);
+  const __m256 gate = _mm256_and_ps(positive, _mm256_set1_ps(1.0f));
+  return _mm256_mul_ps(a, gate);
+}
+
+// Backward sigmoid: (a * b) * (1 - b).
+__m256 sigmoid(const __m256 a, const __m256 b) {
+  const __m256 product = _mm256_mul_ps(a, b);
+  const __m256 complement = _mm256_sub_ps(_mm256_set1_ps(1.0f), b);
+  return _mm256_mul_ps(product, complement);
+}
+
+// Backward tanh: a * (1 - b * b).
+__m256 tanh(const __m256 a, const __m256 b) {
+  const __m256 squared = _mm256_mul_ps(b, b);
+  const __m256 slope = _mm256_sub_ps(_mm256_set1_ps(1.0f), squared);
+  return _mm256_mul_ps(a, slope);
+}
+
+// Backward identity: returns a, ignoring b.
+__m256 linear(const __m256 a, const __m256 b) { return a; }
+}  // namespace hppl
diff --git a/paddle/operators/math/detail/hl_avx_functions.h b/paddle/operators/math/detail/hl_avx_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..35f4eabb4c07c6cc9d2edded02e5b6290b1232f8
--- /dev/null
+++ b/paddle/operators/math/detail/hl_avx_functions.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_AVX_FUNCTIONS_H_
+#define HL_AVX_FUNCTIONS_H_
+
+#include <immintrin.h>
+
+namespace hppl {
+// Forward activations on a vector of 8 packed floats; each returns the
+// lane-wise activation of a.
+__m256 relu(const __m256 a);
+__m256 sigmoid(const __m256 a);
+__m256 tanh(const __m256 a);
+__m256 linear(const __m256 a);
+
+// Backward (two-argument) overloads. Per the definitions in
+// hl_avx_functions.cc: relu -> a * (b > 0), sigmoid -> a * b * (1 - b),
+// tanh -> a * (1 - b * b), linear -> a.
+// NOTE(review): presumably a is the incoming gradient and b the forward
+// output -- confirm against the callers.
+__m256 relu(const __m256 a, const __m256 b);
+__m256 sigmoid(const __m256 a, const __m256 b);
+__m256 tanh(const __m256 a, const __m256 b);
+__m256 linear(const __m256 a, const __m256 b);
+}  // namespace hppl
+
+#endif  // HL_AVX_FUNCTIONS_H_
diff --git a/paddle/operators/math/detail/hl_cpu_functions.cc b/paddle/operators/math/detail/hl_cpu_functions.cc
new file mode 100644
index 0000000000000000000000000000000000000000..21ec78f9629af0e4673a56517d76ac6734f57db8
--- /dev/null
+++ b/paddle/operators/math/detail/hl_cpu_functions.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <math.h>
+#include "hl_functions.h"
+
+namespace hppl {
+namespace typef {
+
+// Forward ReLU: a when a > 0, otherwise 0.
+float relu(const float a) {
+  if (a > 0.0f) {
+    return a;
+  }
+  return 0.0f;
+}
+
+// Forward sigmoid: 1 / (1 + exp(-x)), x being a clamped to
+// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX].
+float sigmoid(const float a) {
+  const float lower = SIGMOID_THRESHOLD_MIN;
+  const float upper = SIGMOID_THRESHOLD_MAX;
+  float clamped = a;
+  if (a < lower) {
+    clamped = lower;
+  } else if (a > upper) {
+    clamped = upper;
+  }
+  return 1.0f / (1.0f + exp(-clamped));
+}
+
+// Forward tanh: 2 / (1 + exp(-2a)) - 1, with the exponent capped at
+// EXP_MAX_INPUT.
+float tanh(const float a) {
+  float scaled = -2.0 * a;
+  if (scaled > EXP_MAX_INPUT) {
+    scaled = EXP_MAX_INPUT;
+  }
+  return (2.0 / (1.0 + exp(scaled))) - 1.0;
+}
+
+// Forward identity.
+float linear(const float a) { return a; }
+
+// Backward ReLU: a scaled by 1 when b > 0 and by 0 otherwise.
+float relu(const float a, const float b) {
+  const double gate = (b > 0.0) ? 1.0 : 0.0;
+  return a * gate;
+}
+
+// Backward sigmoid: a * b * (1 - b).
+float sigmoid(const float a, const float b) { return a * b * (1.0f - b); }
+
+// Backward tanh: a * (1 - b * b).
+float tanh(const float a, const float b) { return a * (1.0f - b * b); }
+
+// Backward identity: returns a, ignoring b.
+float linear(const float a, const float b) { return a; }
+
+}  // namespace typef
+
+namespace typed {
+// Forward ReLU: a when a > 0, otherwise 0.
+double relu(const double a) {
+  if (a > 0.0) {
+    return a;
+  }
+  return 0.0;
+}
+
+// Forward sigmoid: 1 / (1 + exp(-x)), x being a clamped to
+// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX].
+double sigmoid(const double a) {
+  const double lower = SIGMOID_THRESHOLD_MIN;
+  const double upper = SIGMOID_THRESHOLD_MAX;
+  double clamped = a;
+  if (a < lower) {
+    clamped = lower;
+  } else if (a > upper) {
+    clamped = upper;
+  }
+  return 1.0 / (1.0 + exp(-clamped));
+}
+
+// Forward tanh: 2 / (1 + exp(-2a)) - 1, with the exponent capped at
+// EXP_MAX_INPUT.
+double tanh(const double a) {
+  double scaled = -2.0 * a;
+  if (scaled > EXP_MAX_INPUT) {
+    scaled = EXP_MAX_INPUT;
+  }
+  return (2.0 / (1.0 + exp(scaled))) - 1.0;
+}
+
+// Forward identity.
+double linear(const double a) { return a; }
+
+// Backward ReLU: a scaled by 1 when b > 0 and by 0 otherwise.
+double relu(const double a, const double b) {
+  const double gate = (b > 0.0) ? 1.0 : 0.0;
+  return a * gate;
+}
+
+// Backward sigmoid: a * b * (1 - b).
+double sigmoid(const double a, const double b) { return a * b * (1.0 - b); }
+
+// Backward tanh: a * (1 - b * b).
+double tanh(const double a, const double b) { return a * (1.0 - b * b); }
+
+// Backward identity: returns a, ignoring b.
+double linear(const double a, const double b) { return a; }
+
+}  // namespace typed
+}  // namespace hppl
diff --git a/paddle/operators/math/detail/hl_functions.h b/paddle/operators/math/detail/hl_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..3e2f0c9ee6d3ae2ed598c4d5f09b85b7d61fdd51
--- /dev/null
+++ b/paddle/operators/math/detail/hl_functions.h
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_FUNCTIONS_H_
+#define HL_FUNCTIONS_H_
+
+/**
+ * sigmoid threshold minimum: sigmoid inputs below this value are clamped
+ * up to it before exp is evaluated.
+ */
+#define SIGMOID_THRESHOLD_MIN -40.0
+
+/**
+ * sigmoid threshold maximum: sigmoid inputs above this value are clamped
+ * down to it before exp is evaluated.
+ */
+#define SIGMOID_THRESHOLD_MAX 13.0
+
+/**
+ * The maximum input value for exp, used to avoid overflow problems.
+ * Currently only used by the tanh functions.
+ */
+#define EXP_MAX_INPUT 40.0
+
+#ifndef __NVCC__
+// Host (non-NVCC) declarations of the scalar activation functions.
+// One-argument overloads are the forward activations; two-argument overloads
+// are the backward forms (defined in hl_cpu_functions.cc as
+// relu -> a * (b > 0), sigmoid -> a * b * (1 - b), tanh -> a * (1 - b * b),
+// linear -> a).
+namespace hppl {
+// Single-precision variants.
+namespace typef {
+float relu(const float a);
+float sigmoid(const float a);
+float tanh(const float a);
+float linear(const float a);
+
+float relu(const float a, const float b);
+float sigmoid(const float a, const float b);
+float tanh(const float a, const float b);
+float linear(const float a, const float b);
+
+}  // namespace typef
+
+// Double-precision variants.
+namespace typed {
+double relu(const double a);
+double sigmoid(const double a);
+double tanh(const double a);
+double linear(const double a);
+
+double relu(const double a, const double b);
+double sigmoid(const double a, const double b);
+double tanh(const double a, const double b);
+double linear(const double a, const double b);
+}  // namespace typed
+
+}  // namespace hppl
+
+#ifdef __AVX__
+#include "hl_avx_functions.h"
+#endif
+
+#else
+#include "hl_gpu_functions.h"
+#endif
+
+#endif  // HL_FUNCTIONS_H_
diff --git a/paddle/operators/math/detail/hl_gpu_functions.h b/paddle/operators/math/detail/hl_gpu_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..72f2204e7b2cfdba1367b51e3731dde11fb292d6
--- /dev/null
+++ b/paddle/operators/math/detail/hl_gpu_functions.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef HL_GPU_FUNCTIONS_CUH_
+#define HL_GPU_FUNCTIONS_CUH_
+
+#include "hl_base.h"
+
+namespace hppl {
+// Single-precision device activations. One-argument overloads are the
+// forward functions; two-argument overloads are the backward forms.
+namespace typef {
+
+// Forward ReLU: a when a > 0, otherwise 0.
+__device__ static float relu(const float a) { return a > 0.0f ? a : 0.0f; }
+
+// Forward sigmoid: 1 / (1 + exp(-x)), x being a clamped to
+// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX].
+__device__ static float sigmoid(const float a) {
+  const float min = SIGMOID_THRESHOLD_MIN;
+  const float max = SIGMOID_THRESHOLD_MAX;
+  float tmp = (a < min) ? min : ((a > max) ? max : a);
+  return __fdividef(1.0f, 1.0f + __expf(-tmp));
+}
+
+// Forward tanh: 2 / (1 + exp(-2a)) - 1.
+__device__ static float tanh(const float a) {
+  // tmp already holds the exponent -2a, capped at EXP_MAX_INPUT so that exp
+  // cannot overflow. It must be passed to __expf as-is: scaling it by -2
+  // again would evaluate exp(4a) and return the wrong value.
+  float tmp = -2.0f * a;
+  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+  return __fdividef(2.0f, (1.0f + __expf(tmp))) - 1.0f;
+}
+
+// Forward identity.
+__device__ static float linear(const float a) { return a; }
+
+// Backward ReLU: a where b > 0, otherwise a * 0.
+__device__ static float relu(const float a, const float b) {
+  return a * (b > 0.0f ? 1.0f : 0.0f);
+}
+
+// Backward sigmoid: a * b * (1 - b).
+__device__ static float sigmoid(const float a, const float b) {
+  return a * b * (1.0f - b);
+}
+
+// Backward tanh: a * (1 - b * b).
+__device__ static float tanh(const float a, const float b) {
+  return a * (1.0f - b * b);
+}
+
+// Backward identity: returns a, ignoring b.
+__device__ static float linear(const float a, const float b) { return a; }
+
+}  // namespace typef
+
+// Double-precision device activations. One-argument overloads are the
+// forward functions; two-argument overloads are the backward forms.
+namespace typed {
+
+// Forward ReLU: a when a > 0, otherwise 0.
+__device__ static double relu(const double a) { return a > 0.0 ? a : 0.0; }
+
+// Forward sigmoid: 1 / (1 + exp(-x)), x being a clamped to
+// [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX].
+__device__ static double sigmoid(const double a) {
+  const double min = SIGMOID_THRESHOLD_MIN;
+  const double max = SIGMOID_THRESHOLD_MAX;
+  double tmp = (a < min) ? min : ((a > max) ? max : a);
+  return 1.0 / (1.0 + exp(-tmp));
+}
+
+// Forward tanh: 2 / (1 + exp(-2a)) - 1.
+__device__ static double tanh(const double a) {
+  // Use the capped exponent tmp (= min(-2a, EXP_MAX_INPUT)) so the overflow
+  // guard actually takes effect, matching the host implementation.
+  double tmp = -2.0 * a;
+  tmp = (tmp > EXP_MAX_INPUT) ? EXP_MAX_INPUT : tmp;
+  return (2.0 / (1.0 + exp(tmp))) - 1.0;
+}
+
+// Forward identity.
+__device__ static double linear(const double a) { return a; }
+
+// Backward ReLU: a where b > 0, otherwise a * 0.
+__device__ static double relu(const double a, const double b) {
+  return a * (b > 0.0 ? 1.0 : 0.0);
+}
+
+// Backward sigmoid: a * b * (1 - b).
+__device__ static double sigmoid(const double a, const double b) {
+  return a * b * (1 - b);
+}
+
+// Backward tanh: a * (1 - b * b).
+__device__ static double tanh(const double a, const double b) {
+  return a * (1.0 - b * b);
+}
+
+// Backward identity: returns a, ignoring b.
+__device__ static double linear(const double a, const double b) { return a; }
+
+}  // namespace typed
+
+}  // namespace hppl
+
+#endif  // HL_GPU_FUNCTIONS_CUH_
diff --git a/paddle/operators/math/detail/lstm_cpu_kernel.h b/paddle/operators/math/detail/lstm_cpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..74d51d7bc9b91f4c8088384d77183131f57aafab
--- /dev/null
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
@@ -0,0 +1,310 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/hl_activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+#ifndef __NVCC__
+
+// Scalar LSTM forward step over one frame.
+//
+// value.gateValue holds four consecutive runs of frameSize elements (read
+// here as In, Ig, Fg, Og). For every element the function loads the four
+// gate values, the three check* weights and -- when value.prevStateValue is
+// non-null -- the previous state, calls op with the forward activations
+// selected by active_node / active_gate / active_state, and writes back the
+// gate values, state, activated state and output.
+template <class T, class Op>
+void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
+                                     int frameSize,
+                                     activation_mode_t active_node,
+                                     activation_mode_t active_gate,
+                                     activation_mode_t active_state) {
+  T *gateIn = value.gateValue;
+  T *gateIg = value.gateValue + frameSize;
+  T *gateFg = value.gateValue + frameSize * 2;
+  T *gateOg = value.gateValue + frameSize * 3;
+
+  // Results produced by op for the current element.
+  T state;
+  T stateAtv;
+  T out;
+  // Stays 0 for the whole frame when no previous state is supplied.
+  T prevState = 0;
+
+  hppl::cpu::ForwardAct<T> pickAct;
+
+  for (int idx = 0; idx < frameSize; ++idx) {
+    T in = gateIn[idx];
+    T ig = gateIg[idx];
+    T fg = gateFg[idx];
+    T og = gateOg[idx];
+    T checkI = value.checkIg[idx];
+    T checkF = value.checkFg[idx];
+    T checkO = value.checkOg[idx];
+
+    if (value.prevStateValue) {
+      prevState = value.prevStateValue[idx];
+    }
+
+    op(in, ig, fg, og, prevState, state, stateAtv, out, checkI, checkF,
+       checkO, pickAct(active_node), pickAct(active_gate),
+       pickAct(active_state));
+
+    gateIn[idx] = in;
+    gateIg[idx] = ig;
+    gateFg[idx] = fg;
+    gateOg[idx] = og;
+    value.stateValue[idx] = state;
+    value.stateActiveValue[idx] = stateAtv;
+    value.outputValue[idx] = out;
+  }
+}
+
+// Scalar LSTM backward step over one frame.
+//
+// For every element it loads the gate values, check* weights, state,
+// activated state, output gradient and state gradient (plus the previous
+// state when value.prevStateValue is non-null), calls op with the backward
+// activations selected by active_node / active_gate / active_state, and then
+// stores the gate gradients and state gradient. prevStateGrad is written and
+// the check*Grad buffers are accumulated into only when the corresponding
+// pointer is non-null; checkIgGrad / checkFgGrad additionally require a
+// previous state.
+template <class T, class Op>
+void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
+                                      LstmMetaGrad<T> grad, int frameSize,
+                                      activation_mode_t active_node,
+                                      activation_mode_t active_gate,
+                                      activation_mode_t active_state) {
+  T *gateIn = value.gateValue;
+  T *gateIg = value.gateValue + frameSize;
+  T *gateFg = value.gateValue + frameSize * 2;
+  T *gateOg = value.gateValue + frameSize * 3;
+  T *dIn = grad.gateGrad;
+  T *dIg = grad.gateGrad + frameSize;
+  T *dFg = grad.gateGrad + frameSize * 2;
+  T *dOg = grad.gateGrad + frameSize * 3;
+
+  // Results produced by op for the current element.
+  T gIn;
+  T gIg;
+  T gFg;
+  T gOg;
+  T prevStateGrad;
+  T checkIGrad;
+  T checkFGrad;
+  T checkOGrad;
+  // Stays 0 for the whole frame when no previous state is supplied.
+  T prevState = 0;
+
+  hppl::cpu::BackwardAct<T> pickAct;
+
+  for (int idx = 0; idx < frameSize; ++idx) {
+    T in = gateIn[idx];
+    T ig = gateIg[idx];
+    T fg = gateFg[idx];
+    T og = gateOg[idx];
+    T checkI = value.checkIg[idx];
+    T checkF = value.checkFg[idx];
+    T checkO = value.checkOg[idx];
+    T state = value.stateValue[idx];
+    T stateAtv = value.stateActiveValue[idx];
+    T outputGrad = grad.outputGrad[idx];
+    T stateGrad = grad.stateGrad[idx];
+    if (value.prevStateValue) {
+      prevState = value.prevStateValue[idx];
+    }
+
+    op(in, ig, fg, og, gIn, gIg, gFg, gOg, prevState, prevStateGrad, state,
+       stateGrad, stateAtv, outputGrad, checkI, checkF, checkO, checkIGrad,
+       checkFGrad, checkOGrad, pickAct(active_node), pickAct(active_gate),
+       pickAct(active_state));
+
+    dIn[idx] = gIn;
+    dIg[idx] = gIg;
+    dFg[idx] = gFg;
+    dOg[idx] = gOg;
+    grad.stateGrad[idx] = stateGrad;
+
+    if (grad.prevStateGrad) {
+      grad.prevStateGrad[idx] = prevStateGrad;
+    }
+    if (value.prevStateValue) {
+      if (grad.checkIgGrad) {
+        grad.checkIgGrad[idx] += checkIGrad;
+      }
+      if (grad.checkFgGrad) {
+        grad.checkFgGrad[idx] += checkFGrad;
+      }
+    }
+    if (grad.checkOgGrad) {
+      grad.checkOgGrad[idx] += checkOGrad;
+    }
+  }
+}
+
+// AVX LSTM forward step over one frame, processing 8 floats per iteration.
+//
+// The body is compiled only when __AVX__ is defined; otherwise the function
+// is an empty stub. The buffers in `value` are reinterpreted as arrays of
+// __m256 and the loop runs frameSize / 8 times, so any remainder of
+// frameSize modulo 8 is not processed here (the dispatcher cpu_lstm_forward
+// only takes this path when frameSize is a multiple of 8).
+// NOTE(review): dereferencing the __m256* casts presumably requires the
+// underlying buffers to be 32-byte aligned -- confirm with the allocator.
+template <class T, class Op>
+void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
+                                   activation_mode_t active_node,
+                                   activation_mode_t active_gate,
+                                   activation_mode_t active_state) {
+#ifdef __AVX__
+  __m256 rValueIn;
+  __m256 rValueIg;
+  __m256 rValueFg;
+  __m256 rValueOg;
+  __m256 rCheckI;
+  __m256 rCheckF;
+  __m256 rCheckO;
+  __m256 rState;
+  // Remains all-zero when value.prevStateValue is null.
+  __m256 rPrevState = _mm256_set1_ps(0.0f);
+  __m256 rStateAtv;
+  __m256 rOut;
+
+  // Four gate sections stored back to back, frameSize elements apart.
+  __m256 *valueIn = (__m256 *)value.gateValue;
+  __m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
+  __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
+  __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueIn = valueIn[i];
+    rValueIg = valueIg[i];
+    rValueFg = valueFg[i];
+    rValueOg = valueOg[i];
+    rCheckI = ((__m256 *)value.checkIg)[i];
+    rCheckF = ((__m256 *)value.checkFg)[i];
+    rCheckO = ((__m256 *)value.checkOg)[i];
+
+    if (value.prevStateValue) {
+      rPrevState = ((__m256 *)value.prevStateValue)[i];
+    }
+
+    // Activation functions come straight from the hppl::avx::forward table.
+    op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
+       rOut, rCheckI, rCheckF, rCheckO, hppl::avx::forward[active_node],
+       hppl::avx::forward[active_gate], hppl::avx::forward[active_state]);
+
+    valueIn[i] = rValueIn;
+    valueIg[i] = rValueIg;
+    valueFg[i] = rValueFg;
+    valueOg[i] = rValueOg;
+    ((__m256 *)value.stateValue)[i] = rState;
+    ((__m256 *)value.stateActiveValue)[i] = rStateAtv;
+    ((__m256 *)value.outputValue)[i] = rOut;
+  }
+#endif
+}
+
+// AVX LSTM backward step over one frame, processing 8 floats per iteration.
+//
+// The body is compiled only when __AVX__ is defined; otherwise the function
+// is an empty stub. The buffers in `value` and `grad` are reinterpreted as
+// arrays of __m256 and the loop runs frameSize / 8 times. prevStateGrad is
+// written and the check*Grad buffers are accumulated into only when the
+// corresponding pointer is non-null; checkIgGrad / checkFgGrad additionally
+// require value.prevStateValue to be non-null.
+// NOTE(review): dereferencing the __m256* casts presumably requires the
+// underlying buffers to be 32-byte aligned -- confirm with the allocator.
+template <class T, class Op>
+void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
+                                    LstmMetaGrad<T> grad, int frameSize,
+                                    activation_mode_t active_node,
+                                    activation_mode_t active_gate,
+                                    activation_mode_t active_state) {
+#ifdef __AVX__
+  __m256 rValueIn;
+  __m256 rValueIg;
+  __m256 rValueFg;
+  __m256 rValueOg;
+  __m256 rGradIn;
+  __m256 rGradIg;
+  __m256 rGradFg;
+  __m256 rGradOg;
+  // Remains all-zero when value.prevStateValue is null.
+  __m256 rPrevState = _mm256_set1_ps(0.0f);
+  __m256 rPrevStateGrad;
+  __m256 rStateGrad;
+  __m256 rState;
+  __m256 rStateAtv;
+  __m256 rOutputGrad;
+  __m256 rCheckI;
+  __m256 rCheckF;
+  __m256 rCheckO;
+  __m256 rCheckIGrad;
+  __m256 rCheckFGrad;
+  __m256 rCheckOGrad;
+
+  // Gate values and gate gradients each hold four sections stored back to
+  // back, frameSize elements apart.
+  __m256 *valueIn = (__m256 *)value.gateValue;
+  __m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
+  __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
+  __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
+  __m256 *gradIn = (__m256 *)grad.gateGrad;
+  __m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize);
+  __m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2);
+  __m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3);
+
+  for (int i = 0; i < frameSize / 8; i++) {
+    rValueIn = valueIn[i];
+    rValueIg = valueIg[i];
+    rValueFg = valueFg[i];
+    rValueOg = valueOg[i];
+    rCheckI = ((__m256 *)value.checkIg)[i];
+    rCheckF = ((__m256 *)value.checkFg)[i];
+    rCheckO = ((__m256 *)value.checkOg)[i];
+    rState = ((__m256 *)value.stateValue)[i];
+    rStateAtv = ((__m256 *)value.stateActiveValue)[i];
+    rOutputGrad = ((__m256 *)grad.outputGrad)[i];
+    rStateGrad = ((__m256 *)grad.stateGrad)[i];
+    if (value.prevStateValue) {
+      rPrevState = ((__m256 *)value.prevStateValue)[i];
+    }
+
+    // Activation functions come straight from the hppl::avx::backward table.
+    op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
+       rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
+       rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
+       rCheckOGrad, hppl::avx::backward[active_node],
+       hppl::avx::backward[active_gate], hppl::avx::backward[active_state]);
+
+    gradIn[i] = rGradIn;
+    gradIg[i] = rGradIg;
+    gradFg[i] = rGradFg;
+    gradOg[i] = rGradOg;
+    ((__m256 *)grad.stateGrad)[i] = rStateGrad;
+
+    if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad;
+    if (value.prevStateValue) {
+      if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad;
+      if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad;
+    }
+    if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad;
+  }
+#endif
+}
+
+// CPU forward dispatcher: takes the AVX path only when Op advertises AVX
+// support, frameSize is a multiple of 8 and T is float; every other case
+// goes through the scalar implementation.
+template <class T, class Op>
+void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate,
+                      activation_mode_t active_state) {
+  const bool useAvx = Op::avx && ((frameSize & (8 - 1)) == 0) &&
+                      std::is_same<T, float>::value;
+  if (!useAvx) {
+    naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
+                                       active_gate, active_state);
+    return;
+  }
+  avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
+                                   active_gate, active_state);
+}
+
+// CPU backward dispatcher: takes the AVX path only when Op advertises AVX
+// support, frameSize is a multiple of 8 and T is float; every other case
+// goes through the scalar implementation.
+template <class T, class Op>
+void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                       int frameSize, activation_mode_t active_node,
+                       activation_mode_t active_gate,
+                       activation_mode_t active_state) {
+  const bool useAvx = Op::avx && ((frameSize & (8 - 1)) == 0) &&
+                      std::is_same<T, float>::value;
+  if (!useAvx) {
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize,
+                                        active_node, active_gate,
+                                        active_state);
+    return;
+  }
+  avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
+                                    active_gate, active_state);
+}
+
+#endif
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/lstm_gpu_kernel.h b/paddle/operators/math/detail/lstm_gpu_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..9573eaefb6a9d678ef70f2e2bffdc6a3011b21ea
--- /dev/null
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -0,0 +1,256 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <type_traits>
+#include "paddle/operators/math/detail/hl_activation_functions.h"
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/platform/cuda_helper.h"
+#include "paddle/platform/device_context.h"
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+/*
+ * LSTM forward kernel: one thread handles one frame element (and, when
+ * isBatch is true, one batch row).
+ *
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ *
+ * Threads whose frame index (or batch index) falls outside frameSize
+ * (batchSize) return immediately. In batch mode the per-row buffers in
+ * `value` are advanced to the thread's row before use; the check* weights
+ * are indexed by frame only.
+ */
+template <class T, class Op, bool isBatch>
+__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
+                              int batchSize, activation_mode_t active_node,
+                              activation_mode_t active_gate,
+                              activation_mode_t active_state) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    const int rowOffset = batchIdx * frameSize;
+    value.gateValue += rowOffset * 4;
+    value.outputValue += rowOffset;
+    value.stateValue += rowOffset;
+    value.stateActiveValue += rowOffset;
+  }
+
+  // Gate sections are laid out back to back, frameSize elements apart.
+  const int igIdx = frameIdx + frameSize;
+  const int fgIdx = frameIdx + frameSize * 2;
+  const int ogIdx = frameIdx + frameSize * 3;
+
+  T rCheckI = value.checkIg[frameIdx];
+  T rCheckF = value.checkFg[frameIdx];
+  T rCheckO = value.checkOg[frameIdx];
+
+  T rValueIn = value.gateValue[frameIdx];
+  T rValueIg = value.gateValue[igIdx];
+  T rValueFg = value.gateValue[fgIdx];
+  T rValueOg = value.gateValue[ogIdx];
+
+  // Stays 0 when no previous state is supplied.
+  T rPrevState = 0;
+  if (value.prevStateValue) {
+    if (isBatch) value.prevStateValue += batchIdx * frameSize;
+    rPrevState = value.prevStateValue[frameIdx];
+  }
+
+  // Filled in by op.
+  T rState;
+  T rStateAtv;
+  T rOut;
+
+  hppl::gpu::ForwardAct<T> pickAct;
+  op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
+     rOut, rCheckI, rCheckF, rCheckO, pickAct(active_node),
+     pickAct(active_gate), pickAct(active_state));
+
+  value.gateValue[frameIdx] = rValueIn;
+  value.gateValue[igIdx] = rValueIg;
+  value.gateValue[fgIdx] = rValueFg;
+  value.gateValue[ogIdx] = rValueOg;
+
+  value.stateValue[frameIdx] = rState;
+  value.stateActiveValue[frameIdx] = rStateAtv;
+  value.outputValue[frameIdx] = rOut;
+}
+
+/*
+ * LSTM backward kernel: one thread handles one frame element (and, when
+ * isBatch is true, one batch row).
+ *
+ * threads(framePerBlock, batchPerBlock)
+ * grid(frameBlocks, batchBlocks)
+ *
+ * Threads whose frame index (or batch index) falls outside frameSize
+ * (batchSize) return immediately. Gate and state gradients are written per
+ * row; the check*Grad buffers are indexed by frame only, so in batch mode
+ * they are accumulated with CudaAtomicAdd.
+ */
+template <class T, class Op, bool isBatch>
+__global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
+                               LstmMetaGrad<T> grad, int frameSize,
+                               int batchSize, activation_mode_t active_node,
+                               activation_mode_t active_gate,
+                               activation_mode_t active_state) {
+  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (frameIdx >= frameSize) return;
+
+  int batchIdx = 0;
+  if (isBatch) {
+    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (batchIdx >= batchSize) return;
+    // Advance the per-row buffers to this thread's batch row.
+    value.gateValue += batchIdx * frameSize * 4;
+    value.stateValue += batchIdx * frameSize;
+    value.stateActiveValue += batchIdx * frameSize;
+    grad.gateGrad += batchIdx * frameSize * 4;
+    grad.stateGrad += batchIdx * frameSize;
+    grad.outputGrad += batchIdx * frameSize;
+  }
+
+  T rValueIn;
+  T rValueIg;
+  T rValueFg;
+  T rValueOg;
+  T rGradIn;
+  T rGradIg;
+  T rGradFg;
+  T rGradOg;
+  // Stays 0 when value.prevStateValue is null.
+  T rPrevState = 0;
+  T rPrevStateGrad;
+  T rState;
+  T rStateGrad;
+  T rStateAtv;
+  T rOutputGrad;
+  T rCheckI = value.checkIg[frameIdx];
+  T rCheckF = value.checkFg[frameIdx];
+  T rCheckO = value.checkOg[frameIdx];
+  T rCheckIGrad;
+  T rCheckFGrad;
+  T rCheckOGrad;
+
+  // Gate sections are laid out back to back, frameSize elements apart.
+  rValueIn = value.gateValue[frameIdx];
+  rValueIg = value.gateValue[frameIdx + frameSize];
+  rValueFg = value.gateValue[frameIdx + frameSize * 2];
+  rValueOg = value.gateValue[frameIdx + frameSize * 3];
+  rState = value.stateValue[frameIdx];
+  rStateAtv = value.stateActiveValue[frameIdx];
+  rOutputGrad = grad.outputGrad[frameIdx];
+  rStateGrad = grad.stateGrad[frameIdx];
+
+  if (value.prevStateValue) {
+    if (isBatch) value.prevStateValue += batchIdx * frameSize;
+    rPrevState = value.prevStateValue[frameIdx];
+  }
+
+  hppl::gpu::BackwardAct<T> act;
+  op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
+     rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
+     rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
+     act(active_node), act(active_gate), act(active_state));
+
+  grad.gateGrad[frameIdx] = rGradIn;
+  grad.gateGrad[frameIdx + frameSize] = rGradIg;
+  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
+  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
+  grad.stateGrad[frameIdx] = rStateGrad;
+  if (grad.prevStateGrad) {
+    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
+    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
+  }
+
+  if (isBatch) {
+    // Several batch rows add into the same frame slot, hence the atomics.
+    if (value.prevStateValue) {
+      if (grad.checkIgGrad)
+        paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx,
+                                        rCheckIGrad);
+      if (grad.checkFgGrad)
+        paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx,
+                                        rCheckFGrad);
+    }
+    if (grad.checkOgGrad)
+      paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad);
+  } else {
+    // Non-batch mode: each frame slot is updated with a plain +=.
+    if (value.prevStateValue) {
+      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
+      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
+    }
+    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
+  }
+}
+
+// Launches KeLstmForward on the CUDA stream of `context`.
+//
+// batchSize == 1: a 1-D launch with up to 1024 threads per block along the
+// frame axis, using the non-batch kernel instantiation. Otherwise: 32x32
+// thread blocks tiling (frame, batch), using the batch instantiation.
+template <class T, class Op>
+void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
+                      LstmMetaValue<T> value, int frameSize, int batchSize,
+                      activation_mode_t active_node,
+                      activation_mode_t active_gate,
+                      activation_mode_t active_state) {
+  const bool singleSequence = (batchSize == 1);
+
+  /* batch mode: framePerBlock = 32 batchPerBlock = 32 */
+  const dim3 threads =
+      singleSequence ? dim3(frameSize <= 1024 ? frameSize : 1024, 1)
+                     : dim3(32, 32);
+  const dim3 grid =
+      singleSequence
+          ? dim3((frameSize + 1024 - 1) / 1024, 1)
+          : dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+
+  auto stream =
+      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+  if (singleSequence) {
+    KeLstmForward<T, Op,
+                  /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+        op, value, frameSize, batchSize, active_node, active_gate,
+        active_state);
+    return;
+  }
+  KeLstmForward<T, Op,
+                /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+      op, value, frameSize, batchSize, active_node, active_gate,
+      active_state);
+}
+
+// Launches KeLstmBackward on the CUDA stream of `context`.
+//
+// batchSize == 1: a 1-D launch with up to 1024 threads per block along the
+// frame axis, using the non-batch kernel instantiation. Otherwise: 32x32
+// thread blocks tiling (frame, batch), using the batch instantiation.
+template <class T, class Op>
+void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
+                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                       int frameSize, int batchSize,
+                       activation_mode_t active_node,
+                       activation_mode_t active_gate,
+                       activation_mode_t active_state) {
+  const bool singleSequence = (batchSize == 1);
+
+  /* batch mode: framePerBlock = 32 batchPerBlock = 32 */
+  const dim3 threads =
+      singleSequence ? dim3(frameSize <= 1024 ? frameSize : 1024, 1)
+                     : dim3(32, 32);
+  const dim3 grid =
+      singleSequence
+          ? dim3((frameSize + 1024 - 1) / 1024, 1)
+          : dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+
+  auto stream =
+      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+  if (singleSequence) {
+    KeLstmBackward<T, Op,
+                   /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+        op, value, grad, frameSize, batchSize, active_node, active_gate,
+        active_state);
+    return;
+  }
+  KeLstmBackward<T, Op,
+                 /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+      op, value, grad, frameSize, batchSize, active_node, active_gate,
+      active_state);
+}
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/detail/lstm_kernel.h b/paddle/operators/math/detail/lstm_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f3ead2397d5131b4468d0ad288513cedb289594
--- /dev/null
+++ b/paddle/operators/math/detail/lstm_kernel.h
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/hl_activation_functions.h"
+#include "paddle/platform/hostdevice.h"
+
+#include <type_traits>
+
+namespace paddle {
+namespace operators {
+namespace math {
+namespace detail {
+
+namespace forward {
+
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
+                             T &prevState, T &state, T &stateAtv, T &output,
+                             T &checkI, T &checkF, T &checkO,
+                             typename hppl::ForwardActType<T>::type actInput,
+                             typename hppl::ForwardActType<T>::type actGate,
+                             typename hppl::ForwardActType<T>::type actState) {
+    valueIn = actInput(valueIn);
+    valueIg = actGate(valueIg + prevState * checkI);
+    valueFg = actGate(valueFg + prevState * checkF);
+    state = valueIn * valueIg + prevState * valueFg;
+    valueOg = actGate(valueOg + state * checkO);
+    stateAtv = actState(state);
+    output = valueOg * stateAtv;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // Built without AVX instructions: the AVX path is disabled
+  static const bool avx = false;
+#else
+  // Only float supports the AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+  // AVX overload: the same update as above on packed single-precision lanes.
+  HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
+                             __m256 &valueOg, __m256 &prevState, __m256 &state,
+                             __m256 &stateAtv, __m256 &output, __m256 &checkI,
+                             __m256 &checkF, __m256 &checkO,
+                             hppl::Active<__m256>::forward actInput,
+                             hppl::Active<__m256>::forward actGate,
+                             hppl::Active<__m256>::forward actState) {
+    valueIn = actInput(valueIn);
+    valueIg = actGate(_mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)));
+    valueFg = actGate(_mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)));
+    state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg),
+                          _mm256_mul_ps(prevState, valueFg));
+    valueOg = actGate(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)));
+    stateAtv = actState(state);
+    output = _mm256_mul_ps(valueOg, stateAtv);
+  }
+#endif
+#endif
+};
+
+}  // namespace forward
+
+namespace backward {
+
+template <class T>
+class lstm {
+ public:
+  HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
+                             T &gradIn, T &gradIg, T &gradFg, T &gradOg,
+                             T &prevState, T &prevStateGrad, T &state,
+                             T &stateGrad, T &stateAtv, T &outputGrad,
+                             T &checkI, T &checkF, T &checkO, T &checkIGrad,
+                             T &checkFGrad, T &checkOGrad,
+                             typename hppl::BackwardActType<T>::type actInput,
+                             typename hppl::BackwardActType<T>::type actGate,
+                             typename hppl::BackwardActType<T>::type actState) {
+    gradOg = actGate(outputGrad * stateAtv, valueOg);
+    stateGrad += actState(outputGrad * valueOg, stateAtv) + gradOg * checkO;
+    gradIn = actInput(stateGrad * valueIg, valueIn);
+    gradIg = actGate(stateGrad * valueIn, valueIg);
+    gradFg = actGate(stateGrad * prevState, valueFg);
+    prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
+    checkIGrad = gradIg * prevState;
+    checkFGrad = gradFg * prevState;
+    checkOGrad = gradOg * state;
+  }
+#ifndef __NVCC__
+#ifndef __AVX__  // Built without AVX instructions: the AVX path is disabled
+  static const bool avx = false;
+#else
+  // Only float supports the AVX optimization
+  static const bool avx = std::is_same<T, float>::value;
+  HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
+                             __m256 &valueOg, __m256 &gradIn, __m256 &gradIg,
+                             __m256 &gradFg, __m256 &gradOg, __m256 &prevState,
+                             __m256 &prevStateGrad, __m256 &state,
+                             __m256 &stateGrad, __m256 &stateAtv,
+                             __m256 &outputGrad, __m256 &checkI, __m256 &checkF,
+                             __m256 &checkO, __m256 &checkIGrad,
+                             __m256 &checkFGrad, __m256 &checkOGrad,
+                             hppl::Active<__m256>::backward actInput,
+                             hppl::Active<__m256>::backward actGate,
+                             hppl::Active<__m256>::backward actState) {
+    gradOg = actGate(_mm256_mul_ps(outputGrad, stateAtv), valueOg);
+    stateGrad = _mm256_add_ps(
+        actState(_mm256_mul_ps(outputGrad, valueOg), stateAtv), stateGrad);
+    stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
+    gradIn = actInput(_mm256_mul_ps(stateGrad, valueIg), valueIn);
+    gradIg = actGate(_mm256_mul_ps(stateGrad, valueIn), valueIg);
+    gradFg = actGate(_mm256_mul_ps(stateGrad, prevState), valueFg);
+    prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI),
+                                  _mm256_mul_ps(gradFg, checkF));
+    prevStateGrad =
+        _mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
+    checkIGrad = _mm256_mul_ps(gradIg, prevState);
+    checkFGrad = _mm256_mul_ps(gradFg, prevState);
+    checkOGrad = _mm256_mul_ps(gradOg, state);
+  }
+#endif
+#endif
+};
+
+}  // namespace backward
+
+}  // namespace detail
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/im2col.cc b/paddle/operators/math/im2col.cc
index c08a3380f042886cd400df0d840e61856274619c..3b1b0bd71dd3768b932864e185af8dc839b4653e 100644
--- a/paddle/operators/math/im2col.cc
+++ b/paddle/operators/math/im2col.cc
@@ -29,8 +29,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_height,
-                  int padding_width) {
+                  int stride_height, int stride_width, int padding_up,
+                  int padding_down, int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
 
@@ -41,6 +41,22 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int filter_width = col.dims()[2];
     int output_height = col.dims()[3];
     int output_width = col.dims()[4];
+
+    PADDLE_ENFORCE_EQ(
+        (input_height + padding_up + padding_down - filter_height) /
+                stride_height +
+            1,
+        output_height,
+        "Output_height and padding(padding_up, padding_down) are "
+        "inconsistent.");
+    PADDLE_ENFORCE_EQ(
+        (input_width + padding_left + padding_right - filter_width) /
+                stride_width +
+            1,
+        output_width,
+        "output_width and padding(padding_left, padding_right) are "
+        "inconsistent.");
+
     int channels_col = input_channels * filter_height * filter_width;
 
     const T* im_data = im.data<T>();
@@ -52,16 +68,14 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
       int c_im = c / filter_width / filter_height;
       for (int h = 0; h < output_height; ++h) {
         for (int w = 0; w < output_width; ++w) {
-          int im_row_idx = h * stride_height + h_offset;
-          int im_col_idx = w * stride_width + w_offset;
-          if ((im_row_idx - padding_height) < 0 ||
-              (im_row_idx - padding_height) >= input_height ||
-              (im_col_idx - padding_width) < 0 ||
-              (im_col_idx - padding_width) >= input_width) {
+          int im_row_idx = h * stride_height + h_offset - padding_up;
+          int im_col_idx = w * stride_width + w_offset - padding_left;
+
+          if (im_row_idx < 0 || im_row_idx >= input_height || im_col_idx < 0 ||
+              im_col_idx >= input_width) {
             col_data[(c * output_height + h) * output_width + w] = T(0);
           } else {
-            im_row_idx += c_im * input_height - padding_height;
-            im_col_idx -= padding_width;
+            im_row_idx += c_im * input_height;
             col_data[(c * output_height + h) * output_width + w] =
                 im_data[im_row_idx * input_width + im_col_idx];
           }
@@ -82,7 +96,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
  public:
   void operator()(const platform::DeviceContext& context, framework::Tensor& im,
                   const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_height, int padding_width) {
+                  int stride_width, int padding_up, int padding_down,
+                  int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
     int input_channels = im.dims()[0];
@@ -92,6 +107,22 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
     int filter_width = col.dims()[2];
     int output_height = col.dims()[3];
     int output_width = col.dims()[4];
+
+    PADDLE_ENFORCE_EQ(
+        (input_height + padding_up + padding_down - filter_height) /
+                stride_height +
+            1,
+        output_height,
+        "Output_height and padding(padding_up, padding_down) are "
+        "inconsistent.");
+    PADDLE_ENFORCE_EQ(
+        (input_width + padding_left + padding_right - filter_width) /
+                stride_width +
+            1,
+        output_width,
+        "output_width and padding(padding_left, padding_right) are "
+        "inconsistent.");
+
     int channels_col = input_channels * filter_height * filter_width;
 
     T* im_data = im.data<T>();
@@ -103,14 +134,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
       int c_im = c / filter_width / filter_height;
       for (int h = 0; h < output_height; ++h) {
         for (int w = 0; w < output_width; ++w) {
-          int im_row_idx = h * stride_height + h_offset;
-          int im_col_idx = w * stride_width + w_offset;
-          if ((im_row_idx - padding_height) >= 0 &&
-              (im_row_idx - padding_height) < input_height &&
-              (im_col_idx - padding_width) >= 0 &&
-              (im_col_idx - padding_width) < input_width) {
-            im_row_idx += c_im * input_height - padding_height;
-            im_col_idx -= padding_width;
+          int im_row_idx = h * stride_height + h_offset - padding_up;
+          int im_col_idx = w * stride_width + w_offset - padding_left;
+
+          if ((im_row_idx) >= 0 && (im_row_idx) < input_height &&
+              (im_col_idx) >= 0 && (im_col_idx) < input_width) {
+            im_row_idx += c_im * input_height;
             im_data[im_row_idx * input_width + im_col_idx] +=
                 col_data[(c * output_height + h) * output_width + w];
           }
@@ -140,8 +169,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_height,
-                  int padding_width) {
+                  int stride_height, int stride_width, int padding_up,
+                  int padding_down, int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
     int input_channels = im.dims()[0];
@@ -152,6 +181,21 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int output_height = col.dims()[0];
     int output_width = col.dims()[1];
 
+    PADDLE_ENFORCE_EQ(
+        (input_height + padding_up + padding_down - filter_height) /
+                stride_height +
+            1,
+        output_height,
+        "Output_height and padding(padding_up, padding_down) are "
+        "inconsistent.");
+    PADDLE_ENFORCE_EQ(
+        (input_width + padding_left + padding_right - filter_width) /
+                stride_width +
+            1,
+        output_width,
+        "output_width and padding(padding_left, padding_right) are "
+        "inconsistent.");
+
     const T* im_data = im.data<T>();
     T* col_data = col.data<T>();
 
@@ -163,10 +207,10 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
             for (int filter_col_idx = 0; filter_col_idx < filter_width;
                  ++filter_col_idx) {
               int im_row_offset =
-                  col_row_idx * stride_height + filter_row_idx - padding_height;
+                  col_row_idx * stride_height + filter_row_idx - padding_up;
               int im_col_offset =
-                  col_col_idx * stride_width + filter_col_idx - padding_width;
-              int col_offset = (((col_row_idx * output_width + col_col_idx) *
+                  col_col_idx * stride_width + filter_col_idx - padding_left;
+              int col_offset = ((((col_row_idx)*output_width + col_col_idx) *
                                      input_channels +
                                  channel) *
                                     filter_height +
@@ -201,7 +245,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
  public:
   void operator()(const platform::DeviceContext& context, framework::Tensor& im,
                   const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_height, int padding_width) {
+                  int stride_width, int padding_up, int padding_down,
+                  int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
     int input_channels = im.dims()[0];
@@ -212,6 +257,21 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
     int output_height = col.dims()[0];
     int output_width = col.dims()[1];
 
+    PADDLE_ENFORCE_EQ(
+        (input_height + padding_up + padding_down - filter_height) /
+                stride_height +
+            1,
+        output_height,
+        "Output_height and padding(padding_up, padding_down) are "
+        "inconsistent.");
+    PADDLE_ENFORCE_EQ(
+        (input_width + padding_left + padding_right - filter_width) /
+                stride_width +
+            1,
+        output_width,
+        "output_width and padding(padding_left, padding_right) are "
+        "inconsistent.");
+
     T* im_data = im.data<T>();
     const T* col_data = col.data<T>();
 
@@ -223,9 +283,9 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
             for (int filter_col_idx = 0; filter_col_idx < filter_width;
                  ++filter_col_idx) {
               int im_row_offset =
-                  col_row_idx * stride_height + filter_row_idx - padding_height;
+                  col_row_idx * stride_height + filter_row_idx - padding_up;
               int im_col_offset =
-                  col_col_idx * stride_width + filter_col_idx - padding_width;
+                  col_col_idx * stride_width + filter_col_idx - padding_left;
               int col_offset = (((col_row_idx * output_width + col_col_idx) *
                                      input_channels +
                                  channel) *
diff --git a/paddle/operators/math/im2col.cu b/paddle/operators/math/im2col.cu
index 01f60bfe70f844fdcfd5aa481c27d9f12ec51305..7b201fdbf3c5dd7d336d359e00b7323cecc0231a 100644
--- a/paddle/operators/math/im2col.cu
+++ b/paddle/operators/math/im2col.cu
@@ -66,8 +66,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_height,
-                  int padding_width) {
+                  int stride_height, int stride_width, int padding_up,
+                  int padding_down, int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
 
@@ -79,6 +79,15 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
     int output_height = col.dims()[3];
     int output_width = col.dims()[4];
 
+    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
+                           stride_height +
+                       1 ==
+                   output_height);
+    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
+                           stride_width +
+                       1 ==
+                   output_width);
+
     int num_outputs = input_channels * output_height * output_width;
     int blocks = (num_outputs + 1024 - 1) / 1024;
     int block_x = 512;
@@ -89,8 +98,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
                     .stream()>>>(
         im.data<T>(), num_outputs, input_height, input_width, filter_height,
-        filter_width, stride_height, stride_width, padding_height,
-        padding_width, output_height, output_width, col.data<T>());
+        filter_width, stride_height, stride_width, padding_up, padding_left,
+        output_height, output_width, col.data<T>());
   }
 };
 
@@ -152,7 +161,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
  public:
   void operator()(const platform::DeviceContext& context, framework::Tensor& im,
                   const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_height, int padding_width) {
+                  int stride_width, int padding_up, int padding_down,
+                  int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
 
@@ -164,8 +174,18 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
     int output_height = col.dims()[3];
     int output_width = col.dims()[4];
 
-    size_t num_kernels = input_channels * (input_height + 2 * padding_height) *
-                         (input_width + 2 * padding_width);
+    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
+                           stride_height +
+                       1 ==
+                   output_height);
+    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
+                           stride_width +
+                       1 ==
+                   output_width);
+
+    size_t num_kernels = input_channels *
+                         (input_height + padding_up + padding_down) *
+                         (input_width + padding_left + padding_right);
 
     size_t blocks = (num_kernels + 1024 - 1) / 1024;
     size_t block_x = 512;
@@ -178,10 +198,10 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
     col2im<T><<<grid, threads, 0,
                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
                     .stream()>>>(
-        num_kernels, col.data<T>(), input_height + 2 * padding_height,
-        input_width + 2 * padding_width, input_channels, filter_height,
-        filter_width, stride_height, stride_width, padding_height,
-        padding_width, output_height, output_width, im.data<T>());
+        num_kernels, col.data<T>(), input_height + padding_up + padding_down,
+        input_width + padding_left + padding_right, input_channels,
+        filter_height, filter_width, stride_height, stride_width, padding_up,
+        padding_left, output_height, output_width, im.data<T>());
   }
 };
 
@@ -238,8 +258,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_height,
-                  int padding_width) {
+                  int stride_height, int stride_width, int padding_up,
+                  int padding_down, int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
     int input_channels = im.dims()[0];
@@ -250,6 +270,15 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
     int output_height = col.dims()[0];
     int output_width = col.dims()[1];
 
+    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
+                           stride_height +
+                       1 ==
+                   output_height);
+    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
+                           stride_width +
+                       1 ==
+                   output_width);
+
     int block_dim_x = 0;
     int block_dim_y = 0;
     if (filter_height <= 4 && filter_width <= 4) {
@@ -274,8 +303,8 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                    reinterpret_cast<const platform::CUDADeviceContext&>(context)
                        .stream()>>>(
         im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
-        filter_height, filter_width, stride_height, stride_width,
-        padding_height, padding_width, output_height, output_width);
+        filter_height, filter_width, stride_height, stride_width, padding_up,
+        padding_left, output_height, output_width);
   }
 };
 
@@ -322,7 +351,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
  public:
   void operator()(const platform::DeviceContext& context, framework::Tensor& im,
                   const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_height, int padding_width) {
+                  int stride_width, int padding_up, int padding_down,
+                  int padding_left, int padding_right) {
     PADDLE_ENFORCE(im.dims().size() == 3);
     PADDLE_ENFORCE(col.dims().size() == 5);
     int input_channels = im.dims()[0];
@@ -333,6 +363,15 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
     int output_height = col.dims()[0];
     int output_width = col.dims()[1];
 
+    PADDLE_ENFORCE((input_height + padding_up + padding_down - filter_height) /
+                           stride_height +
+                       1 ==
+                   output_height);
+    PADDLE_ENFORCE((input_width + padding_left + padding_right - filter_width) /
+                           stride_width +
+                       1 ==
+                   output_width);
+
     int block_dim_x = 0;
     int block_dim_y = 0;
     if (filter_height <= 4 && filter_width <= 4) {
@@ -357,8 +396,8 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                    reinterpret_cast<const platform::CUDADeviceContext&>(context)
                        .stream()>>>(
         im.data<T>(), col.data<T>(), input_channels, input_height, input_width,
-        filter_height, filter_width, stride_height, stride_width,
-        padding_height, padding_width, output_height, output_width);
+        filter_height, filter_width, stride_height, stride_width, padding_up,
+        padding_left, output_height, output_width);
   }
 };
 
diff --git a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h
index 7b717e1603c94cd77c74cb0d86f1d23e2692f9d8..c736d4fa523c2af3e3dd7a11114d7f84021bc5c1 100644
--- a/paddle/operators/math/im2col.h
+++ b/paddle/operators/math/im2col.h
@@ -74,8 +74,8 @@ class Im2ColFunctor {
  public:
   void operator()(const platform::DeviceContext& context,
                   const framework::Tensor& im, framework::Tensor& col,
-                  int stride_height, int stride_width, int padding_height,
-                  int padding_width);
+                  int stride_height, int stride_width, int padding_up,
+                  int padding_down, int padding_left, int padding_right);
 };
 
 template <ColFormat Format, typename Place, typename T>
@@ -83,7 +83,8 @@ class Col2ImFunctor {
  public:
   void operator()(const platform::DeviceContext& context, framework::Tensor& im,
                   const framework::Tensor& col, int stride_height,
-                  int stride_width, int padding_height, int padding_width);
+                  int stride_width, int padding_up, int padding_down,
+                  int padding_left, int padding_right);
 };
 
 }  // namespace math
diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc
index f0b8c885918afe7f80edc465c6d9be7c11ac066f..5763782c4edec87f44dabef2ccffe3097eeb2421 100644
--- a/paddle/operators/math/im2col_test.cc
+++ b/paddle/operators/math/im2col_test.cc
@@ -35,6 +35,12 @@ void testIm2col() {
    *
    * output_ocf = [0, 1, 3, 4
    *               1, 2, 4, 5]
+   *
+   * col2im_cfo = [0, 2, 2
+   *               3, 8, 5]
+   *
+   * col2im_ocf = [0, 2, 2
+   *               3, 8, 5]
    */
   int input_height = 2;
   int input_width = 3;
@@ -49,16 +55,29 @@ void testIm2col() {
   memcpy(input_ptr, arr, 6 * sizeof(float));
 
   auto* place = new Place();
+  paddle::platform::DeviceContext* context;
+  if (paddle::platform::is_cpu_place(*place)) {
+    context =
+        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    context =
+        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
+#else
+    PADDLE_THROW("no GPU support");
+#endif  // PADDLE_WITH_CUDA
+  }
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    input.CopyFrom<float>(input_tmp, *place);
+    input.CopyFrom(input_tmp, *place, *context);
   }
   output_cfo.mutable_data<float>(
       {1, filter_size, filter_size, output_height, output_width}, *place);
   output_ocf.mutable_data<float>(
       {output_height, output_width, 1, filter_size, filter_size}, *place);
 
+  // Im2Col
   paddle::operators::math::Im2ColFunctor<
       paddle::operators::math::ColFormat::kCFO, Place, float>
       im2col;
@@ -66,57 +85,91 @@ void testIm2col() {
       paddle::operators::math::ColFormat::kOCF, Place, float>
       im2col_ocf;
 
-  paddle::platform::DeviceContext* context;
-  if (paddle::platform::is_cpu_place(*place)) {
-    context =
-        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
-  } else {
-#ifndef PADDLE_ONLY_CPU
-    context =
-        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
-#else
-    PADDLE_THROW("no GPU support");
-#endif  // PADDLE_ONLY_CPU
-  }
-  im2col(*context, input, output_cfo, stride, stride, padding, padding);
-  im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding);
+  im2col(*context, input, output_cfo, stride, stride, padding, padding, padding,
+         padding);
+  im2col_ocf(*context, input, output_ocf, stride, stride, padding, padding,
+             padding, padding);
+
+  float out_cfo_data[] = {0, 1, 1, 2, 3, 4, 4, 5};
+  float out_ocf_data[] = {0, 1, 3, 4, 1, 2, 4, 5};
 
   float* out_cfo_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output_cfo.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_cfo, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context);
     out_cfo_ptr = output_tmp.data<float>();
   }
-  EXPECT_EQ(out_cfo_ptr[0], 0);
-  EXPECT_EQ(out_cfo_ptr[1], 1);
-  EXPECT_EQ(out_cfo_ptr[2], 1);
-  EXPECT_EQ(out_cfo_ptr[3], 2);
-  EXPECT_EQ(out_cfo_ptr[4], 3);
-  EXPECT_EQ(out_cfo_ptr[5], 4);
-  EXPECT_EQ(out_cfo_ptr[6], 4);
-  EXPECT_EQ(out_cfo_ptr[7], 5);
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(out_cfo_ptr[i], out_cfo_data[i]);
+  }
 
   float* out_ocf_ptr;
   if (paddle::platform::is_cpu_place(*place)) {
     out_ocf_ptr = output_ocf.data<float>();
   } else {
-    output_tmp.CopyFrom<float>(output_ocf, paddle::platform::CPUPlace());
+    output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context);
     out_ocf_ptr = output_tmp.data<float>();
   }
-  EXPECT_EQ(out_ocf_ptr[0], 0);
-  EXPECT_EQ(out_ocf_ptr[1], 1);
-  EXPECT_EQ(out_ocf_ptr[2], 3);
-  EXPECT_EQ(out_ocf_ptr[3], 4);
-  EXPECT_EQ(out_ocf_ptr[4], 1);
-  EXPECT_EQ(out_ocf_ptr[5], 2);
-  EXPECT_EQ(out_ocf_ptr[6], 4);
-  EXPECT_EQ(out_ocf_ptr[7], 5);
+  for (int i = 0; i < 8; ++i) {
+    EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
+  }
+
+  // Col2Im: kCFO
+  paddle::operators::math::Col2ImFunctor<
+      paddle::operators::math::ColFormat::kCFO, Place, float>
+      col2im;
+  paddle::operators::math::Col2ImFunctor<
+      paddle::operators::math::ColFormat::kOCF, Place, float>
+      col2im_ocf;
+  float col2im_data[] = {0, 2, 2, 3, 8, 5};
+
+  memset(input_ptr, 0, 6 * sizeof(float));
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom(input_tmp, *place, *context);
+  }
+
+  col2im(*context, input, output_cfo, stride, stride, padding, padding, padding,
+         padding);
+
+  float* in_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    in_ptr = input.data<float>();
+  } else {
+    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    in_ptr = input_tmp.data<float>();
+  }
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(in_ptr[i], col2im_data[i]);
+  }
+
+  // Col2Im: kOCF
+  memset(input_ptr, 0, 6 * sizeof(float));
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom(input_tmp, *place, *context);
+  }
+
+  col2im_ocf(*context, input, output_ocf, stride, stride, padding, padding,
+             padding, padding);
+
+  if (paddle::platform::is_cpu_place(*place)) {
+    in_ptr = input.data<float>();
+  } else {
+    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    in_ptr = input_tmp.data<float>();
+  }
+  for (int i = 0; i < 6; ++i) {
+    EXPECT_EQ(in_ptr[i], col2im_data[i]);
+  }
 }
 
 TEST(math, im2col) {
   testIm2col<paddle::platform::CPUPlace>();
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   testIm2col<paddle::platform::GPUPlace>();
 #endif
 }
diff --git a/paddle/operators/math/lstm_compute.cc b/paddle/operators/math/lstm_compute.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0febf8e3b70111d12f858cf6259a2801a42d9a90
--- /dev/null
+++ b/paddle/operators/math/lstm_compute.cc
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/lstm_compute.h"
+#include "paddle/operators/math/detail/lstm_cpu_kernel.h"
+#include "paddle/operators/math/detail/lstm_kernel.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>
+struct LstmUnitFunctor<platform::CPUPlace, T> {  // CPU forward LSTM step
+  static void compute(const platform::DeviceContext& context,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
+                      const std::string& gate_act, const std::string& cell_act,
+                      const std::string& cand_act) {
+    const int gate_width = frame_size * 4;  // gate buffer stride per sample
+    for (int sample = 0; sample < batch_size; ++sample) {
+      detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
+                               ActiveType(cand_act), ActiveType(gate_act),
+                               ActiveType(cell_act));
+      // Step every per-sample buffer on to the next sample in the batch.
+      if (value.prevStateValue) value.prevStateValue += frame_size;
+      value.outputValue += frame_size;
+      value.stateActiveValue += frame_size;
+      value.stateValue += frame_size;
+      value.gateValue += gate_width;
+    }
+  }
+};
+
+template <class T>
+struct LstmUnitGradFunctor<platform::CPUPlace, T> {  // CPU backward LSTM step
+  static void compute(const platform::DeviceContext& context,
+                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const std::string& gate_act, const std::string& cell_act,
+                      const std::string& cand_act) {
+    const int gate_width = frame_size * 4;  // gate buffer stride per sample
+    for (int sample = 0; sample < batch_size; ++sample) {
+      detail::cpu_lstm_backward(detail::backward::lstm<T>(), value, grad,
+                                frame_size, ActiveType(cand_act),
+                                ActiveType(gate_act), ActiveType(cell_act));
+
+      // Forward-pass buffers: move on to the next sample.
+      value.gateValue += gate_width;
+      value.stateValue += frame_size;
+      value.stateActiveValue += frame_size;
+      value.outputValue += frame_size;
+      // The previous-state pointers are optional and only advanced when set.
+      if (value.prevStateValue) value.prevStateValue += frame_size;
+      if (grad.prevStateGrad) grad.prevStateGrad += frame_size;
+
+      // Gradient buffers: stepped with the same strides as the values.
+      grad.gateGrad += gate_width;
+      grad.stateGrad += frame_size;
+      grad.stateActiveGrad += frame_size;
+      grad.outputGrad += frame_size;
+    }
+  }
+};
+
+template class LstmUnitFunctor<platform::CPUPlace, float>;   // fwd, float
+template class LstmUnitFunctor<platform::CPUPlace, double>;  // fwd, double
+template class LstmUnitGradFunctor<platform::CPUPlace, float>;   // bwd, float
+template class LstmUnitGradFunctor<platform::CPUPlace, double>;  // bwd, double
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/lstm_compute.cu b/paddle/operators/math/lstm_compute.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b2122f2a5c08a6d9d53293833177f0ba2c3ab860
--- /dev/null
+++ b/paddle/operators/math/lstm_compute.cu
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/detail/lstm_gpu_kernel.h"
+#include "paddle/operators/math/detail/lstm_kernel.h"
+#include "paddle/operators/math/lstm_compute.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+template <class T>  // T is instantiated for float and double further down
+struct LstmUnitFunctor<platform::GPUPlace, T> {  // GPU forward LSTM step
+  static void compute(const platform::DeviceContext& context,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
+                      const std::string& gate_act, const std::string& cell_act,
+                      const std::string& cand_act) {
+    detail::gpu_lstm_forward<T>(context, detail::forward::lstm<T>(), value,
+                                frame_size, batch_size, ActiveType(cand_act),
+                                ActiveType(gate_act), ActiveType(cell_act));
+  }  // batch_size is forwarded to gpu_lstm_forward; no per-sample loop here
+};
+
+template <class T>  // T is instantiated for float and double further down
+struct LstmUnitGradFunctor<platform::GPUPlace, T> {  // GPU backward LSTM step
+  static void compute(const platform::DeviceContext& context,
+                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const std::string& gate_act, const std::string& cell_act,
+                      const std::string& cand_act) {
+    detail::gpu_lstm_backward(context, detail::backward::lstm<T>(), value, grad,
+                              frame_size, batch_size, ActiveType(cand_act),
+                              ActiveType(gate_act), ActiveType(cell_act));
+  }  // activations are handed over in (cand, gate, cell) order
+};
+
+template class LstmUnitFunctor<platform::GPUPlace, float>;   // fwd, float
+template class LstmUnitFunctor<platform::GPUPlace, double>;  // fwd, double
+template class LstmUnitGradFunctor<platform::GPUPlace, float>;   // bwd, float
+template class LstmUnitGradFunctor<platform::GPUPlace, double>;  // bwd, double
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/lstm_compute.h b/paddle/operators/math/lstm_compute.h
new file mode 100644
index 0000000000000000000000000000000000000000..28d2c6fd3b0d8143da90c37f241072e37397f98b
--- /dev/null
+++ b/paddle/operators/math/lstm_compute.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+typedef enum {  // activation kinds; ActiveType() maps names onto these
+  HL_ACTIVATION_SIGMOID = 0,  // "sigmoid"
+  HL_ACTIVATION_RELU = 1,     // "relu"
+  HL_ACTIVATION_TANH = 2,     // "tanh"
+  HL_ACTIVATION_LINEAR = 3,   // "linear", "identity" or ""
+  HL_ACTIVATION_END           // presumably an end marker -- TODO confirm
+} activation_mode_t;
+
+template <class T>
+struct LstmMetaValue {  // raw buffer pointers consumed by the LSTM functors
+  T *gateValue;         // CPU path steps this by 4 * frame_size per sample
+  T *prevStateValue;    // may be null; otherwise stepped by frame_size
+  T *stateValue;        // stepped by frame_size per sample
+  T *stateActiveValue;  // stepped by frame_size per sample
+  T *outputValue;       // stepped by frame_size per sample
+  T *checkIg;           // check* are never stepped per sample; presumably
+  T *checkFg;           // peephole weights shared across the batch
+  T *checkOg;           // -- TODO confirm
+};
+
+template <class T>
+struct LstmMetaGrad {  // gradient buffers; fields mirror LstmMetaValue
+  T *gateGrad;         // CPU path steps this by 4 * frame_size per sample
+  T *prevStateGrad;    // may be null; otherwise stepped by frame_size
+  T *stateGrad;        // stepped by frame_size per sample
+  T *stateActiveGrad;  // stepped by frame_size per sample
+  T *outputGrad;       // stepped by frame_size per sample
+  T *checkIgGrad;      // check*Grad are never stepped per sample; presumably
+  T *checkFgGrad;      // gradients of weights shared across the batch
+  T *checkOgGrad;      // -- TODO confirm
+};
+
+inline activation_mode_t ActiveType(const std::string &type) {
+  // Translate an activation name into its activation_mode_t value.
+  if (type == "sigmoid") return HL_ACTIVATION_SIGMOID;
+  if (type == "relu") return HL_ACTIVATION_RELU;
+  if (type == "tanh") return HL_ACTIVATION_TANH;
+
+  // "identity" and the empty string are accepted as aliases of "linear".
+  const bool is_linear =
+      type == "linear" || type == "identity" || type.empty();
+  if (is_linear) return HL_ACTIVATION_LINEAR;
+
+  PADDLE_THROW("Do not support activation type.");  // unknown name
+}
+
+template <typename Place, typename T>
+class LstmUnitFunctor {  // forward LSTM step; specialized per Place
+ public:
+  static void compute(const platform::DeviceContext &context,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
+                      const std::string &gate_act, const std::string &cell_act,
+                      const std::string &cand_act);  // *_act: ActiveType names
+};
+
+template <typename Place, typename T>
+class LstmUnitGradFunctor {  // backward LSTM step; specialized per Place
+ public:
+  static void compute(const platform::DeviceContext &context,
+                      LstmMetaValue<T> value, LstmMetaGrad<T> grad,
+                      int frame_size, int batch_size,
+                      const std::string &gate_act, const std::string &cell_act,
+                      const std::string &cand_act);  // *_act: ActiveType names
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/math_function.cc b/paddle/operators/math/math_function.cc
index ba653afa2cb175ae2e5e21088b6dc7ba76a6018f..aad1357598c629a4edfe0ad9b23f0241093a2522 100644
--- a/paddle/operators/math/math_function.cc
+++ b/paddle/operators/math/math_function.cc
@@ -130,6 +130,89 @@ void matmul<platform::CPUPlace, double>(
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
+#ifdef PADDLE_USE_MKLML
+// Use cblas_{s,d}gemm_batch when MKLML is enabled: 1 group of size batchCount.
+template <>
+void batched_gemm<platform::CPUPlace, float>(
+    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C, const int batchCount, const int strideA, const int strideB) {
+  const int lda = (transA == CblasNoTrans) ? K : M;  // row-major A
+  const int ldb = (transB == CblasNoTrans) ? N : K;  // row-major B
+  const int ldc = N;                                 // C is M x N
+  // cblas_sgemm_batch takes one pointer per matrix, so build the three tables.
+  std::vector<const float*> a_ptrs(batchCount), b_ptrs(batchCount);
+  std::vector<float*> c_ptrs(batchCount);
+  for (int i = 0; i < batchCount; ++i) {
+    a_ptrs[i] = A + i * strideA;
+    b_ptrs[i] = B + i * strideB;
+    c_ptrs[i] = C + i * M * N;  // outputs are stored back to back
+  }
+  cblas_sgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
+                    a_ptrs.data(), &lda, b_ptrs.data(), &ldb, &beta,
+                    c_ptrs.data(), &ldc, 1 /* group_count */, &batchCount);
+}
+
+template <>
+void batched_gemm<platform::CPUPlace, double>(
+    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C, const int batchCount, const int strideA, const int strideB) {
+  const int lda = (transA == CblasNoTrans) ? K : M;  // row-major A
+  const int ldb = (transB == CblasNoTrans) ? N : K;  // row-major B
+  const int ldc = N;                                 // C is M x N
+  // cblas_dgemm_batch takes one pointer per matrix, so build the three tables.
+  std::vector<const double*> a_ptrs(batchCount), b_ptrs(batchCount);
+  std::vector<double*> c_ptrs(batchCount);
+  for (int i = 0; i < batchCount; ++i) {
+    a_ptrs[i] = A + i * strideA;
+    b_ptrs[i] = B + i * strideB;
+    c_ptrs[i] = C + i * M * N;  // outputs are stored back to back
+  }
+  cblas_dgemm_batch(CblasRowMajor, &transA, &transB, &M, &N, &K, &alpha,
+                    a_ptrs.data(), &lda, b_ptrs.data(), &ldb, &beta,
+                    c_ptrs.data(), &ldc, 1 /* group_count */, &batchCount);
+}
+#else
+// Fallback used when Intel MKL's batched gemm is unavailable: a plain serial
+// loop that issues one gemm per batch entry. A, B and C are walked with
+// strides strideA, strideB and M * N respectively.
+// TODO: parallelize over the batch dimension.
+template <>
+void batched_gemm<platform::CPUPlace, float>(
+    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C, const int batchCount, const int strideA, const int strideB) {
+  for (int batch = 0; batch < batchCount; ++batch) {
+    const float* a_cur = A + batch * strideA;
+    const float* b_cur = B + batch * strideB;
+    float* c_cur = C + batch * M * N;
+    gemm<platform::CPUPlace, float>(context, transA, transB, M, N, K, alpha,
+                                    a_cur, b_cur, beta, c_cur);
+  }
+}
+
+template <>
+void batched_gemm<platform::CPUPlace, double>(
+    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C, const int batchCount, const int strideA, const int strideB) {
+  for (int batch = 0; batch < batchCount; ++batch) {  // one gemm per entry
+    const double* a_cur = A + batch * strideA;
+    const double* b_cur = B + batch * strideB;
+    double* c_cur = C + batch * M * N;  // outputs are stored back to back
+    gemm<platform::CPUPlace, double>(context, transA, transB, M, N, K, alpha,
+                                     a_cur, b_cur, beta, c_cur);
+  }
+}
+#endif
+
+template struct SetConstant<platform::CPUPlace, float>;  // CPU, float only
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 649f1f352c2a4a5ebaa0cb00ffb2e4de8aa4961a..5583683c6e12b88ba81015aef9161913de261ef2 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -155,6 +155,56 @@ void matmul<platform::GPUPlace, double>(
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
 
+template <>
+void batched_gemm<platform::GPUPlace, float>(
+    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const float alpha, const float* A, const float* B, const float beta,
+    float* C, const int batchCount, const int strideA, const int strideB) {
+  // cuBLAS is column-major (Fortran order), so the row-major product is formed
+  // by passing B before A and exchanging M and N in the call below.
+  int lda = (transA == CblasNoTrans) ? K : M;  // row-major leading dim of A
+  int ldb = (transB == CblasNoTrans) ? N : K;  // row-major leading dim of B
+  int ldc = N;                                 // C is M x N, row-major
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const int strideC = M * N;  // outputs are stored back to back
+
+  PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA,
+      &beta, C, ldc, strideC, batchCount));
+}
+
+template <>
+void batched_gemm<platform::GPUPlace, double>(
+    const platform::DeviceContext& context, const CBLAS_TRANSPOSE transA,
+    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
+    const double alpha, const double* A, const double* B, const double beta,
+    double* C, const int batchCount, const int strideA, const int strideB) {
+  // cuBLAS is column-major (Fortran order), so the row-major product is formed
+  // by passing B before A and exchanging M and N in the call below.
+  int lda = (transA == CblasNoTrans) ? K : M;  // row-major leading dim of A
+  int ldb = (transB == CblasNoTrans) ? N : K;  // row-major leading dim of B
+  int ldc = N;                                 // C is M x N, row-major
+  cublasOperation_t cuTransA =
+      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  cublasOperation_t cuTransB =
+      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
+  const int strideC = M * N;  // outputs are stored back to back
+
+  PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(
+      reinterpret_cast<const platform::CUDADeviceContext&>(context)
+          .cublas_handle(),
+      cuTransB, cuTransA, N, M, K, &alpha, B, ldb, strideB, A, lda, strideA,
+      &beta, C, ldc, strideC, batchCount));
+}
+
+template struct SetConstant<platform::GPUPlace, float>;  // GPU, float only
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index 43306fca73387b7b212f556a2b187df113a1b327..9777ebfd156709a370be2cb4ba0077ac7c6735fb 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -52,6 +52,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 
 #include <cmath>
 
+#include "paddle/framework/eigen.h"
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
@@ -62,7 +63,7 @@ namespace math {
 
 // Support continuous memory now
 // If transA = N, and transB = N
-// Then matrixA: M * K, matrixB: K * N matrixC : M * N
+// Then matrixA: M * K, matrixB: K * N, matrixC : M * N
 // For more detailed info, please refer to
 // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
 template <typename Place, typename T>
@@ -84,6 +85,24 @@ void matmul(const platform::DeviceContext& context,
             const framework::Tensor& matrix_b, bool trans_b, T alpha,
             framework::Tensor* matrix_out, T beta);
 
+// Batched gemm: A/B step by strideA/strideB per batch entry, C by M * N.
+template <typename Place, typename T>
+void batched_gemm(const platform::DeviceContext& context,
+                  const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
+                  const int M, const int N, const int K, const T alpha,
+                  const T* A, const T* B, const T beta, T* C,
+                  const int batchCount, const int strideA, const int strideB);
+
+template <typename Place, typename T>
+struct SetConstant {  // functor: overwrite every element of *tensor with num
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor* tensor, T num) {
+    auto flat = framework::EigenVector<T>::Flatten(*tensor);  // 1-D view
+    auto* eigen_device = context.GetEigenDevice<Place>();
+    flat.device(*eigen_device) = flat.constant(static_cast<T>(num));
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function_test.cc b/paddle/operators/math/math_function_test.cc
index f272f7e5135e7092618b8c94ee55faf1cfd8e8a5..3b9f92e7ae5f34dd0fb1ba8fb0c67ff5ae1628c4 100644
--- a/paddle/operators/math/math_function_test.cc
+++ b/paddle/operators/math/math_function_test.cc
@@ -1,181 +1,6 @@
 #include "paddle/operators/math/math_function.h"
 #include "gtest/gtest.h"
 
-#ifndef PADDLE_ONLY_CPU
-TEST(math_function, notrans_mul_trans) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
-
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr, 6 * sizeof(float));
-
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place);
-
-  out_gpu.mutable_data<float>({2, 2}, *gpu_place);
-
-  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
-
-  out.CopyFrom<float>(out_gpu, *cpu_place);
-
-  float* out_ptr = out.data<float>();
-  EXPECT_EQ(out_ptr[0], 5);
-  EXPECT_EQ(out_ptr[1], 14);
-  EXPECT_EQ(out_ptr[2], 14);
-  EXPECT_EQ(out_ptr[3], 50);
-  delete gpu_place;
-}
-
-TEST(math_function, trans_mul_notrans) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor out_gpu;
-  paddle::framework::Tensor out;
-
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr, 6 * sizeof(float));
-
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input1, *gpu_place);
-
-  out_gpu.mutable_data<float>({3, 3}, *gpu_place);
-
-  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
-      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
-
-  out.CopyFrom<float>(out_gpu, *cpu_place);
-
-  float* out_ptr = out.data<float>();
-  EXPECT_EQ(out_ptr[0], 9);
-  EXPECT_EQ(out_ptr[1], 12);
-  EXPECT_EQ(out_ptr[2], 15);
-  EXPECT_EQ(out_ptr[3], 12);
-  EXPECT_EQ(out_ptr[4], 17);
-  EXPECT_EQ(out_ptr[5], 22);
-  EXPECT_EQ(out_ptr[6], 15);
-  EXPECT_EQ(out_ptr[7], 22);
-  EXPECT_EQ(out_ptr[8], 29);
-  delete gpu_place;
-}
-
-TEST(math_function, gemm_notrans_cublas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
-  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place);
-  float* a = input1_gpu.data<float>();
-  float* b = input2_gpu.data<float>();
-  float* c = input3_gpu.mutable_data<float>(*gpu_place);
-
-  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
-      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
-
-  input3.CopyFrom<float>(input3_gpu, *cpu_place);
-
-  // numpy code:
-  // a = np.arange(6).reshape(2, 3)
-  // b = np.arange(12).reshape(3, 4)[:, 1:]
-  // c = np.arange(8).reshape(2, 4)[:, 1:]
-  // out = np.arange(8).reshape(2, 4)
-  // out[:, 1:] = np.dot(a, b) + c
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-  delete gpu_place;
-}
-
-TEST(math_function, gemm_trans_cublas) {
-  paddle::framework::Tensor input1;
-  paddle::framework::Tensor input2;
-  paddle::framework::Tensor input3;
-  paddle::framework::Tensor input1_gpu;
-  paddle::framework::Tensor input2_gpu;
-  paddle::framework::Tensor input3_gpu;
-
-  int m = 2;
-  int n = 3;
-  int k = 3;
-  auto* cpu_place = new paddle::platform::CPUPlace();
-  float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
-  float arr1[6] = {0, 1, 2, 3, 4, 5};
-  memcpy(input1_ptr, arr1, 6 * sizeof(float));
-  float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
-  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
-  memcpy(input2_ptr, arr2, 12 * sizeof(float));
-  float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
-  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
-  memcpy(input3_ptr, arr3, 8 * sizeof(float));
-
-  auto* gpu_place = new paddle::platform::GPUPlace(0);
-  paddle::platform::CUDADeviceContext context(*gpu_place);
-
-  input1_gpu.CopyFrom<float>(input1, *gpu_place);
-  input2_gpu.CopyFrom<float>(input2, *gpu_place);
-  input3_gpu.CopyFrom<float>(input3, *gpu_place);
-  float* a = input1_gpu.data<float>();
-  float* b = input2_gpu.data<float>();
-  float* c = input3_gpu.mutable_data<float>(*gpu_place);
-
-  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
-      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
-
-  input3.CopyFrom<float>(input3_gpu, *cpu_place);
-
-  EXPECT_EQ(input3_ptr[0], 0);
-  EXPECT_EQ(input3_ptr[1], 24);
-  EXPECT_EQ(input3_ptr[2], 28);
-  EXPECT_EQ(input3_ptr[3], 32);
-  EXPECT_EQ(input3_ptr[4], 4);
-  EXPECT_EQ(input3_ptr[5], 73);
-  EXPECT_EQ(input3_ptr[6], 86);
-  EXPECT_EQ(input3_ptr[7], 99);
-  delete gpu_place;
-}
-#endif
-
 TEST(math_function, gemm_notrans_cblas) {
   paddle::framework::Tensor input1;
   paddle::framework::Tensor input2;
@@ -243,3 +68,24 @@ TEST(math_function, gemm_trans_clbas) {
   EXPECT_EQ(input3_ptr[6], 86);
   EXPECT_EQ(input3_ptr[7], 99);
 }
+
+TEST(math_function, zero) {
+  paddle::framework::Tensor tensor;
+  paddle::platform::CPUPlace cpu_place;  // stack object: nothing to leak
+  float* t = tensor.mutable_data<float>({2, 2}, cpu_place);
+  paddle::platform::CPUDeviceContext context(cpu_place);
+  paddle::operators::math::SetConstant<paddle::platform::CPUPlace, float>
+      functor;
+  functor(context, &tensor, 0);  // every element must read back as 0
+  EXPECT_EQ(t[0], 0);
+  EXPECT_EQ(t[1], 0);
+  EXPECT_EQ(t[2], 0);
+  EXPECT_EQ(t[3], 0);
+
+  functor(context, &tensor, 1);  // refill the same buffer with 1
+
+  EXPECT_EQ(t[0], 1);
+  EXPECT_EQ(t[1], 1);
+  EXPECT_EQ(t[2], 1);
+  EXPECT_EQ(t[3], 1);
+}
diff --git a/paddle/operators/math/math_function_test.cu b/paddle/operators/math/math_function_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8b22c71552a65044cbd02441fb35c1eafe0173dc
--- /dev/null
+++ b/paddle/operators/math/math_function_test.cu
@@ -0,0 +1,179 @@
+#include "gtest/gtest.h"
+#include "paddle/operators/math/math_function.h"
+
+TEST(math_function, notrans_mul_trans) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+
+  paddle::platform::CPUPlace cpu_place;  // stack objects: nothing to leak
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
+  float arr[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr, 6 * sizeof(float));
+
+  paddle::platform::GPUPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
+
+  input1_gpu.CopyFrom(input1, gpu_place, context);
+  input2_gpu.CopyFrom(input1, gpu_place, context);
+
+  out_gpu.mutable_data<float>({2, 2}, gpu_place);
+
+  // out = input1 * input1^T, a (2 x 3) by (3 x 2) product.
+  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
+      context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
+
+  out.CopyFrom(out_gpu, cpu_place, context);
+
+  float* out_ptr = out.data<float>();
+  context.Wait();  // synchronize before reading the copied-back result
+  EXPECT_EQ(out_ptr[0], 5);
+  EXPECT_EQ(out_ptr[1], 14);
+  EXPECT_EQ(out_ptr[2], 14);
+  EXPECT_EQ(out_ptr[3], 50);
+}
+
+TEST(math_function, trans_mul_notrans) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor out_gpu;
+  paddle::framework::Tensor out;
+
+  paddle::platform::CPUPlace cpu_place;  // stack objects: nothing to leak
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
+  float arr[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr, 6 * sizeof(float));
+
+  paddle::platform::GPUPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
+
+  input1_gpu.CopyFrom(input1, gpu_place, context);
+  input2_gpu.CopyFrom(input1, gpu_place, context);
+
+  out_gpu.mutable_data<float>({3, 3}, gpu_place);
+
+  // out = input1^T * input1, a (3 x 2) by (2 x 3) product.
+  paddle::operators::math::matmul<paddle::platform::GPUPlace, float>(
+      context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
+
+  out.CopyFrom(out_gpu, cpu_place, context);
+
+  float* out_ptr = out.data<float>();
+  context.Wait();  // synchronize before reading the copied-back result
+  EXPECT_EQ(out_ptr[0], 9);
+  EXPECT_EQ(out_ptr[1], 12);
+  EXPECT_EQ(out_ptr[2], 15);
+  EXPECT_EQ(out_ptr[3], 12);
+  EXPECT_EQ(out_ptr[4], 17);
+  EXPECT_EQ(out_ptr[5], 22);
+  EXPECT_EQ(out_ptr[6], 15);
+  EXPECT_EQ(out_ptr[7], 22);
+  EXPECT_EQ(out_ptr[8], 29);
+}
+
+TEST(math_function, gemm_notrans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  paddle::platform::CPUPlace cpu_place;  // stack objects: nothing to leak
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
+  float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::GPUPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
+
+  input1_gpu.CopyFrom(input1, gpu_place, context);
+  input2_gpu.CopyFrom(input2, gpu_place, context);
+  input3_gpu.CopyFrom(input3, gpu_place, context);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(gpu_place);
+
+  // b + 1 and c + 1 skip the first column of their 4-wide rows (ld = 4).
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
+
+  input3.CopyFrom(input3_gpu, cpu_place, context);
+
+  // numpy code:
+  // a = np.arange(6).reshape(2, 3)
+  // b = np.arange(12).reshape(3, 4)[:, 1:]
+  // c = np.arange(8).reshape(2, 4)[:, 1:]
+  // out = np.arange(8).reshape(2, 4)
+  // out[:, 1:] = np.dot(a, b) + c
+  context.Wait();  // synchronize before reading the copied-back result
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
+
+TEST(math_function, gemm_trans_cublas) {
+  paddle::framework::Tensor input1;
+  paddle::framework::Tensor input2;
+  paddle::framework::Tensor input3;
+  paddle::framework::Tensor input1_gpu;
+  paddle::framework::Tensor input2_gpu;
+  paddle::framework::Tensor input3_gpu;
+
+  int m = 2;
+  int n = 3;
+  int k = 3;
+  paddle::platform::CPUPlace cpu_place;  // stack objects: nothing to leak
+  float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
+  float arr1[6] = {0, 1, 2, 3, 4, 5};
+  memcpy(input1_ptr, arr1, 6 * sizeof(float));
+  float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
+  float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+  memcpy(input2_ptr, arr2, 12 * sizeof(float));
+  float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
+  float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
+  memcpy(input3_ptr, arr3, 8 * sizeof(float));
+
+  paddle::platform::GPUPlace gpu_place(0);
+  paddle::platform::CUDADeviceContext context(gpu_place);
+
+  input1_gpu.CopyFrom(input1, gpu_place, context);
+  input2_gpu.CopyFrom(input2, gpu_place, context);
+  input3_gpu.CopyFrom(input3, gpu_place, context);
+  float* a = input1_gpu.data<float>();
+  float* b = input2_gpu.data<float>();
+  float* c = input3_gpu.mutable_data<float>(gpu_place);
+
+  // input2 is stored as 4 x 3 and used transposed; b + 3 skips its first row.
+  paddle::operators::math::gemm<paddle::platform::GPUPlace, float>(
+      context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
+
+  input3.CopyFrom(input3_gpu, cpu_place, context);
+  context.Wait();  // synchronize before reading the copied-back result
+
+  EXPECT_EQ(input3_ptr[0], 0);
+  EXPECT_EQ(input3_ptr[1], 24);
+  EXPECT_EQ(input3_ptr[2], 28);
+  EXPECT_EQ(input3_ptr[3], 32);
+  EXPECT_EQ(input3_ptr[4], 4);
+  EXPECT_EQ(input3_ptr[5], 73);
+  EXPECT_EQ(input3_ptr[6], 86);
+  EXPECT_EQ(input3_ptr[7], 99);
+}
diff --git a/paddle/operators/math/matmul.h b/paddle/operators/math/matmul.h
new file mode 100644
index 0000000000000000000000000000000000000000..6ba9a0ba9a70bd938f9362179990ab68fa3186ba
--- /dev/null
+++ b/paddle/operators/math/matmul.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Matrix product following the semantics of numpy.matmul:
+// https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.matmul.html
+//
+// extended so that a and/or b may additionally be used transposed.
+//
+// Each operand may have rank 1, 2 or 3; a rank-3 operand is a batch of
+// matrices indexed by its first dimension. Higher ranks are rejected.
+template <typename Place, typename T>
+class MatMulFunctor {
+ public:
+  // Multiplies op(a) by op(b), where op(x) is x or its transpose as selected
+  // by trans_a / trans_b, and hands the product shape to batched_gemm when
+  // either operand is batched, otherwise to gemm. alpha and beta are forwarded
+  // unchanged (BLAS convention: out = alpha * op(a) * op(b) + beta * out).
+  // All three tensors must share a place, the inner dimensions must agree, and
+  // two batched operands need the same batch size; out's shape is not checked.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& a, bool trans_a,
+                  const framework::Tensor& b, bool trans_b, T alpha,
+                  framework::Tensor* out, T beta) {
+    auto dim_a = a.dims();
+    auto dim_b = b.dims();
+
+    PADDLE_ENFORCE(a.place() == b.place() && b.place() == out->place(),
+                   "Tensors must all be in the same place.");
+    PADDLE_ENFORCE_GE(dim_a.size(), 1,
+                      "Input tensor a must be at least 1-dimensional.");
+    PADDLE_ENFORCE_GE(dim_b.size(), 1,
+                      "Input tensor b must be at least 1-dimensional.");
+    PADDLE_ENFORCE_LE(dim_a.size(), 3,
+                      "Input tensor a must be at most 3-dimensional.");
+    PADDLE_ENFORCE_LE(dim_b.size(), 3,
+                      "Input tensor b must be at most 3-dimensional.");
+
+    // op(a) is M x kA and op(b) is kB x N.
+    int M = 0, N = 0, kA = 0, kB = 0;
+    // A zero batch count / stride marks an operand that is not batched.
+    int batchCountA = 0, batchCountB = 0, strideA = 0, strideB = 0;
+
+    if (dim_a.size() == 1) {
+      // A vector is a 1 x k row, or an m x 1 column when trans_a is set.
+      M = trans_a ? dim_a[0] : 1;
+      kA = trans_a ? 1 : dim_a[0];
+    } else if (dim_a.size() == 2) {
+      M = trans_a ? dim_a[1] : dim_a[0];
+      kA = trans_a ? dim_a[0] : dim_a[1];
+    } else if (dim_a.size() == 3) {
+      batchCountA = dim_a[0];
+      M = trans_a ? dim_a[2] : dim_a[1];
+      kA = trans_a ? dim_a[1] : dim_a[2];
+      strideA = M * kA;
+    } else {
+      assert(false);
+    }
+
+    if (dim_b.size() == 1) {
+      // A vector is a k x 1 column, or a 1 x n row when trans_b is set.
+      kB = trans_b ? 1 : dim_b[0];
+      N = trans_b ? dim_b[0] : 1;
+    } else if (dim_b.size() == 2) {
+      kB = trans_b ? dim_b[1] : dim_b[0];
+      N = trans_b ? dim_b[0] : dim_b[1];
+    } else if (dim_b.size() == 3) {
+      batchCountB = dim_b[0];
+      kB = trans_b ? dim_b[2] : dim_b[1];
+      N = trans_b ? dim_b[1] : dim_b[2];
+      strideB = kB * N;
+    } else {
+      assert(false);
+    }
+
+    PADDLE_ENFORCE_EQ(
+        kA, kB,
+        "First matrix's width must be equal with second matrix's height.");
+    if (batchCountA && batchCountB) {
+      PADDLE_ENFORCE_EQ(
+          batchCountA, batchCountB,
+          "When input tensors a and b are both batched, they must have the "
+          "same batch dimension.");
+    }
+    // A non-batched operand contributes a count of 0, so the maximum is the
+    // batch size of whichever operand is batched (or 0 when neither is).
+    const int batchCount = std::max(batchCountA, batchCountB);
+
+    const CBLAS_TRANSPOSE transA = trans_a ? CblasTrans : CblasNoTrans;
+    const CBLAS_TRANSPOSE transB = trans_b ? CblasTrans : CblasNoTrans;
+
+    if (batchCount != 0) {
+      // at least one operand is a batch of matrices
+      batched_gemm<Place, T>(context, transA, transB, M, N, kA, alpha,
+                             a.data<T>(), b.data<T>(), beta, out->data<T>(),
+                             batchCount, strideA, strideB);
+    } else {
+      // plain matrix multiplication
+      gemm<Place, T>(context, transA, transB, M, N, kA, alpha, a.data<T>(),
+                     b.data<T>(), beta, out->data<T>());
+    }
+  }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cc b/paddle/operators/math/pooling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..50cfb88bb5700dda3785e63e0ccc6457cc928da0
--- /dev/null
+++ b/paddle/operators/math/pooling.cc
@@ -0,0 +1,740 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * CPU forward pass of 2-D pooling over NCHW tensors. ksize, strides and
+ * paddings each hold {height, width}. Every window is clipped to the input,
+ * so padded cells are skipped and do not count towards the pool size.
+ */
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_c = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+    const int k_h = ksize[0];
+    const int k_w = ksize[1];
+    const int s_h = strides[0];
+    const int s_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+
+    const int in_plane = in_h * in_w;
+    const int out_plane = out_h * out_w;
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < out_c; ++c) {
+        for (int oh = 0; oh < out_h; ++oh) {
+          const int h_raw = oh * s_h - pad_h;
+          const int h_lo = std::max(h_raw, 0);
+          const int h_hi = std::min(h_raw + k_h, in_h);
+          for (int ow = 0; ow < out_w; ++ow) {
+            const int w_raw = ow * s_w - pad_w;
+            const int w_lo = std::max(w_raw, 0);
+            const int w_hi = std::min(w_raw + k_w, in_w);
+            T acc = pool_process.initial();
+            for (int h = h_lo; h < h_hi; ++h) {
+              const T* row = in_ptr + h * in_w;
+              for (int w = w_lo; w < w_hi; ++w) {
+                pool_process.compute(acc, row[w]);
+              }
+            }
+            const int window = (h_hi - h_lo) * (w_hi - w_lo);
+            pool_process.finalize(acc, static_cast<T>(window));
+            out_ptr[oh * out_w + ow] = acc;
+          }
+        }
+        in_ptr += in_plane;
+        out_ptr += out_plane;
+      }
+    }
+  }
+};
+
+/*
+ * CPU backward pass of 2-D pooling over NCHW tensors. ksize, strides and
+ * paddings each hold {height, width}. The averaging scale 1 / pool_size is
+ * kept in T, so double-precision gradients are not rounded through float.
+ */
+template <typename PoolProcess, class T>
+class Pool2dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            int pool_size = (hend - hstart) * (wend - wstart);
+            const T scale = static_cast<T>(1.0 / pool_size);
+            const int output_idx = ph * output_width + pw;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int input_idx = h * input_width + w;
+                pool_grad_process.compute(
+                    input_data[input_idx], output_data[output_idx],
+                    output_grad_data[output_idx], input_grad_data[input_idx],
+                    scale);
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * CPU backward pass of 2-D max pooling over NCHW tensors. ksize, strides and
+ * paddings each hold {height, width}. Per window, the output gradient is
+ * added to the first input cell whose value equals the pooled output.
+ */
+template <class T>
+class MaxPool2dGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_c = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+    const int k_h = ksize[0];
+    const int k_w = ksize[1];
+    const int s_h = strides[0];
+    const int s_w = strides[1];
+    const int pad_h = paddings[0];
+    const int pad_w = paddings[1];
+    const int in_plane = in_h * in_w;
+    const int out_plane = out_h * out_w;
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* grad_out = output_grad.data<T>();
+    T* grad_in = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < out_c; ++c) {
+        for (int oh = 0; oh < out_h; ++oh) {
+          const int h_raw = oh * s_h - pad_h;
+          const int h_lo = std::max(h_raw, 0);
+          const int h_hi = std::min(h_raw + k_h, in_h);
+          for (int ow = 0; ow < out_w; ++ow) {
+            const int w_raw = ow * s_w - pad_w;
+            const int w_lo = std::max(w_raw, 0);
+            const int w_hi = std::min(w_raw + k_w, in_w);
+            const int out_idx = oh * out_w + ow;
+            // Scan row-major and stop at the first cell equal to the output.
+            int hit = -1;
+            for (int h = h_lo; h < h_hi && hit < 0; ++h) {
+              for (int w = w_lo; w < w_hi && hit < 0; ++w) {
+                const int in_idx = h * in_w + w;
+                if (in_ptr[in_idx] == out_ptr[out_idx]) hit = in_idx;
+              }
+            }
+            if (hit >= 0) {
+              grad_in[hit] += grad_out[out_idx];
+            }
+          }
+        }
+        in_ptr += in_plane;
+        out_ptr += out_plane;
+        grad_in += in_plane;
+        grad_out += out_plane;
+      }
+    }
+  }
+};
+
+// Explicit instantiations: the CPU 2-D pooling functors are compiled here for
+// float and double with the max / average pooling policies.
+template class MaxPool2dGradFunctor<platform::CPUPlace, float>;
+template class MaxPool2dGradFunctor<platform::CPUPlace, double>;
+
+// float
+template class Pool2dFunctor<platform::CPUPlace, MaxPool<float>, float>;
+template class Pool2dFunctor<platform::CPUPlace, AvgPool<float>, float>;
+template class Pool2dGradFunctor<platform::CPUPlace, MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::CPUPlace, AvgPoolGrad<float>,
+                                 float>;
+// double
+template class Pool2dFunctor<platform::CPUPlace, MaxPool<double>, double>;
+template class Pool2dFunctor<platform::CPUPlace, AvgPool<double>, double>;
+template class Pool2dGradFunctor<platform::CPUPlace, MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::CPUPlace, AvgPoolGrad<double>,
+                                 double>;
+
+/*
+ * CPU forward pass of 3-D pooling over NCDHW tensors. ksize, strides and
+ * paddings each hold {depth, height, width}. pool_process provides the
+ * reduction through its initial() / compute() / finalize() members.
+ */
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_c = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+    const int k_d = ksize[0];
+    const int k_h = ksize[1];
+    const int k_w = ksize[2];
+    const int s_d = strides[0];
+    const int s_h = strides[1];
+    const int s_w = strides[2];
+    const int pad_d = paddings[0];
+    const int pad_h = paddings[1];
+    const int pad_w = paddings[2];
+
+    const int in_volume = in_d * in_h * in_w;
+    const int out_volume = out_d * out_h * out_w;
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < out_c; ++c) {
+        for (int od = 0; od < out_d; ++od) {
+          const int d_raw = od * s_d - pad_d;
+          const int d_lo = std::max(d_raw, 0);
+          const int d_hi = std::min(d_raw + k_d, in_d);
+          for (int oh = 0; oh < out_h; ++oh) {
+            const int h_raw = oh * s_h - pad_h;
+            const int h_lo = std::max(h_raw, 0);
+            const int h_hi = std::min(h_raw + k_h, in_h);
+            for (int ow = 0; ow < out_w; ++ow) {
+              const int w_raw = ow * s_w - pad_w;
+              const int w_lo = std::max(w_raw, 0);
+              const int w_hi = std::min(w_raw + k_w, in_w);
+              const int out_idx = (od * out_h + oh) * out_w + ow;
+              // Reduce the clipped window; padded cells are never visited.
+              T acc = pool_process.initial();
+              for (int d = d_lo; d < d_hi; ++d) {
+                for (int h = h_lo; h < h_hi; ++h) {
+                  const T* row = in_ptr + (d * in_h + h) * in_w;
+                  for (int w = w_lo; w < w_hi; ++w) {
+                    pool_process.compute(acc, row[w]);
+                  }
+                }
+              }
+              // The pool size counts only the cells inside the input.
+              const int window = (d_hi - d_lo) * (h_hi - h_lo) * (w_hi - w_lo);
+              pool_process.finalize(acc, static_cast<T>(window));
+              out_ptr[out_idx] = acc;
+            }
+          }
+        }
+        in_ptr += in_volume;
+        out_ptr += out_volume;
+      }
+    }
+  }
+};
+
+/*
+ * CPU backward pass of 3-D pooling over NCDHW tensors. ksize, strides and
+ * paddings each hold {depth, height, width}. The scale 1 / pool_size is
+ * kept in T, so double-precision gradients are not rounded through float.
+ */
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::CPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_grad_process) {
+    const int batch_size = input.dims()[0];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    const int output_channels = output.dims()[1];
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int pd = 0; pd < output_depth; ++pd) {
+          int dstart = pd * stride_depth - padding_depth;
+          int dend = std::min(dstart + ksize_depth, input_depth);
+          dstart = std::max(dstart, 0);
+          for (int ph = 0; ph < output_height; ++ph) {
+            int hstart = ph * stride_height - padding_height;
+            int hend = std::min(hstart + ksize_height, input_height);
+            hstart = std::max(hstart, 0);
+
+            for (int pw = 0; pw < output_width; ++pw) {
+              int wstart = pw * stride_width - padding_width;
+              int wend = std::min(wstart + ksize_width, input_width);
+              wstart = std::max(wstart, 0);
+
+              int pool_size =
+                  (dend - dstart) * (hend - hstart) * (wend - wstart);
+              const T scale = static_cast<T>(1.0 / pool_size);
+              const int output_idx =
+                  (pd * output_height + ph) * output_width + pw;
+              for (int d = dstart; d < dend; ++d) {
+                for (int h = hstart; h < hend; ++h) {
+                  for (int w = wstart; w < wend; ++w) {
+                    int input_idx = (d * input_height + h) * input_width + w;
+                    pool_grad_process.compute(
+                        input_data[input_idx], output_data[output_idx],
+                        output_grad_data[output_idx],
+                        input_grad_data[input_idx], scale);
+                  }
+                }
+              }
+            }
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * CPU backward pass of 3-D max pooling over NCDHW tensors. ksize, strides
+ * and paddings each hold {depth, height, width}. Per window, the output
+ * gradient is added to the first input cell equal to the pooled output.
+ */
+template <class T>
+class MaxPool3dGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_c = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+    const int k_d = ksize[0];
+    const int k_h = ksize[1];
+    const int k_w = ksize[2];
+    const int s_d = strides[0];
+    const int s_h = strides[1];
+    const int s_w = strides[2];
+    const int pad_d = paddings[0];
+    const int pad_h = paddings[1];
+    const int pad_w = paddings[2];
+    const int in_volume = in_d * in_h * in_w;
+    const int out_volume = out_d * out_h * out_w;
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* grad_out = output_grad.data<T>();
+    T* grad_in = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < out_c; ++c) {
+        for (int od = 0; od < out_d; ++od) {
+          const int d_raw = od * s_d - pad_d;
+          const int d_lo = std::max(d_raw, 0);
+          const int d_hi = std::min(d_raw + k_d, in_d);
+          for (int oh = 0; oh < out_h; ++oh) {
+            const int h_raw = oh * s_h - pad_h;
+            const int h_lo = std::max(h_raw, 0);
+            const int h_hi = std::min(h_raw + k_h, in_h);
+            for (int ow = 0; ow < out_w; ++ow) {
+              const int w_raw = ow * s_w - pad_w;
+              const int w_lo = std::max(w_raw, 0);
+              const int w_hi = std::min(w_raw + k_w, in_w);
+              const int out_idx = (od * out_h + oh) * out_w + ow;
+              // Scan d, h, w in order; stop at the first cell equal to the output.
+              int hit = -1;
+              for (int d = d_lo; d < d_hi && hit < 0; ++d) {
+                for (int h = h_lo; h < h_hi && hit < 0; ++h) {
+                  for (int w = w_lo; w < w_hi && hit < 0; ++w) {
+                    const int in_idx = (d * in_h + h) * in_w + w;
+                    if (in_ptr[in_idx] == out_ptr[out_idx]) {
+                      hit = in_idx;
+                    }
+                  }
+                }
+              }
+              if (hit >= 0) {
+                grad_in[hit] += grad_out[out_idx];
+              }
+            }
+          }
+        }
+        in_ptr += in_volume;
+        out_ptr += out_volume;
+        grad_in += in_volume;
+        grad_out += out_volume;
+      }
+    }
+  }
+};
+
+// Explicit instantiations: the CPU 3-D pooling functors are compiled here for
+// float and double with the max / average pooling policies.
+template class MaxPool3dGradFunctor<platform::CPUPlace, float>;
+template class MaxPool3dGradFunctor<platform::CPUPlace, double>;
+
+// float
+template class Pool3dFunctor<platform::CPUPlace, MaxPool<float>, float>;
+template class Pool3dFunctor<platform::CPUPlace, AvgPool<float>, float>;
+template class Pool3dGradFunctor<platform::CPUPlace, MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::CPUPlace, AvgPoolGrad<float>,
+                                 float>;
+// double
+template class Pool3dFunctor<platform::CPUPlace, MaxPool<double>, double>;
+template class Pool3dFunctor<platform::CPUPlace, AvgPool<double>, double>;
+template class Pool3dGradFunctor<platform::CPUPlace, MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::CPUPlace, AvgPoolGrad<double>,
+                                 double>;
+
+/*
+ * Max pooling over NCHW tensors (ksize/strides/paddings = {height, width});
+ * mask receives the in-plane index h * input_width + w of each maximum. The
+ * first window cell and any cell following a NaN maximum are always accepted.
+ */
+template <typename T>
+class MaxPool2dWithIndexFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const int ksize_height = ksize[0];
+    const int ksize_width = ksize[1];
+    const int stride_height = strides[0];
+    const int stride_width = strides[1];
+    const int padding_height = paddings[0];
+    const int padding_width = paddings[1];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* input_data = input.data<T>();
+    T* output_data = output.mutable_data<T>(context.GetPlace());
+    T* mask_data = mask.mutable_data<T>(context.GetPlace());
+
+    for (int i = 0; i < batch_size; i++) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int ph = 0; ph < output_height; ++ph) {
+          int hstart = ph * stride_height - padding_height;
+          int hend = std::min(hstart + ksize_height, input_height);
+          hstart = std::max(hstart, 0);
+          for (int pw = 0; pw < output_width; ++pw) {
+            int wstart = pw * stride_width - padding_width;
+            int wend = std::min(wstart + ksize_width, input_width);
+            wstart = std::max(wstart, 0);
+            // First cell always wins, so a non-empty window never yields -1.
+            T ele = static_cast<T>(-FLT_MAX);
+            int index = -1;
+            for (int h = hstart; h < hend; ++h) {
+              for (int w = wstart; w < wend; ++w) {
+                const int input_idx = h * input_width + w;
+                if (index < 0 || ele != ele || ele < input_data[input_idx]) {
+                  ele = input_data[input_idx];
+                  index = input_idx;
+                }
+              }
+            }
+            output_data[ph * output_width + pw] = ele;
+            mask_data[ph * output_width + pw] = index;
+          }
+        }
+        input_data += input_stride;
+        output_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+/*
+ * CPU backward pass of 2-D max pooling with index over NCHW tensors: each
+ * output gradient is added to the input cell whose in-plane index the
+ * forward pass stored in mask. ksize, strides and paddings are not used.
+ */
+template <typename T>
+class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input_grad.dims()[0];
+    const int input_height = input_grad.dims()[2];
+    const int input_width = input_grad.dims()[3];
+    const int output_channels = output_grad.dims()[1];
+    const int output_height = output_grad.dims()[2];
+    const int output_width = output_grad.dims()[3];
+    const int input_stride = input_height * input_width;
+    const int output_stride = output_height * output_width;
+
+    const T* mask_data = mask.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int output_idx = 0; output_idx < output_stride; ++output_idx) {
+          const int input_idx = static_cast<int>(mask_data[output_idx]);
+          // A negative entry is the "no maximum found" sentinel (-1) of the
+          // forward pass; using it as an index would write before the plane.
+          if (input_idx < 0) continue;
+          input_grad_data[input_idx] += output_grad_data[output_idx];
+        }
+        // offset
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, float>;
+template class MaxPool2dWithIndexFunctor<platform::CPUPlace, double>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, float>;
+template class MaxPool2dWithIndexGradFunctor<platform::CPUPlace, double>;
+
+/*
+ * CPU forward pass of 3-D max pooling that also records where each maximum
+ * was found.
+ *
+ * Tensors are indexed as NCDHW. ksize, strides and paddings each hold three
+ * values ordered depth, height, width.
+ *
+ * For every (batch, channel) volume and every output position, the window
+ * starting at (position * stride - padding) and spanning ksize elements is
+ * clipped to the input extent and scanned. The largest value is written to
+ * `output`; its flat offset inside that D*H*W input volume is written to
+ * `mask` (stored as T). If no element of the window compares greater than
+ * -FLT_MAX, output is -FLT_MAX and mask is -1.
+ *
+ * NOTE(review): the running maximum is seeded with -FLT_MAX even when T is
+ * double, so doubles at or below -FLT_MAX are never selected -- confirm that
+ * this is acceptable.
+ */
+template <typename T>
+class MaxPool3dWithIndexFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Input extents.
+    const int num_batches = input.dims()[0];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    // Output extents; the channel count is taken from the output tensor.
+    const int num_channels = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+    // Window geometry, ordered depth / height / width.
+    const int k_d = ksize[0];
+    const int k_h = ksize[1];
+    const int k_w = ksize[2];
+    const int step_d = strides[0];
+    const int step_h = strides[1];
+    const int step_w = strides[2];
+    const int pad_d = paddings[0];
+    const int pad_h = paddings[1];
+    const int pad_w = paddings[2];
+    // Number of elements in one (batch, channel) volume.
+    const int in_volume = in_d * in_h * in_w;
+    const int out_volume = out_d * out_h * out_w;
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+    T* mask_ptr = mask.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < num_batches; n++) {
+      for (int c = 0; c < num_channels; ++c) {
+        for (int od = 0; od < out_d; ++od) {
+          const int d_origin = od * step_d - pad_d;
+          const int d_hi = std::min(d_origin + k_d, in_d);
+          const int d_lo = std::max(d_origin, 0);
+          for (int oh = 0; oh < out_h; ++oh) {
+            const int h_origin = oh * step_h - pad_h;
+            const int h_hi = std::min(h_origin + k_h, in_h);
+            const int h_lo = std::max(h_origin, 0);
+            for (int ow = 0; ow < out_w; ++ow) {
+              const int w_origin = ow * step_w - pad_w;
+              const int w_hi = std::min(w_origin + k_w, in_w);
+              const int w_lo = std::max(w_origin, 0);
+
+              // Scan the clipped window for its largest element.
+              T best_val = static_cast<T>(-FLT_MAX);
+              int best_offset = -1;
+              for (int d = d_lo; d < d_hi; ++d) {
+                for (int h = h_lo; h < h_hi; ++h) {
+                  const int row_base = (d * in_h + h) * in_w;
+                  for (int w = w_lo; w < w_hi; ++w) {
+                    const T candidate = in_ptr[row_base + w];
+                    if (best_val < candidate) {
+                      best_val = candidate;
+                      best_offset = row_base + w;
+                    }
+                  }
+                }
+              }
+
+              const int out_offset = (od * out_h + oh) * out_w + ow;
+              out_ptr[out_offset] = best_val;
+              mask_ptr[out_offset] = best_offset;
+            }
+          }
+        }
+        // Step all three pointers to the next (batch, channel) volume.
+        in_ptr += in_volume;
+        out_ptr += out_volume;
+        mask_ptr += out_volume;
+      }
+    }
+  }
+};
+
+/*
+ * CPU backward pass of 3-D max pooling with index.
+ *
+ * Tensors are indexed as NCDHW. For each (batch, channel) volume, every
+ * output-gradient element is added to the input-gradient element whose flat
+ * offset (within that D*H*W input volume) is stored in `mask`.
+ *
+ * Negative mask entries are skipped. The CPU forward functor in this file
+ * stores -1 when no window element was selected; using that value as an
+ * offset would write one element before the start of the current volume.
+ *
+ * input_grad is accumulated into with += and is never cleared here.
+ * NOTE(review): presumably the caller zero-fills input_grad -- confirm.
+ *
+ * ksize, strides and paddings are not read; they stay in the signature so
+ * the interface is unchanged.
+ */
+template <typename T>
+class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int batch_size = input_grad.dims()[0];
+    const int input_depth = input_grad.dims()[2];
+    const int input_height = input_grad.dims()[3];
+    const int input_width = input_grad.dims()[4];
+    const int output_channels = output_grad.dims()[1];
+    const int output_depth = output_grad.dims()[2];
+    const int output_height = output_grad.dims()[3];
+    const int output_width = output_grad.dims()[4];
+    // Number of elements in one (batch, channel) volume.
+    const int input_stride = input_depth * input_height * input_width;
+    const int output_stride = output_depth * output_height * output_width;
+
+    const T* mask_data = mask.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    for (int n = 0; n < batch_size; ++n) {
+      for (int c = 0; c < output_channels; ++c) {
+        // Visit the output volume in flat order; this is the same order as
+        // nesting depth, height and width loops.
+        for (int output_idx = 0; output_idx < output_stride; ++output_idx) {
+          const int input_idx = static_cast<int>(mask_data[output_idx]);
+          if (input_idx < 0) {
+            // No source element was recorded for this output position.
+            continue;
+          }
+          input_grad_data[input_idx] += output_grad_data[output_idx];
+        }
+        // offset to the next (batch, channel) volume
+        input_grad_data += input_stride;
+        output_grad_data += output_stride;
+        mask_data += output_stride;
+      }
+    }
+  }
+};
+
+// Explicit CPU instantiations of the 3-D max-pool-with-index functors:
+// the forward functor first, then its gradient, each for float and double.
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, float>;
+template class MaxPool3dWithIndexFunctor<platform::CPUPlace, double>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, float>;
+template class MaxPool3dWithIndexGradFunctor<platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.cu b/paddle/operators/math/pooling.cu
new file mode 100644
index 0000000000000000000000000000000000000000..736327f4b7b9e9df9ce8f7f60b0437fc1d2d373a
--- /dev/null
+++ b/paddle/operators/math/pooling.cu
@@ -0,0 +1,1059 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/pooling.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * Generic 2-D pooling forward kernel.
+ *
+ * `nthreads` is the number of output elements; each loop iteration produces
+ * one of them. The flat `index` is decoded as (batch, channel, ph, pw) in NCHW
+ * order, the pooling window is clipped to the input plane, and the window is
+ * folded with pool_process.initial() / compute() / finalize(). finalize()
+ * receives the size of the clipped window.
+ *
+ * The input plane is addressed through a per-iteration local pointer. The
+ * kernel argument `input_data` is never advanced, so the result stays correct
+ * when a thread runs the grid-stride loop more than once.
+ */
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2D(const int nthreads, const T* input_data,
+                             T* output_data, const int channels,
+                             const int input_height, const int input_width,
+                             const int output_height, const int output_width,
+                             const int ksize_height, const int ksize_width,
+                             const int stride_height, const int stride_width,
+                             const int padding_height, const int padding_width,
+                             PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    // Decode the flat output index.
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    // Pooling window, clipped to the input plane.
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    // Plane of (batch_idx, c), recomputed from the base pointer each time.
+    const T* input_plane =
+        input_data + (batch_idx * channels + c) * input_height * input_width;
+    T ele = pool_process.initial();
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        pool_process.compute(ele, input_plane[h * input_width + w]);
+      }
+    }
+    int pool_size = (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, (static_cast<T>(pool_size)));
+    output_data[index] = ele;
+  }
+}
+
+/*
+ * Generic 2-D pooling backward kernel.
+ *
+ * `nthreads` is the number of input elements; each loop iteration produces
+ * the gradient of one of them. The flat `index` is decoded as
+ * (batch, channel, h, w) in NCHW order; offsetH / offsetW are the element's
+ * coordinates shifted by the padding. [phstart, phend) x [pwstart, pwend) is
+ * the range of output positions visited for this element. For each one,
+ * pool_process.compute() is called with the input value, the output value,
+ * the output gradient, the running `gradient` and 1 / (clipped window size).
+ * The final `gradient` overwrites input_grad[index].
+ *
+ * The output planes are addressed through per-iteration local pointers. The
+ * kernel arguments `output_data` / `output_grad` are never advanced, so the
+ * result stays correct when a thread runs the grid-stride loop more than
+ * once.
+ */
+template <typename PoolProcess, typename T>
+__global__ void KernelPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width, PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    // Decode the flat input index; spatial coordinates include the padding.
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetC = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+
+    // Range of output positions visited for this input element.
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int phend = min(offsetH / stride_height + 1, output_height);
+    int pwend = min(offsetW / stride_width + 1, output_width);
+    T gradient = 0;
+    T input = input_data[index];
+    // Output planes of (batch_idx, offsetC), recomputed from the base
+    // pointers each time.
+    int output_idx =
+        (batch_idx * channels + offsetC) * output_height * output_width;
+    const T* output_plane = output_data + output_idx;
+    const T* output_grad_plane = output_grad + output_idx;
+    for (int ph = phstart; ph < phend; ++ph) {
+      for (int pw = pwstart; pw < pwend; ++pw) {
+        // Size of this output position's window after clipping.
+        int hstart = ph * stride_height - padding_height;
+        int wstart = pw * stride_width - padding_width;
+        int hend = min(hstart + ksize_height, input_height);
+        int wend = min(wstart + ksize_width, input_width);
+        hstart = max(hstart, 0);
+        wstart = max(wstart, 0);
+        int pool_size = (hend - hstart) * (wend - wstart);
+        int output_sub_idx = ph * output_width + pw;
+        pool_process.compute(input, output_plane[output_sub_idx],
+                             output_grad_plane[output_sub_idx], gradient,
+                             static_cast<T>(1.0 / pool_size));
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * 2-D max pooling backward kernel.
+ *
+ * `nthreads` is the number of output elements; each loop iteration handles
+ * one of them. The flat `index` is decoded as (batch, channel, ph, pw) in NCHW
+ * order. The clipped pooling window is scanned in row-major order for the
+ * first input value equal to output_data[index], and output_grad[index] is
+ * atomically added to the matching input_grad element. Nothing is written
+ * when no element matches.
+ *
+ * input_grad is only ever added to.
+ * NOTE(review): presumably it must be zero-filled before launch -- confirm.
+ *
+ * The input planes are addressed through per-iteration local pointers. The
+ * kernel arguments `input_data` / `input_grad` are never advanced, so the
+ * result stays correct when a thread runs the grid-stride loop more than
+ * once.
+ */
+template <typename T>
+__global__ void KernelMaxPool2DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_height, const int input_width, const int output_height,
+    const int output_width, const int ksize_height, const int ksize_width,
+    const int stride_height, const int stride_width, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    // Decode the flat output index.
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    // Pooling window, clipped to the input plane.
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    // Planes of (batch_idx, c), recomputed from the base pointers each time.
+    const int plane_offset =
+        (batch_idx * channels + c) * input_height * input_width;
+    const T* input_plane = input_data + plane_offset;
+    T* input_grad_plane = input_grad + plane_offset;
+
+    T ele = output_data[index];
+    int maxIndex = -1;
+    bool stop = false;
+    for (int h = hstart; h < hend && !stop; ++h) {
+      for (int w = wstart; w < wend && !stop; ++w) {
+        if (ele == input_plane[h * input_width + w]) {
+          maxIndex = h * input_width + w;
+          stop = true;
+        }
+      }
+    }
+
+    if (maxIndex != -1) {
+      // atomic add
+      platform::CudaAtomicAdd(input_grad_plane + maxIndex, output_grad[index]);
+    }
+  }
+}
+
+/*
+ * GPU launcher for KernelPool2D.
+ *
+ * input and output are read as NCHW. ksize, strides and paddings each carry
+ * two values ordered height, width. One kernel work item is scheduled per
+ * output element, in blocks of 1024 threads, on the stream of `context`.
+ *
+ * NOTE(review): `context` is reinterpret_cast to CUDADeviceContext, so it
+ * must really be one -- confirm at the call sites.
+ */
+template <typename PoolProcess, typename T>
+class Pool2dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_channels = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+
+    // One work item per output element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * out_channels * out_h * out_w;
+    const int num_blocks = (work_items + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool2D<PoolProcess, T>
+        <<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+            work_items, in_ptr, out_ptr, in_channels, in_h, in_w, out_h,
+            out_w, ksize[0], ksize[1], strides[0], strides[1], paddings[0],
+            paddings[1], pool_process);
+  }
+};
+
+/*
+ * GPU launcher for KernelPool2DGrad.
+ *
+ * All tensors are read as NCHW. ksize, strides and paddings each carry two
+ * values ordered height, width. One kernel work item is scheduled per input
+ * element, in blocks of 1024 threads, on the stream of `context`.
+ *
+ * NOTE(review): `context` is reinterpret_cast to CUDADeviceContext, so it
+ * must really be one -- confirm at the call sites.
+ */
+template <typename PoolProcess, typename T>
+class Pool2dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+
+    // One work item per input element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * in_channels * in_h * in_w;
+    const int num_blocks = (work_items + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool2DGrad<PoolProcess, T>
+        <<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+            work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr,
+            in_channels, in_h, in_w, out_h, out_w, ksize[0], ksize[1],
+            strides[0], strides[1], paddings[0], paddings[1], pool_process);
+  }
+};
+
+/*
+ * GPU launcher for KernelMaxPool2DGrad.
+ *
+ * All tensors are read as NCHW. ksize, strides and paddings each carry two
+ * values ordered height, width. One kernel work item is scheduled per output
+ * element, in blocks of 1024 threads, on the stream of `context`. The
+ * channel count handed to the kernel is the input's.
+ *
+ * The kernel only adds into input_grad.
+ * NOTE(review): presumably the caller zero-fills input_grad -- confirm.
+ * NOTE(review): `context` is reinterpret_cast to CUDADeviceContext, so it
+ * must really be one -- confirm at the call sites.
+ */
+template <typename T>
+class MaxPool2dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_channels = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+
+    // One work item per output element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * out_channels * out_h * out_w;
+    const int num_blocks = (work_items + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelMaxPool2DGrad<T><<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+        work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr, in_channels,
+        in_h, in_w, out_h, out_w, ksize[0], ksize[1], strides[0], strides[1],
+        paddings[0], paddings[1]);
+  }
+};
+
+// Explicit GPU instantiations of the 2-D pooling functors, grouped by
+// functor. The pool-process types live in this namespace
+// (paddle::operators::math), so they are named without qualification.
+
+// Forward pooling.
+template class Pool2dFunctor<platform::GPUPlace, MaxPool<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace, MaxPool<double>, double>;
+template class Pool2dFunctor<platform::GPUPlace, AvgPool<float>, float>;
+template class Pool2dFunctor<platform::GPUPlace, AvgPool<double>, double>;
+
+// Generic backward pooling.
+template class Pool2dGradFunctor<platform::GPUPlace, MaxPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::GPUPlace, MaxPoolGrad<double>,
+                                 double>;
+template class Pool2dGradFunctor<platform::GPUPlace, AvgPoolGrad<float>,
+                                 float>;
+template class Pool2dGradFunctor<platform::GPUPlace, AvgPoolGrad<double>,
+                                 double>;
+
+// Dedicated max-pool backward.
+template class MaxPool2dGradFunctor<platform::GPUPlace, float>;
+template class MaxPool2dGradFunctor<platform::GPUPlace, double>;
+
+/*
+ * Generic 3-D pooling forward kernel.
+ *
+ * `nthreads` is the number of output elements; each loop iteration produces
+ * one of them. The flat `index` is decoded as (batch, channel, pd, ph, pw) in
+ * NCDHW order, the pooling window is clipped to the input volume, and the
+ * window is folded with pool_process.initial() / compute() / finalize().
+ * finalize() receives the size of the clipped window.
+ *
+ * The input volume is addressed through a per-iteration local pointer. The
+ * kernel argument `input_data` is never advanced, so the result stays correct
+ * when a thread runs the grid-stride loop more than once.
+ */
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3D(
+    const int nthreads, const T* input_data, T* output_data, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    // Decode the flat output index.
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    // Pooling window, clipped to the input volume.
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = pool_process.initial();
+    // Volume of (batch_idx, c), recomputed from the base pointer each time.
+    const T* input_volume =
+        input_data +
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          pool_process.compute(
+              ele, input_volume[(d * input_height + h) * input_width + w]);
+        }
+      }
+    }
+    int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+    pool_process.finalize(ele, static_cast<T>(pool_size));
+    output_data[index] = ele;
+  }
+}
+
+/*
+ * Generic 3-D pooling backward kernel.
+ *
+ * `nthreads` is the number of input elements; each loop iteration produces
+ * the gradient of one of them. The flat `index` is decoded as
+ * (batch, channel, d, h, w) in NCDHW order; offsetD / offsetH / offsetW are
+ * the element's coordinates shifted by the padding. The pdstart..pdend,
+ * phstart..phend and pwstart..pwend ranges give the output positions visited
+ * for this element. For each one, pool_process.compute() is called with the
+ * input value, the output value, the output gradient, the running `gradient`
+ * and 1 / (clipped window size). The final `gradient` overwrites
+ * input_grad[index].
+ *
+ * The output volumes are addressed through per-iteration local pointers. The
+ * kernel arguments `output_data` / `output_grad` are never advanced, so the
+ * result stays correct when a thread runs the grid-stride loop more than
+ * once.
+ */
+template <typename PoolProcess, typename T>
+__global__ void KernelPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height, const int padding_width,
+    PoolProcess pool_process) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    // Decode the flat input index; spatial coordinates include the padding.
+    int offsetW = index % input_width + padding_width;
+    int offsetH = (index / input_width) % input_height + padding_height;
+    int offsetD =
+        (index / input_width / input_height) % input_depth + padding_depth;
+    int offsetC = (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    // Range of output positions visited for this input element.
+    int pdstart = (offsetD < ksize_depth)
+                      ? 0
+                      : (offsetD - ksize_depth) / stride_depth + 1;
+    int phstart = (offsetH < ksize_height)
+                      ? 0
+                      : (offsetH - ksize_height) / stride_height + 1;
+    int pwstart = (offsetW < ksize_width)
+                      ? 0
+                      : (offsetW - ksize_width) / stride_width + 1;
+    int pdend = min((offsetD) / stride_depth + 1, output_depth);
+    int phend = min((offsetH) / stride_height + 1, output_height);
+    int pwend = min((offsetW) / stride_width + 1, output_width);
+
+    T gradient = 0;
+    T input = input_data[index];
+    // Output volumes of (batch_idx, offsetC), recomputed from the base
+    // pointers each time.
+    int output_idx = (batch_idx * channels + offsetC) * output_depth *
+                     output_height * output_width;
+    const T* output_volume = output_data + output_idx;
+    const T* output_grad_volume = output_grad + output_idx;
+
+    for (int pd = pdstart; pd < pdend; ++pd) {
+      for (int ph = phstart; ph < phend; ++ph) {
+        for (int pw = pwstart; pw < pwend; ++pw) {
+          // figure out the pooling size
+          int dstart = pd * stride_depth - padding_depth;
+          int hstart = ph * stride_height - padding_height;
+          int wstart = pw * stride_width - padding_width;
+          int dend = min(dstart + ksize_depth, input_depth);
+          int hend = min(hstart + ksize_height, input_height);
+          int wend = min(wstart + ksize_width, input_width);
+          dstart = max(dstart, 0);
+          hstart = max(hstart, 0);
+          wstart = max(wstart, 0);
+          int pool_size = (dend - dstart) * (hend - hstart) * (wend - wstart);
+          int output_sub_idx = (pd * output_height + ph) * output_width + pw;
+          pool_process.compute(input, output_volume[output_sub_idx],
+                               output_grad_volume[output_sub_idx], gradient,
+                               static_cast<T>(1.0 / pool_size));
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * 3-D max pooling backward kernel.
+ *
+ * `nthreads` is the number of output elements; each loop iteration handles
+ * one of them. The flat `index` is decoded as (batch, channel, pd, ph, pw) in
+ * NCDHW order. The clipped pooling window is scanned in depth / height /
+ * width order for the first input value equal to output_data[index], and
+ * output_grad[index] is atomically added to the matching input_grad element.
+ * Nothing is written when no element matches.
+ *
+ * input_grad is only ever added to.
+ * NOTE(review): presumably it must be zero-filled before launch -- confirm.
+ *
+ * The input volumes are addressed through per-iteration local pointers. The
+ * kernel arguments `input_data` / `input_grad` are never advanced, so the
+ * result stays correct when a thread runs the grid-stride loop more than
+ * once.
+ */
+template <typename T>
+__global__ void KernelMaxPool3DGrad(
+    const int nthreads, const T* input_data, const T* output_data,
+    const T* output_grad, T* input_grad, const int channels,
+    const int input_depth, const int input_height, const int input_width,
+    const int output_depth, const int output_height, const int output_width,
+    const int ksize_depth, const int ksize_height, const int ksize_width,
+    const int stride_depth, const int stride_height, const int stride_width,
+    const int padding_depth, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    // Decode the flat output index.
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+    // Pooling window, clipped to the input volume.
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+    T ele = output_data[index];
+    bool stop = false;
+    int maxIdx = -1;
+    // Volumes of (batch_idx, c), recomputed from the base pointers each time.
+    const int volume_offset =
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+    const T* input_volume = input_data + volume_offset;
+    T* input_grad_volume = input_grad + volume_offset;
+
+    for (int d = dstart; d < dend && !stop; ++d) {
+      for (int h = hstart; h < hend && !stop; ++h) {
+        for (int w = wstart; w < wend && !stop; ++w) {
+          if (ele == input_volume[(d * input_height + h) * input_width + w]) {
+            stop = true;
+            maxIdx = (d * input_height + h) * input_width + w;
+          }
+        }
+      }
+    }
+    if (maxIdx != -1) {
+      // atomic add
+      platform::CudaAtomicAdd(input_grad_volume + maxIdx, output_grad[index]);
+    }
+  }
+}
+
+/*
+ * GPU launcher for KernelPool3D.
+ *
+ * input and output are read as NCDHW. ksize, strides and paddings each carry
+ * three values ordered depth, height, width. One kernel work item is
+ * scheduled per output element, in blocks of 1024 threads, on the stream of
+ * `context`.
+ *
+ * NOTE(review): `context` is reinterpret_cast to CUDADeviceContext, so it
+ * must really be one -- confirm at the call sites.
+ */
+template <typename PoolProcess, class T>
+class Pool3dFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_process) {
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_channels = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+
+    // One work item per output element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * out_channels * out_d * out_h * out_w;
+    const int num_blocks = (work_items + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelPool3D<PoolProcess, T>
+        <<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+            work_items, in_ptr, out_ptr, in_channels, in_d, in_h, in_w, out_d,
+            out_h, out_w, ksize[0], ksize[1], ksize[2], strides[0],
+            strides[1], strides[2], paddings[0], paddings[1], paddings[2],
+            pool_process);
+  }
+};
+
+/*
+ * GPU launcher for KernelPool3DGrad.
+ *
+ * All tensors are read as NCDHW. ksize, strides and paddings each carry
+ * three values ordered depth, height, width. One kernel work item is
+ * scheduled per input element, in blocks of 1024 threads, on the stream of
+ * `context`.
+ *
+ * NOTE(review): `context` is reinterpret_cast to CUDADeviceContext, so it
+ * must really be one -- confirm at the call sites.
+ */
+template <typename PoolProcess, class T>
+class Pool3dGradFunctor<platform::GPUPlace, PoolProcess, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_process) {
+    const int batch_size = input.dims()[0];
+    const int input_channels = input.dims()[1];
+    const int input_depth = input.dims()[2];
+    const int input_height = input.dims()[3];
+    const int input_width = input.dims()[4];
+    // The output channel count is not needed here: the kernel is indexed by
+    // input element and receives the input channel count.
+    const int output_depth = output.dims()[2];
+    const int output_height = output.dims()[3];
+    const int output_width = output.dims()[4];
+    const int ksize_depth = ksize[0];
+    const int ksize_height = ksize[1];
+    const int ksize_width = ksize[2];
+    const int stride_depth = strides[0];
+    const int stride_height = strides[1];
+    const int stride_width = strides[2];
+    const int padding_depth = paddings[0];
+    const int padding_height = paddings[1];
+    const int padding_width = paddings[2];
+
+    const T* input_data = input.data<T>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad.mutable_data<T>(context.GetPlace());
+
+    // One work item per input element, rounded up to whole blocks.
+    int nthreads =
+        batch_size * input_channels * input_depth * input_height * input_width;
+    int blocks = (nthreads + 1024 - 1) / 1024;
+    dim3 threads(1024, 1);
+    dim3 grid(blocks, 1);
+
+    KernelPool3DGrad<
+        PoolProcess,
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(
+        nthreads, input_data, output_data, output_grad_data, input_grad_data,
+        input_channels, input_depth, input_height, input_width, output_depth,
+        output_height, output_width, ksize_depth, ksize_height, ksize_width,
+        stride_depth, stride_height, stride_width, padding_depth,
+        padding_height, padding_width, pool_process);
+  }
+};
+
+/*
+ * GPU launcher for KernelMaxPool3DGrad.
+ *
+ * All tensors are read as NCDHW. ksize, strides and paddings each carry
+ * three values ordered depth, height, width. One kernel work item is
+ * scheduled per output element, in blocks of 1024 threads, on the stream of
+ * `context`. The channel count handed to the kernel is the input's.
+ *
+ * The kernel only adds into input_grad.
+ * NOTE(review): presumably the caller zero-fills input_grad -- confirm.
+ * NOTE(review): `context` is reinterpret_cast to CUDADeviceContext, so it
+ * must really be one -- confirm at the call sites.
+ */
+template <class T>
+class MaxPool3dGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    const int num_batches = input.dims()[0];
+    const int in_channels = input.dims()[1];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_channels = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+
+    const T* in_ptr = input.data<T>();
+    const T* out_ptr = output.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+
+    // One work item per output element, rounded up to whole blocks.
+    const int kBlockSize = 1024;
+    const int work_items = num_batches * out_channels * out_d * out_h * out_w;
+    const int num_blocks = (work_items + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+
+    const platform::CUDADeviceContext& cuda_ctx =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context);
+    KernelMaxPool3DGrad<T><<<grid_dim, block_dim, 0, cuda_ctx.stream()>>>(
+        work_items, in_ptr, out_ptr, out_grad_ptr, in_grad_ptr, in_channels,
+        in_d, in_h, in_w, out_d, out_h, out_w, ksize[0], ksize[1], ksize[2],
+        strides[0], strides[1], strides[2], paddings[0], paddings[1],
+        paddings[2]);
+  }
+};
+
+// Explicit instantiations of the GPU 3-D pooling functors, grouped by element
+// type. The pooling policies are named unqualified because this code is
+// already inside paddle::operators::math.
+
+// float
+template class Pool3dFunctor<platform::GPUPlace, MaxPool<float>, float>;
+template class Pool3dFunctor<platform::GPUPlace, AvgPool<float>, float>;
+template class Pool3dGradFunctor<platform::GPUPlace, MaxPoolGrad<float>,
+                                 float>;
+template class Pool3dGradFunctor<platform::GPUPlace, AvgPoolGrad<float>,
+                                 float>;
+template class MaxPool3dGradFunctor<platform::GPUPlace, float>;
+
+// double
+template class Pool3dFunctor<platform::GPUPlace, MaxPool<double>, double>;
+template class Pool3dFunctor<platform::GPUPlace, AvgPool<double>, double>;
+template class Pool3dGradFunctor<platform::GPUPlace, MaxPoolGrad<double>,
+                                 double>;
+template class Pool3dGradFunctor<platform::GPUPlace, AvgPoolGrad<double>,
+                                 double>;
+template class MaxPool3dGradFunctor<platform::GPUPlace, double>;
+
+// Max-pooling forward kernel for NCHW data that also records where each
+// maximum was found.
+//
+// Each loop iteration handles one output element `index`
+// (0 <= index < nthreads), decoded as (batch_idx, c, ph, pw). The pooling
+// window is clipped to the input plane and scanned row by row; an element
+// replaces the running maximum only if it is strictly greater, so the first
+// of several equal maxima wins.
+//
+//   output_data[index] - maximum over the clipped window (stays at -FLT_MAX
+//                        if no element in the window exceeds -FLT_MAX).
+//   mask_data[index]   - offset h * input_width + w of that maximum inside
+//                        its (batch, channel) plane, or -1 if none was found.
+//                        The offset is stored converted to T.
+template <typename T>
+__global__ void KernelMaxPool2dWithIdx(
+    const int nthreads, const T* input_data, T* output_data, T* mask_data,
+    const int channels, const int input_height, const int input_width,
+    const int output_height, const int output_width, const int ksize_height,
+    const int ksize_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int c = (index / output_width / output_height) % channels;
+    int batch_idx = index / output_width / output_height / channels;
+
+    // Window bounds; the end is clipped before the start so that the padded
+    // (negative) start still contributes to the window extent.
+    int hstart = ph * stride_height - padding_height;
+    int hend = min(hstart + ksize_height, input_height);
+    hstart = max(hstart, 0);
+
+    int wstart = pw * stride_width - padding_width;
+    int wend = min(wstart + ksize_width, input_width);
+    wstart = max(wstart, 0);
+
+    // Start of the (batch_idx, c) input plane. This is deliberately a
+    // loop-local pointer: advancing `input_data` itself would carry the
+    // offset into the next iteration whenever a thread handles more than one
+    // index.
+    const T* input_plane =
+        input_data + (batch_idx * channels + c) * input_height * input_width;
+    T ele = -FLT_MAX;
+    int max_index = -1;
+    for (int h = hstart; h < hend; ++h) {
+      for (int w = wstart; w < wend; ++w) {
+        int input_index = h * input_width + w;
+        if (ele < input_plane[input_index]) {
+          max_index = input_index;
+          ele = input_plane[input_index];
+        }
+      }
+    }
+    output_data[index] = ele;
+    mask_data[index] = max_index;
+  }
+}
+
+// Backward kernel for max pooling with recorded indices (NCHW).
+//
+// Each loop iteration handles one input-gradient element `index`, decoded as
+// (batch_idx, c_offset, h_offset, w_offset). It visits every output position
+// whose pooling window can contain that input position and sums the output
+// gradients whose mask entry equals this position's in-plane offset
+// (h_offset * input_width + w_offset). The sum is written (not accumulated)
+// to input_grad[index].
+template <typename T>
+__global__ void KernelMaxPool2DWithIdxGrad(
+    const int nthreads, T* input_grad, const T* output_grad, const T* mask_data,
+    const int channels, const int input_height, const int input_width,
+    const int output_height, const int output_width, const int ksize_height,
+    const int ksize_width, const int stride_height, const int stride_width,
+    const int padding_height, const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int w_offset = index % input_width;
+    int h_offset = (index / input_width) % input_height;
+    int c_offset = (index / input_width / input_height) % channels;
+    int batch_idx = index / input_width / input_height / channels;
+
+    // [ph_start, ph_end) x [pw_start, pw_end): output positions whose window
+    // covers (h_offset, w_offset), clipped to the output extent.
+    int ph_start =
+        (h_offset + padding_height < ksize_height)
+            ? 0
+            : (h_offset + padding_height - ksize_height) / stride_height + 1;
+    int pw_start =
+        (w_offset + padding_width < ksize_width)
+            ? 0
+            : (w_offset + padding_width - ksize_width) / stride_width + 1;
+    int ph_end =
+        min((h_offset + padding_height) / stride_height + 1, output_height);
+    int pw_end =
+        min((w_offset + padding_width) / stride_width + 1, output_width);
+
+    T gradient = 0;
+    int input_current_featuremap_idx = h_offset * input_width + w_offset;
+    int output_idx =
+        (batch_idx * channels + c_offset) * output_height * output_width;
+
+    // Loop-local views of the (batch_idx, c_offset) output plane. The kernel
+    // parameters are left untouched so that a thread processing several
+    // indices does not accumulate the offsets across iterations.
+    const T* mask_plane = mask_data + output_idx;
+    const T* output_grad_plane = output_grad + output_idx;
+    for (int ph = ph_start; ph < ph_end; ++ph) {
+      for (int pw = pw_start; pw < pw_end; ++pw) {
+        if (mask_plane[ph * output_width + pw] == input_current_featuremap_idx)
+          gradient += output_grad_plane[ph * output_width + pw];
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+template <typename T>
+class MaxPool2dWithIndexFunctor<platform::GPUPlace, T> {
+ public:
+  // Launches KernelMaxPool2dWithIdx on the CUDA stream of `context`, with one
+  // thread per element of `output`.
+  //
+  // `output` and `mask` are both obtained through mutable_data<T>() on the
+  // context's place, so the mask holds elements of type T. `ksize`, `strides`
+  // and `paddings` are indexed as {height, width}.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Tensor extents, narrowed to int because that is what the kernel takes.
+    const int num_batches = input.dims()[0];
+    const int num_channels = input.dims()[1];
+    const int in_h = input.dims()[2];
+    const int in_w = input.dims()[3];
+    const int out_channels = output.dims()[1];
+    const int out_h = output.dims()[2];
+    const int out_w = output.dims()[3];
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+    T* mask_ptr = mask.mutable_data<T>(context.GetPlace());
+
+    // 1-D launch: fixed-size blocks, rounded up so every output element is
+    // covered.
+    const int kBlockSize = 1024;
+    const int total = num_batches * out_channels * out_h * out_w;
+    const int num_blocks = (total + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+
+    KernelMaxPool2dWithIdx<T><<<grid_dim, block_dim, 0, stream>>>(
+        total, in_ptr, out_ptr, mask_ptr, num_channels, in_h, in_w, out_h,
+        out_w, ksize[0], ksize[1], strides[0], strides[1], paddings[0],
+        paddings[1]);
+  }
+};
+
+/*
+ * All tensors are in NCHW format.
+ * Ksize, strides, paddings are two elements. These two elements represent
+ * height and width, respectively.
+ */
+template <typename T>
+class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, T> {
+ public:
+  // Launches KernelMaxPool2DWithIdxGrad on the CUDA stream of `context`, with
+  // one thread per element of `input_grad`.
+  //
+  // `output_grad` and `mask` are read as T buffers; `input_grad` is obtained
+  // through mutable_data<T>() on the context's place. `ksize`, `strides` and
+  // `paddings` are indexed as {height, width}.
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Tensor extents, narrowed to int because that is what the kernel takes.
+    const int num_batches = input_grad.dims()[0];
+    const int num_channels = input_grad.dims()[1];
+    const int in_h = input_grad.dims()[2];
+    const int in_w = input_grad.dims()[3];
+    const int out_h = output_grad.dims()[2];
+    const int out_w = output_grad.dims()[3];
+
+    const T* mask_ptr = mask.data<T>();
+    const T* out_grad_ptr = output_grad.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+
+    // 1-D launch: fixed-size blocks, rounded up so every input-gradient
+    // element is covered.
+    const int kBlockSize = 1024;
+    const int total = num_batches * num_channels * in_h * in_w;
+    const int num_blocks = (total + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+
+    KernelMaxPool2DWithIdxGrad<T><<<grid_dim, block_dim, 0, stream>>>(
+        total, in_grad_ptr, out_grad_ptr, mask_ptr, num_channels, in_h, in_w,
+        out_h, out_w, ksize[0], ksize[1], strides[0], strides[1], paddings[0],
+        paddings[1]);
+  }
+};
+
+// Explicit instantiations of the GPU 2-D max-pool-with-index functors.
+// Forward:
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, float>;
+template class MaxPool2dWithIndexFunctor<platform::GPUPlace, double>;
+// Backward:
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, float>;
+template class MaxPool2dWithIndexGradFunctor<platform::GPUPlace, double>;
+
+// Max-pooling forward kernel for NCDHW data that also records where each
+// maximum was found.
+//
+// Each loop iteration handles one output element `index`
+// (0 <= index < nthreads), decoded as (batch_idx, c, pd, ph, pw). The pooling
+// window is clipped to the input volume and scanned depth-major; an element
+// replaces the running maximum only if it is strictly greater, so the first
+// of several equal maxima wins.
+//
+//   output_data[index] - maximum over the clipped window (stays at -FLT_MAX
+//                        if no element in the window exceeds -FLT_MAX).
+//   mask_data[index]   - offset (d * input_height + h) * input_width + w of
+//                        that maximum inside its (batch, channel) volume, or
+//                        -1 if none was found. Stored converted to T.
+template <typename T>
+__global__ void KernelMaxPool3DWithIdx(
+    const int nthreads, const T* input_data, T* output_data, T* mask_data,
+    const int channels, const int input_depth, const int input_height,
+    const int input_width, const int output_depth, const int output_height,
+    const int output_width, const int ksize_depth, const int ksize_height,
+    const int ksize_width, const int stride_depth, const int stride_height,
+    const int stride_width, const int padding_depth, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int pw = index % output_width;
+    int ph = (index / output_width) % output_height;
+    int pd = (index / output_width / output_height) % output_depth;
+    int c = (index / output_width / output_height / output_depth) % channels;
+    int batch_idx =
+        index / output_width / output_height / output_depth / channels;
+
+    // Window bounds; ends are clipped before the (possibly negative) starts.
+    int dstart = pd * stride_depth - padding_depth;
+    int hstart = ph * stride_height - padding_height;
+    int wstart = pw * stride_width - padding_width;
+    int dend = min(dstart + ksize_depth, input_depth);
+    int hend = min(hstart + ksize_height, input_height);
+    int wend = min(wstart + ksize_width, input_width);
+    dstart = max(dstart, 0);
+    hstart = max(hstart, 0);
+    wstart = max(wstart, 0);
+
+    T ele = -FLT_MAX;
+    int max_index = -1;
+    // Start of the (batch_idx, c) input volume. This is deliberately a
+    // loop-local pointer: advancing `input_data` itself would carry the
+    // offset into the next iteration whenever a thread handles more than one
+    // index.
+    const T* input_volume =
+        input_data +
+        (batch_idx * channels + c) * input_depth * input_height * input_width;
+
+    for (int d = dstart; d < dend; ++d) {
+      for (int h = hstart; h < hend; ++h) {
+        for (int w = wstart; w < wend; ++w) {
+          int input_index = (d * input_height + h) * input_width + w;
+          if (ele < input_volume[input_index]) {
+            max_index = input_index;
+            ele = input_volume[input_index];
+          }
+        }
+      }
+    }
+    output_data[index] = ele;
+    mask_data[index] = max_index;
+  }
+}
+
+// Backward kernel for max pooling with recorded indices (NCDHW).
+//
+// Each loop iteration handles one input-gradient element `index`, decoded as
+// (batch_idx, c_offset, d_offset, h_offset, w_offset). It visits every output
+// position whose pooling window can contain that input position and sums the
+// output gradients whose mask entry equals this position's in-volume offset
+// ((d_offset * input_height + h_offset) * input_width + w_offset). The sum is
+// written (not accumulated) to input_grad[index].
+template <typename T>
+__global__ void KernelMaxPool3DWithIdxGrad(
+    const int nthreads, T* input_grad, const T* output_grad, const T* mask,
+    const int channels, const int input_depth, const int input_height,
+    const int input_width, const int output_depth, const int output_height,
+    const int output_width, const int ksize_depth, const int ksize_height,
+    const int ksize_width, const int stride_depth, const int stride_height,
+    const int stride_width, const int padding_depth, const int padding_height,
+    const int padding_width) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads;
+       index += blockDim.x * gridDim.x) {
+    int w_offset = index % input_width;
+    int h_offset = (index / input_width) % input_height;
+    int d_offset = (index / input_width / input_height) % input_depth;
+    int c_offset =
+        (index / input_width / input_height / input_depth) % channels;
+    int batch_idx = index / input_width / input_height / input_depth / channels;
+
+    // [p*_start, p*_end): output positions along each axis whose window
+    // covers this input position, clipped to the output extent.
+    int pd_start =
+        (d_offset + padding_depth < ksize_depth)
+            ? 0
+            : (d_offset + padding_depth - ksize_depth) / stride_depth + 1;
+    int ph_start =
+        (h_offset + padding_height < ksize_height)
+            ? 0
+            : (h_offset + padding_height - ksize_height) / stride_height + 1;
+    int pw_start =
+        (w_offset + padding_width < ksize_width)
+            ? 0
+            : (w_offset + padding_width - ksize_width) / stride_width + 1;
+    int pd_end =
+        min((d_offset + padding_depth) / stride_depth + 1, output_depth);
+    int ph_end =
+        min((h_offset + padding_height) / stride_height + 1, output_height);
+    int pw_end =
+        min((w_offset + padding_width) / stride_width + 1, output_width);
+
+    T gradient = 0;
+    int input_current_feature_map_idx =
+        (d_offset * input_height + h_offset) * input_width + w_offset;
+    int output_idx = (batch_idx * channels + c_offset) * output_depth *
+                     output_height * output_width;
+    // Loop-local views of the (batch_idx, c_offset) output volume. The kernel
+    // parameters are left untouched so that a thread processing several
+    // indices does not accumulate the offsets across iterations.
+    const T* mask_volume = mask + output_idx;
+    const T* output_grad_volume = output_grad + output_idx;
+
+    for (int pd = pd_start; pd < pd_end; ++pd) {
+      for (int ph = ph_start; ph < ph_end; ++ph) {
+        for (int pw = pw_start; pw < pw_end; ++pw) {
+          int output_offset = (pd * output_height + ph) * output_width + pw;
+          if (mask_volume[output_offset] == input_current_feature_map_idx)
+            gradient += output_grad_volume[output_offset];
+        }
+      }
+    }
+    input_grad[index] = gradient;
+  }
+}
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+template <typename T>
+class MaxPool3dWithIndexFunctor<platform::GPUPlace, T> {
+ public:
+  // Launches KernelMaxPool3DWithIdx on the CUDA stream of `context`, with one
+  // thread per element of `output`.
+  //
+  // `output` and `mask` are both obtained through mutable_data<T>() on the
+  // context's place, so the mask holds elements of type T. `ksize`, `strides`
+  // and `paddings` are indexed as {depth, height, width}.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Tensor extents, narrowed to int because that is what the kernel takes.
+    const int num_batches = input.dims()[0];
+    const int num_channels = input.dims()[1];
+    const int in_d = input.dims()[2];
+    const int in_h = input.dims()[3];
+    const int in_w = input.dims()[4];
+    const int out_channels = output.dims()[1];
+    const int out_d = output.dims()[2];
+    const int out_h = output.dims()[3];
+    const int out_w = output.dims()[4];
+
+    const T* in_ptr = input.data<T>();
+    T* out_ptr = output.mutable_data<T>(context.GetPlace());
+    T* mask_ptr = mask.mutable_data<T>(context.GetPlace());
+
+    // 1-D launch: fixed-size blocks, rounded up so every output element is
+    // covered.
+    const int kBlockSize = 1024;
+    const int total = num_batches * out_channels * out_d * out_h * out_w;
+    const int num_blocks = (total + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+
+    KernelMaxPool3DWithIdx<T><<<grid_dim, block_dim, 0, stream>>>(
+        total, in_ptr, out_ptr, mask_ptr, num_channels, in_d, in_h, in_w,
+        out_d, out_h, out_w, ksize[0], ksize[1], ksize[2], strides[0],
+        strides[1], strides[2], paddings[0], paddings[1], paddings[2]);
+  }
+};
+
+/*
+ * All tensors are in NCDHW format.
+ * Ksize, strides, paddings are three elements. These three elements represent
+ * depth, height and width, respectively.
+ */
+template <typename T>
+class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, T> {
+ public:
+  // Launches KernelMaxPool3DWithIdxGrad on the CUDA stream of `context`, with
+  // one thread per element of `input_grad`.
+  //
+  // `output_grad` and `mask` are read as T buffers; `input_grad` is obtained
+  // through mutable_data<T>() on the context's place. `ksize`, `strides` and
+  // `paddings` are indexed as {depth, height, width}.
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings) {
+    // Tensor extents, narrowed to int because that is what the kernel takes.
+    const int num_batches = input_grad.dims()[0];
+    const int num_channels = input_grad.dims()[1];
+    const int in_d = input_grad.dims()[2];
+    const int in_h = input_grad.dims()[3];
+    const int in_w = input_grad.dims()[4];
+    const int out_d = output_grad.dims()[2];
+    const int out_h = output_grad.dims()[3];
+    const int out_w = output_grad.dims()[4];
+
+    const T* out_grad_ptr = output_grad.data<T>();
+    const T* mask_ptr = mask.data<T>();
+    T* in_grad_ptr = input_grad.mutable_data<T>(context.GetPlace());
+
+    // 1-D launch: fixed-size blocks, rounded up so every input-gradient
+    // element is covered.
+    const int kBlockSize = 1024;
+    const int total = num_batches * num_channels * in_d * in_h * in_w;
+    const int num_blocks = (total + kBlockSize - 1) / kBlockSize;
+    dim3 block_dim(kBlockSize, 1);
+    dim3 grid_dim(num_blocks, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+
+    KernelMaxPool3DWithIdxGrad<T><<<grid_dim, block_dim, 0, stream>>>(
+        total, in_grad_ptr, out_grad_ptr, mask_ptr, num_channels, in_d, in_h,
+        in_w, out_d, out_h, out_w, ksize[0], ksize[1], ksize[2], strides[0],
+        strides[1], strides[2], paddings[0], paddings[1], paddings[2]);
+  }
+};
+
+// Explicit instantiations of the GPU 3-D max-pool-with-index functors.
+// Forward:
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, float>;
+template class MaxPool3dWithIndexFunctor<platform::GPUPlace, double>;
+// Backward:
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, float>;
+template class MaxPool3dWithIndexGradFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/pooling.h b/paddle/operators/math/pooling.h
new file mode 100644
index 0000000000000000000000000000000000000000..c50c57b5c52cdc5c12425cb119b80502aef5451e
--- /dev/null
+++ b/paddle/operators/math/pooling.h
@@ -0,0 +1,194 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/hostdevice.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Largest finite float value, taken from the compiler-predefined
+// __FLT_MAX__. The definition is guarded so that a FLT_MAX already supplied
+// by <cfloat>/<float.h> is left alone instead of being redefined.
+// TODO: it might need to be placed in another file; where to put it is still
+// undecided.
+#ifndef FLT_MAX
+#define FLT_MAX __FLT_MAX__
+#endif
+
+/*
+ * \brief Extracting simple operations from pooling.
+ *        Both MaxPool and AvgPool need "initial", "compute" and "finalize"
+ * operations.
+ *        MaxPool initializes the temp variable to the negative maximum to find
+ * the maximum value in the pooling field.
+ *        AvgPool initializes the temp variable to zero to accumulate all values
+ * in the pooling field, and finally takes the average.
+ *        MaxPoolGrad and AvgPoolGrad are the corresponding gradient operations.
+ */
+template <class T>
+class MaxPool {
+ public:
+  // Starting value of the running maximum: -FLT_MAX converted to T.
+  DEVICE inline T initial() {
+    const T start = static_cast<T>(-FLT_MAX);
+    return start;
+  }
+  // Folds x into the running maximum y: y is replaced by x unless y > x.
+  DEVICE inline void compute(T& y, const T& x) {
+    if (!(y > x)) {
+      y = x;
+    }
+  }
+  // Max pooling needs no post-processing; both arguments are ignored.
+  DEVICE inline void finalize(T& y, const T& pool_field) {}
+};
+
+template <class T>
+class AvgPool {
+ public:
+  // Starting value of the running sum: zero.
+  DEVICE inline T initial() {
+    const T zero = static_cast<T>(0);
+    return zero;
+  }
+  // Adds x to the running sum y.
+  DEVICE inline void compute(T& y, const T& x) { y = y + x; }
+  // Turns the sum y into an average by dividing it by pool_field.
+  DEVICE inline void finalize(T& y, const T& pool_field) {
+    y = y / pool_field;
+  }
+};
+
+template <class T>
+class MaxPoolGrad {
+ public:
+  // Adds dy to dx when x equals y, and dy * 0 otherwise. `scale` is accepted
+  // for signature parity with AvgPoolGrad and is not used.
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    const bool is_max = (x == y);
+    dx = dx + dy * is_max;
+  }
+};
+
+template <class T>
+class AvgPoolGrad {
+ public:
+  // Adds scale * dy to dx. `x` and `y` are accepted for signature parity with
+  // MaxPoolGrad and are not used.
+  DEVICE inline void compute(const T& x, const T& y, const T& dy, T& dx,
+                             T scale) {
+    dx = dx + (scale * dy);
+  }
+};
+
+/*
+ * \brief Getting pooling results, and calculating gradient.
+ *
+ * In pool2d, all tensors are in NCHW format, where N is the batch size, C is
+ * the number of channels, and H and W are the height and width of the
+ * feature.
+ * In pool3d, all tensors are in NCDHW format, where N is the batch size, C is
+ * the number of channels, and D, H and W are the depth, height and width of
+ * the feature.
+ *
+ * In max pooling, it is possible that the pooling region has multiple maximum
+ * elements. In this case, we should compute the gradient of the first maximum
+ * element.
+ * This is different from average pooling. So we rewrite the max_pool_grad:
+ * MaxPool2dGradFunctor, MaxPool3dGradFunctor.
+ */
+// 2-D pooling functor, parameterized on the device (Place), the pooling
+// policy (PoolProcess; see MaxPool / AvgPool above) and the element type T.
+// Only the call operator is declared here; this header provides no
+// definition. `input` is const, `output` is mutable, and `pool_compute` is the
+// policy object, taken by value.
+// NOTE(review): ksize/strides/paddings are presumably {height, width} -
+// confirm against the per-Place specializations.
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+
+// Gradient counterpart of Pool2dFunctor, parameterized on the device (Place),
+// the gradient policy (PoolProcess; see MaxPoolGrad / AvgPoolGrad above) and
+// the element type T. Only the call operator is declared here; this header
+// provides no definition. `input`, `output` and `output_grad` are const;
+// `input_grad` is the only mutable tensor.
+template <typename Place, typename PoolProcess, typename T>
+class Pool2dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);
+};
+
+// Dedicated 2-D max-pooling gradient functor; unlike Pool2dGradFunctor it
+// takes no pooling-policy argument. Only the call operator is declared here;
+// this header provides no definition. `input_grad` is the only mutable
+// tensor.
+template <typename Place, class T>
+class MaxPool2dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+// 3-D pooling functor, parameterized on the device (Place), the pooling
+// policy (PoolProcess; see MaxPool / AvgPool above) and the element type T.
+// Only the call operator is declared here; this header provides no
+// definition. `input` is const, `output` is mutable, and `pool_compute` is the
+// policy object, taken by value.
+// NOTE(review): ksize/strides/paddings are presumably {depth, height, width}
+// - confirm against the per-Place specializations.
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  std::vector<int>& ksize, std::vector<int>& strides,
+                  std::vector<int>& paddings, PoolProcess pool_compute);
+};
+
+// Gradient counterpart of Pool3dFunctor, parameterized on the device (Place),
+// the gradient policy (PoolProcess; see MaxPoolGrad / AvgPoolGrad above) and
+// the element type T. Only the call operator is declared here; this header
+// provides no definition. `input`, `output` and `output_grad` are const;
+// `input_grad` is the only mutable tensor.
+template <typename Place, typename PoolProcess, typename T>
+class Pool3dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings,
+                  PoolProcess pool_compute);
+};
+
+// Dedicated 3-D max-pooling gradient functor; unlike Pool3dGradFunctor it
+// takes no pooling-policy argument. Only the call operator is declared here;
+// this header provides no definition. `input_grad` is the only mutable
+// tensor.
+template <typename Place, class T>
+class MaxPool3dGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& input_grad,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+/*
+ * \brief Getting max pooling results and corresponding max index, and
+ * calculating gradient.
+ * In up-sampling-pooling, it is necessary to know max element index.
+ * In pool2d, all tensors are in NCHW format. In pool3d, all tensors are in
+ * NCDHW format.
+ */
+// 2-D max pooling that produces a second tensor, `mask`, next to `output`
+// (both are mutable; `input` is const). Only the call operator is declared
+// here; this header provides no definition.
+template <typename Place, typename T>
+class MaxPool2dWithIndexFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+// Gradient counterpart of MaxPool2dWithIndexFunctor. It takes `output_grad`
+// and `mask` as const inputs and `input_grad` as the only mutable tensor; the
+// forward input and output tensors are not parameters. Only the call operator
+// is declared here; this header provides no definition.
+template <typename Place, typename T>
+class MaxPool2dWithIndexGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+// 3-D max pooling that produces a second tensor, `mask`, next to `output`
+// (both are mutable; `input` is const). Only the call operator is declared
+// here; this header provides no definition.
+template <typename Place, typename T>
+class MaxPool3dWithIndexFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input, framework::Tensor& output,
+                  framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+// Gradient counterpart of MaxPool3dWithIndexFunctor. It takes `output_grad`
+// and `mask` as const inputs and `input_grad` as the only mutable tensor; the
+// forward input and output tensors are not parameters. Only the call operator
+// is declared here; this header provides no definition.
+template <typename Place, typename T>
+class MaxPool3dWithIndexGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& input_grad,
+                  const framework::Tensor& output_grad,
+                  const framework::Tensor& mask, std::vector<int>& ksize,
+                  std::vector<int>& strides, std::vector<int>& paddings);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.cc b/paddle/operators/math/selected_rows_functor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f2305ea16913e927dca17e5a80201368f03ca253
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor.cc
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// CPU implementation of SelectedRowsAdd: the result is the concatenation of
+// the two inputs.
+//
+// The output's row list is input1's rows followed by input2's rows
+// (duplicates are kept, nothing is merged) and its value buffer is input1's
+// values followed by input2's values. Both inputs must have the same height,
+// a non-empty row list and the same number of elements per row; all three
+// tensors must live on the CPU. The output value buffer is read through
+// data<T>(), so this functor does not allocate it - presumably the caller
+// sizes it beforehand (its per-row element count is checked below).
+template <typename T>
+struct SelectedRowsAdd<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output) {
+    auto in1_height = input1.height();
+    PADDLE_ENFORCE_EQ(in1_height, input2.height());
+    output->set_height(in1_height);
+
+    auto& in1_rows = input1.rows();
+    auto& in2_rows = input2.rows();
+    // The per-row element counts below divide by the row counts; reject empty
+    // row lists explicitly instead of dividing by zero.
+    PADDLE_ENFORCE(in1_rows.size() > 0);
+    PADDLE_ENFORCE(in2_rows.size() > 0);
+
+    std::vector<int64_t> out_rows;
+    out_rows.reserve(in1_rows.size() + in2_rows.size());
+
+    // concat rows
+    out_rows.insert(out_rows.end(), in1_rows.begin(), in1_rows.end());
+    out_rows.insert(out_rows.end(), in2_rows.begin(), in2_rows.end());
+    output->set_rows(out_rows);
+
+    auto* out_value = output->mutable_value();
+    auto& in1_value = input1.value();
+    auto& in2_value = input2.value();
+
+    // All three value tensors must agree on the number of elements per row.
+    auto in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, in2_value.numel() / in2_rows.size());
+    PADDLE_ENFORCE_EQ(in1_row_numel, out_value->numel() / out_rows.size());
+
+    auto in1_place = input1.place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in1_place));
+    auto in2_place = input2.place();
+    PADDLE_ENFORCE(platform::is_cpu_place(in2_place));
+    auto out_place = context.GetPlace();
+    PADDLE_ENFORCE(platform::is_cpu_place(out_place));
+
+    // Copy input1's values to the front of the output buffer...
+    auto* out_data = out_value->data<T>();
+    auto* in1_data = in1_value.data<T>();
+    memory::Copy(boost::get<platform::CPUPlace>(out_place), out_data,
+                 boost::get<platform::CPUPlace>(in1_place), in1_data,
+                 in1_value.numel() * sizeof(T));
+
+    // ...and input2's values directly behind them.
+    auto* in2_data = in2_value.data<T>();
+    memory::Copy(boost::get<platform::CPUPlace>(out_place),
+                 out_data + in1_value.numel(),
+                 boost::get<platform::CPUPlace>(in2_place), in2_data,
+                 in2_value.numel() * sizeof(T));
+  }
+};
+
+template struct SelectedRowsAdd<platform::CPUPlace, float>;
+
+// CPU implementation of SelectedRowsAddTensor: output = input1 + input2,
+// where input1 is sparse (SelectedRows) and input2 / output are dense.
+//
+// The output is first filled with zeros, then every stored row i of input1 is
+// added into output row in1_rows[i] (repeated row ids accumulate), and
+// finally input2 is added element-wise. input1's height must equal the first
+// dimension of both input2 and output and must be positive; input1 must hold
+// at least one row; all three must agree on the number of elements per row.
+// The output buffer is read through data<T>(), so this functor does not
+// allocate it.
+template <typename T>
+struct SelectedRowsAddTensor<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output) {
+    auto in1_height = input1.height();
+    auto in2_dims = input2.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    // The per-row element counts below divide by the row count and by the
+    // height; reject zero explicitly instead of dividing by zero.
+    PADDLE_ENFORCE(in1_rows.size() > 0);
+    PADDLE_ENFORCE(in1_height > 0);
+
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
+    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+
+    SetConstant<platform::CPUPlace, T> functor;
+    functor(context, output, 0.0);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* out_data = output->data<T>();
+
+    // Scatter-add the sparse rows into the zeroed output.
+    for (size_t i = 0; i < in1_rows.size(); i++) {
+      auto* out_row = out_data + in1_rows[i] * in1_row_numel;
+      auto* in1_row = in1_data + i * in1_row_numel;
+      for (int64_t j = 0; j < in1_row_numel; j++) {
+        out_row[j] += in1_row[j];
+      }
+    }
+
+    // Add the dense operand on top.
+    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
+    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
+    out_eigen.device(*context.GetEigenDevice<platform::CPUPlace>()) =
+        out_eigen + in2_eigen;
+  }
+};
+
+template struct SelectedRowsAddTensor<platform::CPUPlace, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ea149ebbc12beeab43a2047372352ba769959307
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor.cu
@@ -0,0 +1,142 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+// GPU specialization of SelectedRowsAdd: the result holds the row indices of
+// input1 followed by those of input2, and the value buffers concatenated in
+// the same order. No per-row reduction is performed here.
+template <typename T>
+struct SelectedRowsAdd<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output) {
+    auto height = input1.height();
+    PADDLE_ENFORCE_EQ(height, input2.height());
+    output->set_height(height);
+
+    auto& rows_a = input1.rows();
+    auto& rows_b = input2.rows();
+
+    // Row indices of the result: input1's rows, then input2's rows.
+    std::vector<int64_t> merged_rows(rows_a.begin(), rows_a.end());
+    merged_rows.insert(merged_rows.end(), rows_b.begin(), rows_b.end());
+    output->set_rows(merged_rows);
+
+    auto* merged_value = output->mutable_value();
+    auto& value_a = input1.value();
+    auto& value_b = input2.value();
+
+    // Every operand must use the same number of elements per row. The output
+    // value tensor is only read through numel()/data<T>() here, so it is
+    // presumably expected to be allocated by the caller.
+    auto row_width = value_a.numel() / rows_a.size();
+    PADDLE_ENFORCE_EQ(row_width, value_b.numel() / rows_b.size());
+    PADDLE_ENFORCE_EQ(row_width, merged_value->numel() / merged_rows.size());
+
+    auto* dst = merged_value->data<T>();
+    auto* src_a = value_a.data<T>();
+
+    // Both inputs and the context must live on a GPU place.
+    auto place_a = input1.place();
+    PADDLE_ENFORCE(platform::is_gpu_place(place_a));
+    auto place_b = input2.place();
+    PADDLE_ENFORCE(platform::is_gpu_place(place_b));
+    auto dst_place = context.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(dst_place));
+
+    // Both copies are issued on the stream of the given device context.
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+
+    // First part of the output buffer: the values of input1.
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst,
+                 boost::get<platform::GPUPlace>(place_a), src_a,
+                 value_a.numel() * sizeof(T), stream);
+
+    // Second part, directly behind the first: the values of input2.
+    auto* src_b = value_b.data<T>();
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place),
+                 dst + value_a.numel(),
+                 boost::get<platform::GPUPlace>(place_b), src_b,
+                 value_b.numel() * sizeof(T), stream);
+  }
+};
+
+template struct SelectedRowsAdd<platform::GPUPlace, float>;
+
+namespace {
+// Scatter-adds the rows of a SelectedRows value buffer into a dense tensor.
+//
+// Launch layout: one block per selected row, indexed by blockIdx.x. The
+// threads of a block walk over the row_numel elements of that row in steps of
+// block_size, which is expected to equal the launched blockDim.x.
+//
+//   selected_rows: value buffer, row_numel elements per selected row.
+//   rows:          destination row in tensor_out for each selected row; it is
+//                  dereferenced inside the kernel.
+//   tensor_out:    dense output, accumulated into in place.
+template <typename T>
+__global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
+                                            const int64_t* rows, T* tensor_out,
+                                            int64_t row_numel, int block_size) {
+  // The row comes from the x dimension of the grid. The y dimension is capped
+  // at 65535 blocks, which a large SelectedRows can exceed.
+  const int row = blockIdx.x;
+  int tid = threadIdx.x;
+
+  selected_rows += row * row_numel;
+  tensor_out += rows[row] * row_numel;
+
+  for (int index = tid; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we can not use
+    // tensor_out[index] += selected_rows[index]; Instead, we have to use
+    // AtomicAdd to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
+  }
+}
+}  // namespace
+
+// GPU specialization: adds a sparse SelectedRows (input1) to a dense tensor
+// (input2) and writes the dense sum into `output`.
+template <typename T>
+struct SelectedRowsAddTensor<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output) {
+    // All three operands must describe the same number of rows.
+    auto in1_height = input1.height();
+    auto in2_dims = input2.dims();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in1_height, in2_dims[0]);
+    PADDLE_ENFORCE_EQ(in1_height, out_dims[0]);
+
+    auto& in1_value = input1.value();
+    auto& in1_rows = input1.rows();
+
+    // Elements per row; has to agree between the sparse and dense operands.
+    int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
+    PADDLE_ENFORCE_EQ(in1_row_numel, input2.numel() / in1_height);
+    PADDLE_ENFORCE_EQ(in1_row_numel, output->numel() / in1_height);
+
+    auto* in1_data = in1_value.data<T>();
+    auto* out_data = output->data<T>();
+
+    // Clear the output so the kernel can accumulate the sparse rows into it.
+    SetConstant<platform::GPUPlace, T> functor;
+    functor(context, output, 0.0);
+
+    // One block per selected row, laid out along grid.x (see the kernel).
+    int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid(in1_rows.size(), 1);
+    SelectedRowsAddTensorKernel<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(in1_data, in1_rows.data(), out_data,
+                              in1_row_numel, block_size);
+
+    // Finally add the dense operand element-wise.
+    auto out_eigen = framework::EigenVector<T>::Flatten(*output);
+    auto in2_eigen = framework::EigenVector<T>::Flatten(input2);
+    out_eigen.device(*context.GetEigenDevice<platform::GPUPlace>()) =
+        out_eigen + in2_eigen;
+  }
+};
+
+template struct SelectedRowsAddTensor<platform::GPUPlace, float>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor.h b/paddle/operators/math/selected_rows_functor.h
new file mode 100644
index 0000000000000000000000000000000000000000..53ab240ca600cd4a817afa2c19fb8d9427c6f3da
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/selected_rows.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// SelectedRows + SelectedRows will simply concat value and rows.
+// The real computation happens in dealing with LoDTensor.
+//
+// Only the call operator is declared in this primary template; the bodies
+// are supplied by per-Place specializations.
+//   context: device context the operation runs on.
+//   input1, input2: the two operands.
+//   output: receives the concatenated rows and values.
+template <typename Place, typename T>
+struct SelectedRowsAdd {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::SelectedRows& input2,
+                  framework::SelectedRows* output);
+};
+
+// SelectedRows + Tensor: adds the sparse operand input1 to the dense operand
+// input2 and stores the dense result in output.
+//
+// Only the call operator is declared in this primary template; the bodies
+// are supplied by per-Place specializations.
+//   context: device context the operation runs on.
+//   input1: sparse operand.
+//   input2: dense operand.
+//   output: dense result tensor.
+template <typename Place, typename T>
+struct SelectedRowsAddTensor {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input1,
+                  const framework::Tensor& input2, framework::Tensor* output);
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/selected_rows_functor_test.cc b/paddle/operators/math/selected_rows_functor_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f7760cb713b6bf58c82f38fb043d7d53d82710a
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor_test.cc
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/selected_rows_functor.h"
+#include "gtest/gtest.h"
+#include "paddle/operators/math/math_function.h"
+
+// Exercises the CPU functors: SelectedRowsAdd on two SelectedRows, then
+// SelectedRowsAddTensor on that result and a dense tensor.
+TEST(selected_rows_functor, cpu_add) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  CPUPlace cpu_place;
+  CPUDeviceContext ctx(cpu_place);
+  SetConstant<CPUPlace, float> fill;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  // First operand: rows {0, 4, 7}, every element 1.0.
+  std::vector<int64_t> rows_a{0, 4, 7};
+  std::unique_ptr<SelectedRows> sparse_a{new SelectedRows(rows_a, height)};
+  auto* value_a = sparse_a->mutable_value();
+  value_a->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows_a.size()), row_numel}), cpu_place);
+  fill(ctx, value_a, 1.0);
+
+  // Second operand: rows {0, 5, 7, 9}, every element 2.0.
+  std::vector<int64_t> rows_b{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> sparse_b{new SelectedRows(rows_b, height)};
+  auto* value_b = sparse_b->mutable_value();
+  value_b->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows_b.size()), row_numel}), cpu_place);
+  fill(ctx, value_b, 2.0);
+
+  std::unique_ptr<SelectedRows> sum{new SelectedRows()};
+  auto* sum_value = sum->mutable_value();
+
+  // The sum simply concatenates both operands: 3 + 4 rows of 10 elements.
+  sum_value->mutable_data<float>(make_ddim({7, 10}), cpu_place);
+
+  SelectedRowsAdd<CPUPlace, float> add_functor;
+  add_functor(ctx, *sparse_a, *sparse_b, sum.get());
+
+  EXPECT_EQ(sum->height(), height);
+
+  // Rows of the first operand followed by the rows of the second operand.
+  auto& sum_rows = sum->rows();
+  const std::vector<int64_t> expected_rows{0, 4, 7, 0, 5, 7, 9};
+  for (size_t i = 0; i < expected_rows.size(); ++i) {
+    EXPECT_EQ(sum_rows[i], expected_rows[i]);
+  }
+
+  auto* sum_data = sum->value().data<float>();
+  // values copied from the first operand
+  EXPECT_EQ(sum_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(sum_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(sum_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(sum_data[2 * row_numel + 6], 1.0);
+  // values copied from the second operand
+  EXPECT_EQ(sum_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(sum_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(sum_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(sum_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(sum_data[6 * row_numel + 9], 2.0);
+
+  // Dense operand filled with 3.0 and an allocated dense result.
+  std::unique_ptr<Tensor> dense_in{new Tensor()};
+  dense_in->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+  fill(ctx, dense_in.get(), 3.0);
+
+  std::unique_ptr<Tensor> dense_out{new Tensor()};
+  dense_out->mutable_data<float>(make_ddim({height, row_numel}), cpu_place);
+
+  SelectedRowsAddTensor<CPUPlace, float> add_tensor_functor;
+  add_tensor_functor(ctx, *sum, *dense_in, dense_out.get());
+
+  auto* dense_out_data = dense_out->data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(dense_out_data[1 * row_numel + 1], 3.0);
+  // row4: 1.0 + 3.0
+  EXPECT_EQ(dense_out_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(dense_out_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[9 * row_numel + 6], 5.0);
+}
diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..69607c5afc46921c08ce278bf164e5bed7b446f8
--- /dev/null
+++ b/paddle/operators/math/selected_rows_functor_test.cu
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
+
+// Exercises the GPU functors: SelectedRowsAdd on two SelectedRows, then
+// SelectedRowsAddTensor on that result and a dense tensor. Device results are
+// copied to the CPU before they are inspected.
+TEST(selected_rows_functor, gpu_add) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators::math;
+
+  GPUPlace gpu_place(0);
+  CPUPlace cpu_place;
+  CUDADeviceContext ctx(gpu_place);
+  SetConstant<GPUPlace, float> fill;
+  int64_t height = 10;
+  int64_t row_numel = 10;
+
+  // First operand: rows {0, 4, 7}, every element 1.0.
+  std::vector<int64_t> rows_a{0, 4, 7};
+  std::unique_ptr<SelectedRows> sparse_a{new SelectedRows(rows_a, height)};
+  auto* value_a = sparse_a->mutable_value();
+  value_a->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows_a.size()), row_numel}), gpu_place);
+  fill(ctx, value_a, 1.0);
+
+  // Second operand: rows {0, 5, 7, 9}, every element 2.0.
+  std::vector<int64_t> rows_b{0, 5, 7, 9};
+  std::unique_ptr<SelectedRows> sparse_b{new SelectedRows(rows_b, height)};
+  auto* value_b = sparse_b->mutable_value();
+  value_b->mutable_data<float>(
+      make_ddim({static_cast<int64_t>(rows_b.size()), row_numel}), gpu_place);
+  fill(ctx, value_b, 2.0);
+
+  std::unique_ptr<SelectedRows> sum{new SelectedRows()};
+  auto* sum_value = sum->mutable_value();
+
+  // The sum simply concatenates both operands: 3 + 4 rows of 10 elements.
+  sum_value->mutable_data<float>(make_ddim({7, 10}), gpu_place);
+
+  SelectedRowsAdd<GPUPlace, float> add_functor;
+  add_functor(ctx, *sparse_a, *sparse_b, sum.get());
+
+  EXPECT_EQ(sum->height(), height);
+
+  // Rows of the first operand followed by the rows of the second operand.
+  auto& sum_rows = sum->rows();
+  const std::vector<int64_t> expected_rows{0, 4, 7, 0, 5, 7, 9};
+  for (size_t i = 0; i < expected_rows.size(); ++i) {
+    EXPECT_EQ(sum_rows[i], expected_rows[i]);
+  }
+
+  // Bring the concatenated values to the host and wait for the copy.
+  Tensor sum_on_cpu;
+  sum_on_cpu.CopyFrom(*sum_value, cpu_place, ctx);
+  ctx.Wait();
+
+  auto* sum_data = sum_on_cpu.data<float>();
+  // values copied from the first operand
+  EXPECT_EQ(sum_data[0 * row_numel + 0], 1.0);
+  EXPECT_EQ(sum_data[0 * row_numel + 8], 1.0);
+  EXPECT_EQ(sum_data[1 * row_numel + 1], 1.0);
+  EXPECT_EQ(sum_data[2 * row_numel + 6], 1.0);
+  // values copied from the second operand
+  EXPECT_EQ(sum_data[3 * row_numel + 3], 2.0);
+  EXPECT_EQ(sum_data[3 * row_numel + 8], 2.0);
+  EXPECT_EQ(sum_data[4 * row_numel + 4], 2.0);
+  EXPECT_EQ(sum_data[5 * row_numel + 7], 2.0);
+  EXPECT_EQ(sum_data[6 * row_numel + 9], 2.0);
+
+  // Dense operand filled with 3.0 and an allocated dense result.
+  std::unique_ptr<Tensor> dense_in{new Tensor()};
+  dense_in->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+  fill(ctx, dense_in.get(), 3.0);
+
+  std::unique_ptr<Tensor> dense_out{new Tensor()};
+  dense_out->mutable_data<float>(make_ddim({height, row_numel}), gpu_place);
+
+  SelectedRowsAddTensor<GPUPlace, float> add_tensor_functor;
+  add_tensor_functor(ctx, *sum, *dense_in, dense_out.get());
+
+  // Bring the dense result to the host and wait for the copy.
+  Tensor dense_out_cpu;
+  dense_out_cpu.CopyFrom(*dense_out, cpu_place, ctx);
+  ctx.Wait();
+
+  auto* dense_out_data = dense_out_cpu.data<float>();
+  // row0: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[0 * row_numel + 0], 6.0);
+  // row1: 3.0
+  EXPECT_EQ(dense_out_data[1 * row_numel + 1], 3.0);
+  // row4: 1.0 + 3.0
+  EXPECT_EQ(dense_out_data[4 * row_numel + 6], 4.0);
+  // row5: 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[5 * row_numel + 7], 5.0);
+  // row6: 3.0
+  EXPECT_EQ(dense_out_data[6 * row_numel + 1], 3.0);
+  // row7: 1.0 + 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[7 * row_numel + 3], 6.0);
+  // row9: 2.0 + 3.0
+  EXPECT_EQ(dense_out_data[9 * row_numel + 6], 5.0);
+}
diff --git a/paddle/operators/math/sequence2batch.cc b/paddle/operators/math/sequence2batch.cc
new file mode 100644
index 0000000000000000000000000000000000000000..10c6e105b950b9d510e7a14828d72531e8eb0028
--- /dev/null
+++ b/paddle/operators/math/sequence2batch.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// CPU implementation of CopyMatrixRowsFunctor: copies one full row per
+// iteration with memcpy, for every row of dst.
+template <typename T>
+class CopyMatrixRowsFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& src, const size_t* index,
+                  framework::LoDTensor& dst, bool is_src_index) {
+    auto src_dims = src.dims();
+    auto dst_dims = dst.dims();
+    PADDLE_ENFORCE_EQ(src_dims.size(), 2UL,
+                      "The src must be matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(dst_dims.size(), 2UL,
+                      "The dst must be matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
+                      "The width of src and dst must be same.");
+    auto num_rows = dst_dims[0];
+    auto row_width = dst_dims[1];
+    auto* src_base = src.data<T>();
+    auto* dst_base = dst.data<T>();
+    const size_t row_bytes = row_width * sizeof(T);
+    for (int i = 0; i < num_rows; ++i) {
+      // Exactly one side of the copy is addressed through index[i]; the
+      // other side uses the loop position itself.
+      const size_t row = static_cast<size_t>(i);
+      const size_t src_row = is_src_index ? index[i] : row;
+      const size_t dst_row = is_src_index ? row : index[i];
+      memcpy(dst_base + dst_row * row_width, src_base + src_row * row_width,
+             row_bytes);
+    }
+  }
+};
+
+template class CopyMatrixRowsFunctor<platform::CPUPlace, float>;
+template class CopyMatrixRowsFunctor<platform::CPUPlace, double>;
+
+template class LoDTensor2BatchFunctor<platform::CPUPlace, float>;
+template class LoDTensor2BatchFunctor<platform::CPUPlace, double>;
+template class Batch2LoDTensorFunctor<platform::CPUPlace, float>;
+template class Batch2LoDTensorFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence2batch.cu b/paddle/operators/math/sequence2batch.cu
new file mode 100644
index 0000000000000000000000000000000000000000..4f349946785171e6c59b22163ba76791c7244f88
--- /dev/null
+++ b/paddle/operators/math/sequence2batch.cu
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/sequence2batch.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Copies whole rows between two matrices of `width` elements per row.
+// For every row position `row` below `height`:
+//   is_src_index == true : dst[row]        <- src[index[row]]
+//   is_src_index == false: dst[index[row]] <- src[row]
+// BlockDimX, BlockDimY and GridDimX are used as the loop strides, so they are
+// expected to match the launch configuration.
+template <typename T, int BlockDimX, int BlockDimY, int GridDimX>
+__global__ void CopyMatrixRowsKernel(const T* src, T* dst, const size_t* index,
+                                     int64_t height, int64_t width,
+                                     bool is_src_index) {
+  const int lane = threadIdx.x;
+  const int row_stride = BlockDimY * GridDimX;
+  // Rows are distributed over (blockIdx.x, threadIdx.y); the elements of one
+  // row are distributed over threadIdx.x.
+  for (int row = blockIdx.x + threadIdx.y * GridDimX; row < height;
+       row += row_stride) {
+    int src_row = is_src_index ? index[row] : row;
+    int dst_row = is_src_index ? row : index[row];
+    const T* src_data = src + src_row * width;
+    T* dst_data = dst + dst_row * width;
+    for (int col = lane; col < width; col += BlockDimX) {
+      dst_data[col] = src_data[col];
+    }
+  }
+}
+
+// GPU implementation of CopyMatrixRowsFunctor: validates the shapes and
+// launches CopyMatrixRowsKernel on the stream of the given device context.
+template <typename T>
+class CopyMatrixRowsFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& src, const size_t* index,
+                  framework::LoDTensor& dst, bool is_src_index) {
+    auto src_dims = src.dims();
+    auto dst_dims = dst.dims();
+    PADDLE_ENFORCE_EQ(src_dims.size(), 2,
+                      "The src must be matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(dst_dims.size(), 2,
+                      "The dst must be matrix with rank 2.");
+    PADDLE_ENFORCE_EQ(src_dims[1], dst_dims[1],
+                      "The width of src and dst must be same.");
+    auto num_rows = dst_dims[0];
+    auto row_width = dst_dims[1];
+    auto* src_base = src.data<T>();
+    auto* dst_base = dst.data<T>();
+
+    // The launch shape is named once so that the kernel's template
+    // arguments and the actual grid/block dimensions cannot drift apart.
+    const int kBlockDimX = 128;
+    const int kBlockDimY = 8;
+    const int kGridDimX = 8;
+    dim3 threads(kBlockDimX, kBlockDimY);
+    dim3 grid(kGridDimX, 1);
+    auto stream =
+        reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
+    CopyMatrixRowsKernel<T, kBlockDimX, kBlockDimY,
+                         kGridDimX><<<grid, threads, 0, stream>>>(
+        src_base, dst_base, index, num_rows, row_width, is_src_index);
+  }
+};
+
+template class CopyMatrixRowsFunctor<platform::GPUPlace, float>;
+template class CopyMatrixRowsFunctor<platform::GPUPlace, double>;
+
+template class LoDTensor2BatchFunctor<platform::GPUPlace, float>;
+template class LoDTensor2BatchFunctor<platform::GPUPlace, double>;
+template class Batch2LoDTensorFunctor<platform::GPUPlace, float>;
+template class Batch2LoDTensorFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/sequence2batch.h b/paddle/operators/math/sequence2batch.h
new file mode 100644
index 0000000000000000000000000000000000000000..03cd018e46e90c9bbe689c9686377e0e998ee513
--- /dev/null
+++ b/paddle/operators/math/sequence2batch.h
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Copies rows between two rank-2 LoDTensors according to an index array.
+// Only the call operator is declared in this primary template; the bodies
+// are supplied by per-Place specializations.
+template <typename Place, typename T>
+class CopyMatrixRowsFunctor {
+ public:
+  // If is_src_index is true,
+  // copy the indexed rows of input src to the output dst.
+  // If is_src_index is false,
+  // copy the input src to the indexed rows of output dst.
+  // The indexed rows are based on the input index.
+  //
+  //   context: device context the copy runs on.
+  //   src:     source matrix.
+  //   index:   array of row indices; the specializations read one entry for
+  //            each row of dst.
+  //   dst:     destination matrix, written in place.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& src, const size_t* index,
+                  framework::LoDTensor& dst, bool is_src_index);
+};
+
+// Reorders the rows of a one-level LoDTensor of sequences so that rows
+// belonging to the same time step of different sequences become contiguous
+// ("batch" layout), and records the reordering in the LoD of `batch`:
+//   batch.lod()[0]: start offset of every time-step batch,
+//   batch.lod()[1]: for every row of `batch`, its row in `lod_tensor`.
+template <typename Place, typename T>
+class LoDTensor2BatchFunctor {
+  // Calculate the length of each sequence and
+  // sort sequence index by the length.
+  // example:  sequences = {s0, s1, s2}
+  //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+  //           seq_info[3] = {(4, 5, 1), (0, 4, 0), (9, 3, 2)}
+  //
+  struct SeqInfo {
+    SeqInfo(int start, int length, int seq_idx)
+        : start(start), length(length), seq_idx(seq_idx) {}
+    int start;    // offset of the first row of the sequence
+    int length;   // number of rows in the sequence
+    int seq_idx;  // position of the sequence in the input LoD
+  };
+
+ public:
+  // context:    device context used for the row copy.
+  // lod_tensor: input sequences; must carry exactly one LoD level holding at
+  //             least one sequence.
+  // batch:      output; its LoD is overwritten and its rows are filled.
+  // is_reverse: if true, every sequence is traversed from its last row to
+  //             its first.
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& lod_tensor,
+                  framework::LoDTensor& batch, bool is_reverse) const {
+    auto lods = lod_tensor.lod();
+    PADDLE_ENFORCE_EQ(lods.size(), 1UL, "Only support one level sequence now.");
+    auto lod = lods[0];
+    // A LoD level needs at least two offsets to describe one sequence.
+    // Without this check `lod.size() - 1` can wrap around and seq_info[0]
+    // below would be read from an empty vector.
+    PADDLE_ENFORCE(lod.size() > 1UL,
+                   "The LoD of the input must contain at least one sequence.");
+
+    std::vector<SeqInfo> seq_info;
+    seq_info.reserve(lod.size() - 1);
+    for (size_t seq_id = 0; seq_id < lod.size() - 1; ++seq_id) {
+      int length = lod[seq_id + 1] - lod[seq_id];
+      seq_info.emplace_back(lod[seq_id], length, seq_id);
+    }
+
+    // Longest sequence first.
+    std::sort(seq_info.begin(), seq_info.end(),
+              [](const SeqInfo& a, const SeqInfo& b) {
+                return a.length > b.length;
+              });
+
+    // calculate the start position of each batch
+    // (numBatch equal the maxLength of sequences)
+    // example:  sequences = {s0, s1, s2}
+    //           s0: 0 0 0 0, s1: 1 1 1 1 1, s2: 2 2 2
+    //           num_batch = 5,
+    //           batchIndex = {b0, b1, b2, b3, b4}
+    //           b0: 1 0 2, b1: 1 0 2, b2: 1 0 2, b3: 1 0, b4: 1
+    //           batch_start_positions[6] = {0, 3, 6, 9, 11, 12}
+    //              batch_start_positions[0] = 0
+    //              batch_start_positions[1] = len(b0)
+    //              batch_start_positions[2] = len(b0) + len(b1)
+    //              ...
+    //           seq2batch_idx[12] = {4, 0, 9,
+    //                                5, 1, 10,
+    //                                6, 2, 11,
+    //                                7, 3,
+    //                                8}
+    // The batch number represents batch size after rearranging the
+    // input LodTensor. It is also the maximum length of input sequence.
+
+    paddle::framework::LoD batch_lods;
+    batch_lods.emplace_back(std::vector<size_t>{0});
+    batch_lods.emplace_back(std::vector<size_t>{0});
+
+    // batch_lods[0] is the start positions for batch LoDTensor
+    const size_t num_batch = static_cast<size_t>(seq_info[0].length);
+    batch_lods[0].resize(num_batch + 1);
+    // batch_lods[1] is the raw index in the input LoDTensor
+    auto dims = lod_tensor.dims();
+    batch_lods[1].resize(static_cast<size_t>(dims[0]));
+
+    size_t* batch_starts = batch_lods[0].data();
+    size_t* seq2batch_idx = batch_lods[1].data();
+    batch_starts[0] = 0;
+    for (size_t n = 0; n < num_batch; n++) {
+      auto batch_id = static_cast<int>(batch_starts[n]);
+      for (size_t i = 0; i < seq_info.size(); ++i) {
+        size_t seq_len = seq_info[i].length;
+        int start = seq_info[i].start;
+        if (n < seq_len) {
+          seq2batch_idx[batch_id] =
+              is_reverse ? start + seq_len - 1 - n : start + n;
+          batch_id++;
+        } else {
+          // seq_info is sorted by decreasing length, so no later sequence
+          // reaches time step n either.
+          break;
+        }
+      }
+      batch_starts[n + 1] = static_cast<size_t>(batch_id);
+    }
+    batch.set_lod(batch_lods);
+
+    // Gather the rows of lod_tensor into batch order.
+    CopyMatrixRowsFunctor<Place, T> to_batch;
+    to_batch(context, lod_tensor, seq2batch_idx, batch, true);
+  }
+};
+
+// Scatters the rows of `batch` into `lod_tensor`, using the second LoD level
+// of `batch` as the destination row of each batch row.
+template <typename Place, typename T>
+class Batch2LoDTensorFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::LoDTensor& batch,
+                  framework::LoDTensor& lod_tensor) const {
+    auto batch_lod = batch.lod();
+    PADDLE_ENFORCE_EQ(batch_lod.size(), 2UL,
+                      "The LoD size of input `batch` should be 2.");
+    // The last offset of the output's first LoD level is its total row
+    // count; the output, the index level and the batch must all agree on it.
+    auto seq_offsets = lod_tensor.lod()[0];
+    auto total_rows = seq_offsets.back();
+    PADDLE_ENFORCE_EQ(total_rows, lod_tensor.dims()[0]);
+    PADDLE_ENFORCE_EQ(total_rows, batch_lod[1].size());
+    PADDLE_ENFORCE_EQ(total_rows, batch.dims()[0]);
+    size_t* row_index = batch_lod[1].data();
+    // is_src_index == false: row i of batch goes to row row_index[i].
+    CopyMatrixRowsFunctor<Place, T> to_seq;
+    to_seq(context, batch, row_index, lod_tensor, false);
+  }
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/softmax.cc b/paddle/operators/math/softmax.cc
index ac9f3c4bf61bf8e13faa17387f1112756db9a100..0ba8197ab8b64649c8adcf67771ba01eca7f1d10 100644
--- a/paddle/operators/math/softmax.cc
+++ b/paddle/operators/math/softmax.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/math/softmax.h"
 
@@ -19,6 +19,7 @@ namespace operators {
 namespace math {
 
 template class SoftmaxFunctor<platform::CPUPlace, float>;
+template class SoftmaxGradFunctor<platform::CPUPlace, float>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.cu b/paddle/operators/math/softmax.cu
index 4c3df0550e7ca6f4310db1d35cc34d5c73a2dd16..99f988d51e4b16c3f3bfd9c76b411bb53619603e 100644
--- a/paddle/operators/math/softmax.cu
+++ b/paddle/operators/math/softmax.cu
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #define EIGEN_USE_GPU
 
@@ -21,6 +21,7 @@ namespace operators {
 namespace math {
 
 template class SoftmaxFunctor<platform::GPUPlace, float>;
+template class SoftmaxGradFunctor<platform::GPUPlace, float>;
 
 }  // namespace math
 }  // namespace operators
diff --git a/paddle/operators/math/softmax.h b/paddle/operators/math/softmax.h
index 3d2f0d0aecffcd0fe51166c3d863aa8b91bba196..b7f627eee7f8fe68a83595a3390a55d438c97afb 100644
--- a/paddle/operators/math/softmax.h
+++ b/paddle/operators/math/softmax.h
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #pragma once
 #include "paddle/framework/eigen.h"
@@ -36,7 +36,7 @@ struct ValueClip {
 template <typename Place, typename T>
 class SoftmaxFunctor {
  public:
-  void operator()(const framework::ExecutionContext& context,
+  void operator()(const platform::DeviceContext& context,
                   const framework::Tensor* X, framework::Tensor* Y) {
     auto logits = EigenMatrix<T>::From(*X);
     auto softmax = EigenMatrix<T>::From(*Y);
@@ -58,8 +58,8 @@ class SoftmaxFunctor {
                                .broadcast(one_by_class))
                               .unaryExpr(ValueClip<T>());
 
-    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
-    softmax.device(context.GetEigenDevice<Place>()) =
+    softmax.device(*context.GetEigenDevice<Place>()) = shifted_logits.exp();
+    softmax.device(*context.GetEigenDevice<Place>()) =
         (softmax *
          softmax.sum(along_class)
              .inverse()
@@ -68,6 +68,37 @@ class SoftmaxFunctor {
              .broadcast(one_by_class));
   }
 };
+
+template <typename Place, typename T>
+class SoftmaxGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor* y, const framework::Tensor* y_grad,
+                  framework::Tensor* x_grad) {
+    auto softmax = EigenMatrix<T>::From(*y);
+    auto softmax_grad = EigenMatrix<T>::From(*y_grad);
+    auto logits_grad = EigenMatrix<T>::From(*x_grad);
+    // Computes x_grad = (y_grad - rowsum(y * y_grad)) * y, element-wise.
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = softmax.dimension(kBatchDim);
+    const int num_classes = softmax.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+    // Sum of y * y_grad over the class dimension, broadcast back to 2-D.
+    auto dot = (softmax * softmax_grad)
+                   .sum(along_class)
+                   .eval()
+                   .reshape(batch_by_one)
+                   .broadcast(one_by_class);
+    logits_grad.device(*context.GetEigenDevice<Place>()) =
+        (softmax_grad - dot) * softmax;
+  }
+};
+
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cc b/paddle/operators/math/vol2col.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9718a047381596a1570b4b00546622968b70227
--- /dev/null
+++ b/paddle/operators/math/vol2col.cc
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+/*
+ * CPU vol2col: copies every filter window of vol into col; padding reads as 0.
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col = [input_channels, filter_depth, filter_height, filter_width,
+ *        output_depth, output_height, output_width]
+ */
+template <class T>
+class Vol2ColFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& vol, framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+    // Filter and output sizes come from col's dimensions 1-3 and 4-6.
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    const T* vol_data = vol.data<T>();
+    T* col_data = col.data<T>();
+    // c flattens (c_in, d_offset, h_offset, w_offset), w_offset fastest.
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int c_in = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * stride_depth - padding_depth + d_offset;
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * stride_height - padding_height + h_offset;
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * stride_width - padding_width + w_offset;
+            // Positions that fall in the padding are written as zero.
+            int col_idx =
+                ((c * output_depth + d) * output_height + h) * output_width + w;
+            if (h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
+                w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) {
+              col_data[col_idx] = static_cast<T>(0);
+            } else {
+              int vol_idx =
+                  ((c_in * input_depth + d_pad) * input_height + h_pad) *
+                      input_width +
+                  w_pad;
+              col_data[col_idx] = vol_data[vol_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * CPU col2vol: adds (+=) each col entry into the vol element it maps to.
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col = [input_channels, filter_depth, filter_height, filter_width,
+ *        output_depth, output_height, output_width]
+ */
+template <class T>
+class Col2VolFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& vol, const framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+    // vol is only accumulated into (+=) below; it is not zeroed here.
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    int channels_col =
+        input_channels * filter_depth * filter_height * filter_width;
+
+    T* vol_data = vol.data<T>();
+    const T* col_data = col.data<T>();
+    // c flattens (cIm, d_offset, h_offset, w_offset), w_offset fastest.
+    for (int c = 0; c < channels_col; ++c) {
+      int w_offset = c % filter_width;
+      int h_offset = (c / filter_width) % filter_height;
+      int d_offset = (c / filter_width / filter_height) % filter_depth;
+      int cIm = c / filter_width / filter_height / filter_depth;
+      for (int d = 0; d < output_depth; ++d) {
+        int d_pad = d * stride_depth - padding_depth + d_offset;
+        for (int h = 0; h < output_height; ++h) {
+          int h_pad = h * stride_height - padding_height + h_offset;
+          for (int w = 0; w < output_width; ++w) {
+            int w_pad = w * stride_width - padding_width + w_offset;
+            // Entries that map into the padding are skipped.
+            if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
+                w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
+              int vol_idx =
+                  ((cIm * input_depth + d_pad) * input_height + h_pad) *
+                      input_width +
+                  w_pad;
+              int col_idx =
+                  ((c * output_depth + d) * output_height + h) * output_width +
+                  w;
+              vol_data[vol_idx] += col_data[col_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+};
+// Explicitly instantiate the CPU functors for float and double.
+template class Vol2ColFunctor<platform::CPUPlace, float>;
+template class Vol2ColFunctor<platform::CPUPlace, double>;
+template class Col2VolFunctor<platform::CPUPlace, float>;
+template class Col2VolFunctor<platform::CPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.cu b/paddle/operators/math/vol2col.cu
new file mode 100644
index 0000000000000000000000000000000000000000..27b11fb237575fd25a789a5fcc24ed4e30607009
--- /dev/null
+++ b/paddle/operators/math/vol2col.cu
@@ -0,0 +1,204 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+
+// Each loop iteration unfolds one (c, d_out, h_out, w_out) filter window.
+template <class T>
+__global__ void vol2col(int num_kernels, const T* data_vol, int depth,
+                        int height, int width, int filter_depth,
+                        int filter_height, int filter_width, int stride_depth,
+                        int stride_height, int stride_width, int padding_depth,
+                        int padding_height, int padding_width, int output_detph,
+                        int output_height, int output_width, T* data_col) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    int w_out = index % output_width;
+    int h_out = (index / output_width) % output_height;
+    int d_out = (index / output_width / output_height) % output_detph;
+    int channel_in = index / output_width / output_height / output_detph;
+    int channel_out = channel_in * filter_depth * filter_height * filter_width;
+    int w_in = w_out * stride_width - padding_width;
+    int h_in = h_out * stride_height - padding_height;
+    int d_in = d_out * stride_depth - padding_depth;
+    // Offsets are rebuilt per iteration so data_col/data_vol keep their bases.
+    int col_step = output_detph * output_height * output_width;
+    int col_off = channel_out * col_step + index % col_step;  // + output pos
+    int vol_off = ((channel_in * depth + d_in) * height + h_in) * width + w_in;
+    for (int k = 0; k < filter_depth; ++k) {
+      for (int i = 0; i < filter_height; ++i) {
+        for (int j = 0; j < filter_width; ++j) {
+          int d = d_in + k;
+          int h = h_in + i;
+          int w = w_in + j;
+          bool inside = d >= 0 && d < depth && h >= 0 && h < height &&
+                        w >= 0 && w < width;
+          data_col[col_off] =
+              inside ? data_vol[vol_off + (k * height + i) * width + j] : 0;
+          col_off += col_step;
+        }
+      }
+    }
+  }
+}
+
+/*
+ * GPU vol2col: launches the vol2col kernel on the context's CUDA stream.
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col = [input_channels, filter_depth, filter_height, filter_width,
+ *        output_depth, output_height, output_width]
+ */
+template <class T>
+class Vol2ColFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& vol, framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    // One kernel index per (channel, output depth, output height, output width).
+    int num_outputs =
+        input_channels * output_depth * output_height * output_width;
+    // ceil(num_outputs / 1024) blocks of 1024 threads each.
+    const int threads = 1024;
+    const int blocks = (num_outputs + 1024 - 1) / 1024;
+    vol2col<T><<<blocks, threads, 0,
+                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                     .stream()>>>(
+        num_outputs, vol.data<T>(), input_depth, input_height, input_width,
+        filter_depth, filter_height, filter_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        output_depth, output_height, output_width, col.data<T>());
+  }
+};
+
+template <class T>
+__global__ void col2vol(int num_kernels, const T* data_col, int depth,
+                        int height, int width, int filter_depth,
+                        int filter_height, int filter_width, int stride_depth,
+                        int stride_height, int stride_width, int padding_depth,
+                        int padding_height, int padding_width, int output_detph,
+                        int output_height, int output_width, T* data_vol) {
+  for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < num_kernels;
+       index += blockDim.x * gridDim.x) {
+    T src_val = 0;
+    int w = index % width + padding_width;
+    int h = (index / width) % height + padding_height;
+    int d = (index / width / height) % depth + padding_depth;
+    int c = index / width / height / depth;
+    // Output ranges [start, end) whose filter window covers padded (d, h, w).
+    int w_col_start =
+        (w < filter_width) ? 0 : (w - filter_width) / stride_width + 1;
+    int w_col_end = min(w / stride_width + 1, output_width);
+    int h_col_start =
+        (h < filter_height) ? 0 : (h - filter_height) / stride_height + 1;
+    int h_col_end = min(h / stride_height + 1, output_height);
+    int d_col_start =
+        (d < filter_depth) ? 0 : (d - filter_depth) / stride_depth + 1;
+    int d_col_end = min(d / stride_depth + 1, output_detph);
+    // data_col index = offset + d_col*coeff_d_col + h_col*coeff_h_col + ...
+    int offset = (c * filter_depth * filter_height * filter_width +
+                  d * filter_width * filter_height + h * filter_width + w) *
+                 output_detph * output_height * output_width;
+    // Each coeff = +1 output step minus `stride` filter-tap steps on that axis.
+    int coeff_d_col =
+        (1 - stride_depth * filter_width * filter_height * output_detph) *
+        output_height * output_width;
+    int coeff_h_col =
+        (1 - stride_height * filter_width * output_detph * output_height) *
+        output_width;
+    int coeff_w_col =
+        (1 - stride_width * output_detph * output_height * output_width);
+    // NOTE(review): overwrites data_vol[index]; the CPU functor accumulates.
+    for (int d_col = d_col_start; d_col < d_col_end; ++d_col) {
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          src_val += data_col[offset + d_col * coeff_d_col +
+                              h_col * coeff_h_col + w_col * coeff_w_col];
+        }
+      }
+    }
+    data_vol[index] = src_val;
+  }
+}
+
+/*
+ * GPU col2vol: launches the col2vol kernel on the context's CUDA stream.
+ * vol = [input_channels, input_depth, input_height, input_width]
+ * col = [input_channels, filter_depth, filter_height, filter_width,
+ *        output_depth, output_height, output_width]
+ */
+template <class T>
+class Col2VolFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& vol, const framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const {
+    PADDLE_ENFORCE(vol.dims().size() == 4);
+    PADDLE_ENFORCE(col.dims().size() == 7);
+
+    int input_channels = vol.dims()[0];
+    int input_depth = vol.dims()[1];
+    int input_height = vol.dims()[2];
+    int input_width = vol.dims()[3];
+    int filter_depth = col.dims()[1];
+    int filter_height = col.dims()[2];
+    int filter_width = col.dims()[3];
+    int output_depth = col.dims()[4];
+    int output_height = col.dims()[5];
+    int output_width = col.dims()[6];
+    // One kernel index per element of vol.
+    int num_kernels = input_channels * input_depth * input_height * input_width;
+    // ceil(num_kernels / 1024) blocks of 1024 threads each.
+    const int threads = 1024;
+    const int blocks = (num_kernels + 1024 - 1) / 1024;
+
+    col2vol<T><<<blocks, threads, 0,
+                 reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                     .stream()>>>(
+        num_kernels, col.data<T>(), input_depth, input_height, input_width,
+        filter_depth, filter_height, filter_width, stride_depth, stride_height,
+        stride_width, padding_depth, padding_height, padding_width,
+        output_depth, output_height, output_width, vol.data<T>());
+  }
+};
+// Explicitly instantiate the GPU functors for float and double.
+template class Vol2ColFunctor<platform::GPUPlace, float>;
+template class Vol2ColFunctor<platform::GPUPlace, double>;
+template class Col2VolFunctor<platform::GPUPlace, float>;
+template class Col2VolFunctor<platform::GPUPlace, double>;
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h
new file mode 100644
index 0000000000000000000000000000000000000000..f022365a16fbf61981e94bedbd8b21a32887b235
--- /dev/null
+++ b/paddle/operators/math/vol2col.h
@@ -0,0 +1,78 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace operators {
+namespace math {
+/*
+ * \brief Vol2ColFunctor unfolds a 4-D volume (CDHW) into a 7-D column tensor;
+ *        Col2VolFunctor performs the reverse mapping, from the column tensor
+ *        back into the volume.
+ *
+ * \param vol  The volume tensor, with shape
+ *             [input_channels, input_depth, input_height, input_width].
+ * \param col  The column tensor (shape described below).
+ * \param stride_*   Stride along depth, height and width.
+ * \param padding_*  Padding along depth, height and width.
+ *
+ * The shape of col is:
+ * [input_channels, filter_depth, filter_height, filter_width, output_depth,
+ * output_height, output_width]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of the convolution matrix is [height, width], where the height is
+ * input_channels * filter_depth * filter_height * filter_width, and the width
+ * is output_depth * output_height * output_width.
+ *
+ * Reshape:
+ *     shape of col               shape of convolution matrix
+ *     [input_channels,
+ *      filter_depth,
+ *      filter_height,
+ *      filter_width,      ======>      [height, width]
+ *      output_depth,
+ *      output_height,
+ *      output_width]
+ *
+ * \note The caller needs to ensure that vol.dims()[0] is equal to
+ *       col.dims()[0] (both are input_channels).
+ */
+template <typename Place, typename T>
+class Vol2ColFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& vol, framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const;
+};
+// Reverse of Vol2ColFunctor: vol is the output here, col the input.
+template <typename Place, typename T>
+class Col2VolFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  framework::Tensor& vol, const framework::Tensor& col,
+                  int stride_depth, int stride_height, int stride_width,
+                  int padding_depth, int padding_height,
+                  int padding_width) const;
+};
+
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..74590d17cd0f974f830e760d85daef8ab5318a43
--- /dev/null
+++ b/paddle/operators/math/vol2col_test.cc
@@ -0,0 +1,135 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/math/vol2col.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+template <typename Place>
+void testVol2col() {
+  paddle::framework::Tensor input;
+  paddle::framework::Tensor input_tmp;
+  paddle::framework::Tensor output;
+  paddle::framework::Tensor output_tmp;
+  // Round-trips a 1x2x2x3 volume through Vol2ColFunctor and Col2VolFunctor.
+  auto* place = new Place();
+  paddle::platform::DeviceContext* context;
+  if (paddle::platform::is_cpu_place(*place)) {
+    context =
+        new paddle::platform::CPUDeviceContext(paddle::platform::CPUPlace());
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    context =
+        new paddle::platform::CUDADeviceContext(paddle::platform::GPUPlace());
+#else
+    PADDLE_THROW("no GPU support");
+#endif  // PADDLE_WITH_CUDA
+  }
+  // NOTE(review): place and context are never deleted in this function.
+  /**
+   * input = [[0, 1, 2,
+   *          3, 4, 5]
+   *          [6, 7, 8,
+   *          9, 10, 11]]
+   *
+   * output = [0, 1
+   *           1, 2
+   *           3, 4
+   *           4, 5
+   *           6, 7
+   *           7, 8
+   *           9, 10
+   *           10, 11]
+   *
+   * col2vol = [[0, 2, 2,
+   *             3, 8, 5]
+   *            [6, 14, 8,
+   *             9, 20, 11]]
+   *
+   */
+  int input_depth = 2;
+  int input_height = 2;
+  int input_width = 3;
+  int filter_size = 2;
+  int stride = 1;
+  int padding = 0;
+  int output_depth = (input_depth - filter_size + 2 * padding) / stride + 1;
+  int output_height = (input_height - filter_size + 2 * padding) / stride + 1;
+  int output_width = (input_width - filter_size + 2 * padding) / stride + 1;
+
+  // Vol2Col test
+  float* input_ptr =
+      input_tmp.mutable_data<float>({1, input_depth, input_height, input_width},
+                                    paddle::platform::CPUPlace());
+  float arr[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+  memcpy(input_ptr, arr, 12 * sizeof(float));
+  // For a non-CPU place, copy the host tensor over to that place.
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom(input_tmp, *place, *context);
+  }
+  output.mutable_data<float>({1, filter_size, filter_size, filter_size,
+                              output_depth, output_height, output_width},
+                             *place);
+
+  paddle::operators::math::Vol2ColFunctor<Place, float> vol2col;
+  vol2col(*context, input, output, stride, stride, stride, padding, padding,
+          padding);
+  // Expected col contents, flattened in the order documented above.
+  float vol_2_col[] = {0, 1, 1, 2, 3, 4, 4, 5, 6, 7, 7, 8, 9, 10, 10, 11};
+  float* out_cfo_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    out_cfo_ptr = output.data<float>();
+  } else {
+    output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context);
+    out_cfo_ptr = output_tmp.data<float>();
+  }
+
+  for (int i = 0; i < 16; ++i) {
+    EXPECT_EQ(out_cfo_ptr[i], vol_2_col[i]);
+  }
+
+  // Col2Vol test
+  float col_2_vol[] = {0, 2, 2, 3, 8, 5, 6, 14, 8, 9, 20, 11};
+  memset(input_ptr, 0, 12 * sizeof(float));
+  if (paddle::platform::is_cpu_place(*place)) {
+    input = input_tmp;
+  } else {
+    input.CopyFrom(input_tmp, *place, *context);
+  }
+
+  paddle::operators::math::Col2VolFunctor<Place, float> col2vol;
+  col2vol(*context, input, output, stride, stride, stride, padding, padding,
+          padding);
+  // For a non-CPU place, copy the result back to the host before comparing.
+  float* in_ptr;
+  if (paddle::platform::is_cpu_place(*place)) {
+    in_ptr = input.data<float>();
+  } else {
+    input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context);
+    in_ptr = input_tmp.data<float>();
+  }
+
+  for (int i = 0; i < 12; ++i) {
+    EXPECT_EQ(in_ptr[i], col_2_vol[i]);
+  }
+}
+// Runs the check for CPUPlace, and for GPUPlace when built with CUDA.
+TEST(math, vol2col) {
+  testVol2col<paddle::platform::CPUPlace>();
+#ifdef PADDLE_WITH_CUDA
+  testVol2col<paddle::platform::GPUPlace>();
+#endif  // PADDLE_WITH_CUDA
+}
diff --git a/paddle/operators/matmul_op.cc b/paddle/operators/matmul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5ecbee3b413617e3a5523d9a32e72bc08bd316c5
--- /dev/null
+++ b/paddle/operators/matmul_op.cc
@@ -0,0 +1,208 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/matmul_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class MatMulOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"),
+                   "Input(X) of MatMulOp should not be null.");
+    PADDLE_ENFORCE(context->HasInput("Y"),
+                   "Input(Y) of MatMulOp should not be null.");
+    PADDLE_ENFORCE(context->HasOutput("Out"),
+                   "Output(Out) of MatMulOp should not be null.");
+
+    auto dim_x = context->GetInputDim("X");
+    auto dim_y = context->GetInputDim("Y");
+    bool transpose_x = context->Attrs().Get<bool>("transpose_X");
+    bool transpose_y = context->Attrs().Get<bool>("transpose_Y");
+
+    PADDLE_ENFORCE_GE(dim_x.size(), 1,
+                      "Input tensor X must be at least 1-dimensional.");
+    PADDLE_ENFORCE_GE(dim_y.size(), 1,
+                      "Input tensor Y must be at least 1-dimensional.");
+    PADDLE_ENFORCE_LE(dim_x.size(), 3,
+                      "Input tensor X must be at most 3-dimensional.");
+    PADDLE_ENFORCE_LE(dim_y.size(), 3,
+                      "Input tensor Y must be at most 3-dimensional.");
+    // X is viewed as [batchCountX,] M x KX and Y as [batchCountY,] KY x N.
+    int M = 0, N = 0, KX = 0, KY = 0, batchCountX = 0, batchCountY = 0;
+    bool remove_initial_dim = false, remove_final_dim = false;
+    // Rank-1 X is treated as [1, D], or as [D, 1] when transposed.
+    switch (dim_x.size()) {
+      case 1:
+        if (transpose_x) {
+          M = dim_x[0];
+          KX = 1;
+        } else {
+          M = 1;
+          KX = dim_x[0];
+          remove_initial_dim = true;
+        }
+        break;
+      case 2:
+        M = transpose_x ? dim_x[1] : dim_x[0];
+        KX = transpose_x ? dim_x[0] : dim_x[1];
+        break;
+      case 3:
+        batchCountX = dim_x[0];
+        M = transpose_x ? dim_x[2] : dim_x[1];
+        KX = transpose_x ? dim_x[1] : dim_x[2];
+        break;
+      default:
+        assert(false);
+    }
+    // Rank-1 Y is treated as [D, 1], or as [1, D] when transposed.
+    switch (dim_y.size()) {
+      case 1:
+        if (transpose_y) {
+          N = dim_y[0];
+          KY = 1;
+        } else {
+          N = 1;
+          KY = dim_y[0];
+          remove_final_dim = true;
+        }
+        break;
+      case 2:
+        KY = transpose_y ? dim_y[1] : dim_y[0];
+        N = transpose_y ? dim_y[0] : dim_y[1];
+        break;
+      case 3:
+        batchCountY = dim_y[0];
+        KY = transpose_y ? dim_y[2] : dim_y[1];
+        N = transpose_y ? dim_y[1] : dim_y[2];
+        break;
+      default:
+        assert(false);
+    }
+    // Inner dimensions must agree; so must batch sizes when both are rank-3.
+    PADDLE_ENFORCE_EQ(
+        KX, KY,
+        "First matrix's width must be equal with second matrix's height.");
+    if (batchCountX && batchCountY) {
+      PADDLE_ENFORCE_EQ(
+          batchCountX, batchCountY,
+          "When Input(X) and Input(Y) are both three dimensional, they "
+          "must have the same batch dimension.");
+    }
+    int batchCount = std::max(batchCountX, batchCountY);
+    // Out = [batchCount,] [M,] [N]; unit dims added for rank-1 are dropped.
+    std::vector<int64_t> dim_out;
+    if (batchCount) {
+      dim_out.push_back(batchCount);
+    }
+    if (!remove_initial_dim) {
+      dim_out.push_back(M);
+    }
+    if (!remove_final_dim) {
+      dim_out.push_back(N);
+    }
+    if (dim_out.size() == 0) {
+      // We don't support 0-dimensional Tensors (scalars), so instead
+      // treat the output as a Tensor of shape (1, ) in this case.
+      dim_out.push_back(1);
+    }
+    context->SetOutputDim("Out", framework::make_ddim(dim_out));
+    context->ShareLoD("X", /*->*/ "Out");
+  }
+};
+// Declares matmul's inputs, output, transpose attributes and op comment.
+class MatMulOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MatMulOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of MatMul op");
+    AddInput("Y", "The second input of MatMul op");
+    AddOutput("Out", "The output of MatMul op");
+    AddAttr<bool>("transpose_X",
+                  R"DOC(If true, use the transpose of `X`.
+        )DOC")
+        .SetDefault(false);
+    AddAttr<bool>("transpose_Y",
+                  R"DOC(If true, use the transpose of `Y`.
+        )DOC")
+        .SetDefault(false);
+    AddComment(R"DOC(
+The MatMul operator is used to perform (batched) matrix multiplication
+over the last two dimensions of the input tensors `X` and `Y`.
+
+If a transpose flag is specified, the last two dimensions of the
+tensor are transposed. If the tensor is rank-1 of shape [D], then
+for `X` it is treated as [1, D] in nontransposed form and as [D, 1]
+in transposed form, whereas for `Y` it is the opposite: It is treated
+as [D, 1] in nontransposed form and as [1, D] in transposed form.
+
+Examples without transpose:
+- X: [K], Y: [K] => Out: [1]
+- X: [K], Y: [K, N] => Out: [N]
+- X: [B, M, K], Y: [K] => Out: [B, M]
+- X: [M, K], Y: [B, K, N] => Out: [B, M, N]
+- X: [B, M, K], Y: [B, K, N] => Out: [B, M, N]
+
+The behavior is designed to be similar to the `numpy.matmul` function.
+The differences are:
+- Currently only rank 1 to rank 3 input tensors are supported.
+- We add `transpose_X` and `transpose_Y` flags.
+
+Both the input `X` and `Y` can carry the LoD (Level of Details) information,
+or not. But the output only shares the LoD with input `X`.
+)DOC");
+  }
+};
+
+class MatMulOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+  void InferShape(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("X"), "Input(X) should not be null");
+    PADDLE_ENFORCE(context->HasInput("Y"), "Input(Y) should not be null");
+    PADDLE_ENFORCE(context->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should not be null");
+    auto x_dims = context->GetInputDim("X");
+    auto y_dims = context->GetInputDim("Y");
+    // Each gradient output, if present, gets the shape of its forward input.
+    auto x_grad_name = framework::GradVarName("X");
+    auto y_grad_name = framework::GradVarName("Y");
+
+    if (context->HasOutput(x_grad_name)) {
+      context->SetOutputDim(x_grad_name, x_dims);
+    }
+    if (context->HasOutput(y_grad_name)) {
+      context->SetOutputDim(y_grad_name, y_dims);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Register the matmul op with its gradient op, plus the float CPU kernels.
+namespace ops = paddle::operators;
+REGISTER_OP(matmul, ops::MatMulOp, ops::MatMulOpMaker, matmul_grad,
+            ops::MatMulOpGrad);
+REGISTER_OP_CPU_KERNEL(matmul,
+                       ops::MatMulKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    matmul_grad, ops::MatMulGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/matmul_op.cu b/paddle/operators/matmul_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b7e66382f00445b087e14103e7a148d450b37405
--- /dev/null
+++ b/paddle/operators/matmul_op.cu
@@ -0,0 +1,21 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/matmul_op.h"
+
+namespace ops = paddle::operators;  // alias for the registrations below
+REGISTER_OP_GPU_KERNEL(matmul,
+                       ops::MatMulKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    matmul_grad, ops::MatMulGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/matmul_op.h b/paddle/operators/matmul_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..5ce30740c90b5cd0bd4f8ab183cf985ed5d827c1
--- /dev/null
+++ b/paddle/operators/matmul_op.h
@@ -0,0 +1,228 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   You may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/matmul.h"
+#include "paddle/operators/transpose_op.h"
+
+namespace paddle {
+namespace operators {
+namespace matmul_detail {
+
+using Tensor = framework::Tensor;
+using DDim = framework::DDim;
+using framework::make_ddim;
+using framework::vectorize;
+
+// Forward kernel: multiplies X by Y (each optionally transposed) into Out.
+template <typename Place, typename T>
+class MatMulKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* lhs = context.Input<Tensor>("X");
+    const Tensor* rhs = context.Input<Tensor>("Y");
+    Tensor* result = context.Output<Tensor>("Out");
+    result->mutable_data<T>(context.GetPlace());
+    const bool trans_lhs = context.Attr<bool>("transpose_X");
+    const bool trans_rhs = context.Attr<bool>("transpose_Y");
+    math::MatMulFunctor<Place, T>()(context.device_context(), *lhs, trans_lhs,
+                                    *rhs, trans_rhs, T(1), result, T(0));
+  }
+};
+
+template <typename T>
+inline Tensor Reshape(const Tensor& input, const DDim& dims) {
+  Tensor view;  // aliases input's data; only the dims are replaced
+  view.ShareDataWith(input);
+  view.Resize(dims);
+  return view;
+}
+
+// Views a rank-3 tensor of shape P x M x N as a (P * M) x N matrix. The
+// result shares the input's data, so nothing is copied. A tensor of any
+// other rank is returned as a plain alias of the input.
+template <typename T>
+Tensor CombineBatchAndM(const Tensor& input) {
+  auto shape = input.dims();
+  Tensor flattened;
+  flattened.ShareDataWith(input);
+  if (shape.size() != 3) return flattened;
+  std::vector<int64_t> merged = {shape[0] * shape[1], shape[2]};
+  flattened.Resize(make_ddim(merged));
+  return flattened;
+}
+
+// Rearranges a rank-3 tensor of shape P x M x N into an M x (P * N) matrix.
+// This transposes into new memory; other ranks just alias the input.
+template <typename Place, typename T>
+Tensor CombineBatchAndN(const framework::ExecutionContext& context,
+                        const Tensor& input) {
+  Tensor result;
+  auto shape = input.dims();
+  if (shape.size() != 3) {
+    result.ShareDataWith(input);
+    return result;
+  }
+  result.Resize(shape);
+  result.mutable_data<T>(context.GetPlace());
+  // Swap the first two axes, then fold the old batch axis into the columns.
+  EigenTranspose<Place, T, 3>(context, input, result, {1, 0, 2});
+  std::vector<int64_t> merged = {shape[1], shape[0] * shape[2]};
+  result.Resize(make_ddim(merged));
+  return result;
+}
+
+// Using dimensional constraints on matrix multiplication, it is
+// straight-forward to check the following table for when X and Y
+// are both matrices.
+//
+// transpose_X | False    | True     | False    | True
+// transpose_Y | False    | False    | True     | True
+// ------------+----------+----------+----------+-----------
+//        dX = | dOut Y^T | Y dOut^T | dOut Y   | Y^T dOut^T
+//        dY = | X^T dOut | X dOut   | dOut^T X | dOut^T X^T
+//
+// When X is a vector of size K, we treat it instead as a matrix of shape
+// (1, K). Similarly, when Y is a vector of size K, we treat it instead as
+// a matrix of shape (K, 1).
+//
+// When X and Y are both 3-dimensional tensors, the first dimension (the
+// batch dimension) can be ignored and the exact same formulas apply as
+// for two matrices.
+//
+// Finally, when, e.g., X is a 3-dimensional tensor but Y is a matrix, we end
+// up with formulas like
+//
+//   dY_{ij} = \sum_{p, m} X_{pmi} dOut_{pmj}
+//
+// To handle this sort of scenario, we reshape X : P x M x K, dOut: P x M x N
+// to X: (P * M) x K, dOut: (P * M) x N.
+template <typename Place, typename T>
+class MatMulGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor& x = *context.Input<Tensor>("X");
+    const Tensor& y = *context.Input<Tensor>("Y");
+    const Tensor& dout = *context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* dx = context.Output<Tensor>(framework::GradVarName("X"));
+    Tensor* dy = context.Output<Tensor>(framework::GradVarName("Y"));
+    bool transpose_x = context.Attr<bool>("transpose_X");
+    bool transpose_y = context.Attr<bool>("transpose_Y");
+
+    std::vector<int64_t> x_dims = vectorize(x.dims());
+    std::vector<int64_t> y_dims = vectorize(y.dims());
+
+    // If X is a vector, reshape it to a matrix.
+    if (x_dims.size() == 1) {
+      x_dims.insert(x_dims.begin(), 1);
+    }
+
+    // If Y is a vector, reshape it to a matrix.
+    if (y_dims.size() == 1) {
+      y_dims.push_back(1);
+    }
+
+    // Fix the dOut dimensions.
+    int M = 0, N = 0, batchCountX = 0, batchCountY = 0;
+
+    switch (x_dims.size()) {
+      case 2:
+        M = transpose_x ? x_dims[1] : x_dims[0];
+        break;
+      case 3:
+        batchCountX = x_dims[0];
+        M = transpose_x ? x_dims[2] : x_dims[1];
+        break;
+      default:  // assert() is a no-op under NDEBUG; enforce instead
+        PADDLE_ENFORCE(false, "Input(X) must be of rank 1, 2 or 3.");
+    }
+
+    switch (y_dims.size()) {
+      case 2:
+        N = transpose_y ? y_dims[0] : y_dims[1];
+        break;
+      case 3:
+        batchCountY = y_dims[0];
+        N = transpose_y ? y_dims[1] : y_dims[2];
+        break;
+      default:
+        PADDLE_ENFORCE(false, "Input(Y) must be of rank 1, 2 or 3.");
+    }
+    if (batchCountX && batchCountY) {
+      PADDLE_ENFORCE_EQ(
+          batchCountX, batchCountY,
+          "When Input(X) and Input(Y) are both three dimensional, they "
+          "must have the same batch dimension.");
+    }
+    int batchCount = std::max(batchCountX, batchCountY);
+    std::vector<int64_t> dout_dims = {M, N};
+    if (batchCount) {
+      dout_dims.insert(dout_dims.begin(), batchCount);
+    }
+    Tensor X = Reshape<T>(x, make_ddim(x_dims));
+    Tensor Y = Reshape<T>(y, make_ddim(y_dims));
+    Tensor dOut = Reshape<T>(dout, make_ddim(dout_dims));
+    // dX per the table above; a 2-D X with a 3-D Y folds the batch into N.
+    if (dx) {
+      dx->mutable_data<T>(context.GetPlace());
+      const Tensor& dOut_for_dX =
+          (x_dims.size() == 2 && y_dims.size() == 3)
+              ? CombineBatchAndN<Place, T>(context, dOut)
+              : dOut;
+      if (x_dims.size() == 2 && y_dims.size() == 3) {
+        Y = transpose_y ? CombineBatchAndM<T>(Y)
+                        : CombineBatchAndN<Place, T>(context, Y);
+      }
+      if (transpose_x) {
+        math::MatMulFunctor<Place, T>()(context.device_context(), Y,
+                                        transpose_y, dOut_for_dX, transpose_x,
+                                        T(1), dx, T(0));
+      } else {
+        math::MatMulFunctor<Place, T>()(context.device_context(), dOut_for_dX,
+                                        transpose_x, Y, !transpose_y, T(1), dx,
+                                        T(0));
+      }
+    }
+    // dY per the table above; a 3-D X with a 2-D Y folds the batch into M.
+    if (dy) {
+      dy->mutable_data<T>(context.GetPlace());
+      const Tensor& dOut_for_dY = (y_dims.size() == 2 && x_dims.size() == 3)
+                                      ? CombineBatchAndM<T>(dOut)
+                                      : dOut;
+      if (y_dims.size() == 2 && x_dims.size() == 3) {
+        X = transpose_x ? CombineBatchAndN<Place, T>(context, X)
+                        : CombineBatchAndM<T>(X);
+        // dOut_for_dY already holds the folded copy, so dOut stays untouched.
+      }
+      if (transpose_y) {
+        math::MatMulFunctor<Place, T>()(context.device_context(), dOut_for_dY,
+                                        transpose_y, X, transpose_x, T(1), dy,
+                                        T(0));
+      } else {
+        math::MatMulFunctor<Place, T>()(context.device_context(), X,
+                                        !transpose_x, dOut_for_dY, transpose_y,
+                                        T(1), dy, T(0));
+      }
+    }
+  }
+};
+}  // namespace matmul_detail
+
+using matmul_detail::MatMulKernel;
+using matmul_detail::MatMulGradKernel;
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
index d799239d4ed6d230578c77921a1a454b476b63fa..9556fdf73151eeb947b4f1aee63e131ac6aa76e6 100644
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -21,8 +21,7 @@ class MeanOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of MeanOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -36,7 +35,7 @@ class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
   MeanOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The input of mean op");
-    AddOutput("Out", "The output of mean op").NotInGradient();
+    AddOutput("Out", "The output of mean op");
     AddComment(R"DOC( Mean Operator
 )DOC");
   }
@@ -46,17 +45,32 @@ class MeanGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
   }
 };
 
+class MeanGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  // Builds the mean_grad desc; unique_ptr owns it even if a setter throws.
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("mean_grad");
+    grad_op->SetInput("X", Input("X"));
+    grad_op->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    grad_op->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    return grad_op;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker, mean_grad, ops::MeanGradOp);
+REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanGradMaker);
+REGISTER_OPERATOR(mean_grad, ops::MeanGradOp);
 REGISTER_OP_CPU_KERNEL(mean,
                        ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(mean_grad,
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
index ce31e178d8e375dc59be80a6c05133201308da70..c99286a5b928f1edcd845b01b21b95654c25db07 100644
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class MeanKernel : public framework::OpKernel {
+class MeanKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* input = context.Input<Tensor>("X");
@@ -45,7 +45,7 @@ class MeanKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MeanGradKernel : public framework::OpKernel {
+class MeanGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/minus_op.cc b/paddle/operators/minus_op.cc
index ce049d4d7bd96a6758d71b381e6e6b4edbcc8b5c..f7943e99acc5975d077f2319b6f678cfc693c1f3 100644
--- a/paddle/operators/minus_op.cc
+++ b/paddle/operators/minus_op.cc
@@ -25,8 +25,7 @@ class MinusOp : public framework::OperatorWithKernel {
           const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of MinusOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"),
@@ -49,9 +48,9 @@ class MinusOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   MinusOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The left tensor of minus operator.").NotInGradient();
-    AddInput("Y", "The right tensor of minus operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of minus operator.").NotInGradient();
+    AddInput("X", "The left tensor of minus operator.");
+    AddInput("Y", "The right tensor of minus operator.");
+    AddOutput("Out", "The output tensor of minus operator.");
 
     AddComment(R"DOC(Minus Operator
 
@@ -64,26 +63,35 @@ or not. But the output only shares the LoD with input `X`.
 )DOC");
   }
 };
-template <typename AttrType>
-class MinusGradOp : public NetOp {
+
+class MinusGradMaker : public framework::GradOpDescMakerBase {
  public:
-  MinusGradOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    auto out_grad = Input(framework::GradVarName("Out"));
-    auto x_grad = Output(framework::GradVarName("X"));
-    auto y_grad = Output(framework::GradVarName("Y"));
-
-    // x_grad = out_grad
-    AppendOp(framework::OpRegistry::CreateOp("identity", {{"X", {out_grad}}},
-                                             {{"Y", {x_grad}}}, {}));
-
-    framework::AttributeMap scale_attr;
-    scale_attr["scale"] = static_cast<AttrType>(-1);
-    AppendOp(framework::OpRegistry::CreateOp("scale", {{"X", {out_grad}}},
-                                             {{"Out", {y_grad}}}, scale_attr));
-    CompleteAddOp(false);
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
+
+  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
+      const override {
+    // Emits up to two `scale` ops; unique_ptr ownership avoids leaks on throw.
+    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+    auto x_g = InputGrad("X");
+    if (!x_g.empty()) {
+      std::unique_ptr<framework::OpDescBind> x_op(new framework::OpDescBind());
+      x_op->SetType("scale");
+      x_op->SetInput("X", OutputGrad("Out"));
+      x_op->SetOutput("Out", x_g);
+      x_op->SetAttr("scale", 1.0f);  // X@GRAD = Out@GRAD
+      ops.push_back(std::move(x_op));
+    }
+
+    auto y_g = InputGrad("Y");
+    if (!y_g.empty()) {
+      std::unique_ptr<framework::OpDescBind> y_op(new framework::OpDescBind());
+      y_op->SetType("scale");
+      y_op->SetInput("X", OutputGrad("Out"));
+      y_op->SetOutput("Out", y_g);
+      y_op->SetAttr("scale", -1.0f);  // Y@GRAD = -Out@GRAD
+      ops.push_back(std::move(y_op));
+    }
+    return ops;
   }
 };
 
@@ -91,7 +99,6 @@ class MinusGradOp : public NetOp {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(minus, ops::MinusOp, ops::MinusOpMaker, minus_grad,
-            ops::MinusGradOp<float>);
+REGISTER_OPERATOR(minus, ops::MinusOp, ops::MinusOpMaker, ops::MinusGradMaker);
 REGISTER_OP_CPU_KERNEL(minus,
                        ops::MinusKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/minus_op.h b/paddle/operators/minus_op.h
index 6310a4fd5141516cff4fc7acbe1d17913a1b5506..bd9a2790aa2b208c2d3dfc792031283eb6c42397 100644
--- a/paddle/operators/minus_op.h
+++ b/paddle/operators/minus_op.h
@@ -20,7 +20,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class MinusKernel : public framework::OpKernel {
+class MinusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* left_tensor = context.Input<framework::Tensor>("X");
diff --git a/paddle/operators/modified_huber_loss_op.cc b/paddle/operators/modified_huber_loss_op.cc
index 84212a2b3be1ac3664ebd77c7a0ae4d86abad3a0..7b9e9528952d552a69ffe6a628672901c5c1a7fd 100644
--- a/paddle/operators/modified_huber_loss_op.cc
+++ b/paddle/operators/modified_huber_loss_op.cc
@@ -21,8 +21,7 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
 
@@ -73,8 +72,7 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
     PADDLE_ENFORCE(ctx->HasInput("IntermediateVal"),
diff --git a/paddle/operators/modified_huber_loss_op.cu b/paddle/operators/modified_huber_loss_op.cu
index bce760f95e72cfec05b07591e0fa1250168b112f..8854e166cd99ce914d7f9f9bcead3234b0649506 100644
--- a/paddle/operators/modified_huber_loss_op.cu
+++ b/paddle/operators/modified_huber_loss_op.cu
@@ -39,7 +39,7 @@ struct ModifiedHuberLossBackward {
 };
 
 template <typename T>
-class ModifiedHuberLossGradGPUKernel : public framework::OpKernel {
+class ModifiedHuberLossGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("Y");
diff --git a/paddle/operators/modified_huber_loss_op.h b/paddle/operators/modified_huber_loss_op.h
index cb51007749e3c59572d4852959f4119ac377decc..aba75efad9c19e3e113b4f09bc1fbd4732f4e187 100644
--- a/paddle/operators/modified_huber_loss_op.h
+++ b/paddle/operators/modified_huber_loss_op.h
@@ -47,7 +47,7 @@ struct ModifiedHuberLossForward {
 };
 
 template <typename Place, typename T>
-class ModifiedHuberLossKernel : public framework::OpKernel {
+class ModifiedHuberLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
@@ -73,7 +73,7 @@ class ModifiedHuberLossKernel : public framework::OpKernel {
 
 // CPU backward kernel
 template <typename T>
-class ModifiedHuberLossGradCPUKernel : public framework::OpKernel {
+class ModifiedHuberLossGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("Y");
diff --git a/paddle/operators/momentum_op.cc b/paddle/operators/momentum_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2d4d6f13720f0e6888edbddcb3243116506227ba
--- /dev/null
+++ b/paddle/operators/momentum_op.cc
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MomentumOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // InferShape checks the bindings and copies Param's dims to both outputs.
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(param) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(grad) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Velocity"),
+                   "Input(velocity) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of Momentum should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of Momentum should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("VelocityOut"),
+                   "Output(VelocityOut) of Momentum should not be null.");
+    // Grad and Velocity must match Param's dims; LearningRate has 1 element.
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "Param and Grad input of MomentumOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Velocity"),
+        "Param and Velocity of MomentumOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
+                      "Learning_rate should be a scalar");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("VelocityOut", param_dim);
+  }
+};
+
+class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MomentumOpMaker(framework::OpProto *proto,
+                  framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter that has to be updated");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter");
+    AddInput("Velocity",
+             "(Tensor, default Tensor<float>) "
+             "Input velocity (corresponding to the parameter) "
+             "that has to be updated");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "Input learning rate");
+    // Outputs: the updated parameter and the updated velocity.
+    AddOutput("ParamOut", "(Tensor) Output updated parameter");
+    AddOutput("VelocityOut", "(Tensor) Output updated velocity");
+    // Attributes: `mu` has no default here; `useNesterov` defaults to false.
+    AddAttr<float>("mu", "(float) Momentum coefficient");
+    AddAttr<bool>("useNesterov", "(bool) Use Nesterov Momentum")
+        .SetDefault(false);
+    AddComment(R"DOC(
+
+Momentum Algorithm with an optional Nesterov variant (attribute useNesterov).
+
+velocity = mu * velocity + gradient
+if (useNesterov):
+  param = param - (gradient + mu * velocity) * learning_rate
+else:
+  param = param - learning_rate * velocity
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;  // alias for the registrations below
+REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    momentum, ops::MomentumOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/momentum_op.cu b/paddle/operators/momentum_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..efc24e795e05951024009f0b3258769c352df344
--- /dev/null
+++ b/paddle/operators/momentum_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/momentum_op.h"
+
+namespace ops = paddle::operators;  // alias for the registration below
+REGISTER_OP_GPU_KERNEL(
+    momentum, ops::MomentumOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/momentum_op.h b/paddle/operators/momentum_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e6d6d1da3df9f7e43a93fcc2e12658a01a491f81
--- /dev/null
+++ b/paddle/operators/momentum_op.h
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Momentum optimizer step: velocity' = mu * velocity + grad, followed by
+// param' = param - lr * velocity' (plain) or, with useNesterov,
+// param' = param - lr * (grad + mu * velocity').
+template <typename Place, typename T>
+class MomentumOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto velocity = ctx.Input<framework::Tensor>("Velocity");
+    auto grad = ctx.Input<framework::Tensor>("Grad");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+    velocity_out->mutable_data<T>(ctx.GetPlace());
+
+    float mu = ctx.Attr<float>("mu");
+    bool use_nesterov = ctx.Attr<bool>("useNesterov");
+
+    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
+    auto p = framework::EigenVector<T>::Flatten(*param);
+    auto v = framework::EigenVector<T>::Flatten(*velocity);
+    auto g = framework::EigenVector<T>::Flatten(*grad);
+    auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    v_out.device(place) = v * mu + g;
+    if (use_nesterov) {
+      // The velocity term must be subtracted too; adding it steps uphill.
+      p_out.device(place) = p - (g + v_out * mu) * lr.broadcast(grad_dsize);
+    } else {
+      p_out.device(place) = p - lr.broadcast(grad_dsize) * v_out;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
index 9858c4d9c2195c7bd0e767aaa86a950e0a791443..b9b9cd7ca05b4373c27f672cc1ee20daab6827a8 100644
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -1,16 +1,16 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
 
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
 
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
 
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 
 #include "paddle/operators/mul_op.h"
 
@@ -23,8 +23,7 @@ class MulOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) of MulOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -35,12 +34,14 @@ class MulOp : public framework::OperatorWithKernel {
     int x_num_col_dims = ctx->Attrs().Get<int>("x_num_col_dims");
     int y_num_col_dims = ctx->Attrs().Get<int>("y_num_col_dims");
 
-    PADDLE_ENFORCE(x_dims.size() > x_num_col_dims,
-                   "The rank of input tensor X should be larger than "
-                   "`mul_op`'s `x_num_col_dims`.");
-    PADDLE_ENFORCE(y_dims.size() > y_num_col_dims,
-                   "The rank of input tensor Y should be larger than "
-                   "`mul_op`'s `y_num_col_dims`.");
+    PADDLE_ENFORCE_GT(
+        x_dims.size(), x_num_col_dims,
+        "The input tensor X's rank of MulOp should be larger than "
+        "x_num_col_dims.");
+    PADDLE_ENFORCE_GT(
+        y_dims.size(), y_num_col_dims,
+        "The input tensor Y's rank of MulOp should be larger than "
+        "y_num_col_dims.");
 
     auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
     auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
@@ -48,7 +49,19 @@ class MulOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(
         x_mat_dims[1], y_mat_dims[0],
         "First matrix's width must be equal with second matrix's height.");
-    ctx->SetOutputDim("Out", {x_mat_dims[0], y_mat_dims[1]});
+    std::vector<int64_t> output_dims;
+    output_dims.reserve(
+        static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
+
+    for (int i = 0; i < x_num_col_dims; ++i) {
+      output_dims.push_back(x_dims[i]);
+    }
+
+    for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
+      output_dims.push_back(y_dims[i]);
+    }
+
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
@@ -94,8 +107,7 @@ class MulOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
@@ -104,19 +116,10 @@ class MulOpGrad : public framework::OperatorWithKernel {
     auto y_dims = ctx->GetInputDim("Y");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
-    auto x_mat_dims =
-        framework::flatten_to_2d(x_dims, Attr<int>("x_num_col_dims"));
-    auto y_mat_dims =
-        framework::flatten_to_2d(y_dims, Attr<int>("y_num_col_dims"));
-
-    PADDLE_ENFORCE_EQ(
-        x_mat_dims[0], out_dims[0],
-        "The first dimension of Out@GRAD must equal to the first dimension of "
-        "the first operand.");
-    PADDLE_ENFORCE_EQ(
-        y_mat_dims[1], out_dims[1],
-        "The second dimension of Out@GRAD must equal to the second "
-        "dimension of the second operand.");
+    auto x_mat_dims = framework::flatten_to_2d(
+        x_dims, ctx->Attrs().Get<int>("x_num_col_dims"));
+    auto y_mat_dims = framework::flatten_to_2d(
+        y_dims, ctx->Attrs().Get<int>("y_num_col_dims"));
 
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index ac7136a76933d1f3ead86518c65d589747227631..bd1bdb4f81b88256822d663fe42ad314338c91ff 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class MulKernel : public framework::OpKernel {
+class MulKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* x = context.Input<Tensor>("X");
@@ -36,56 +36,69 @@ class MulKernel : public framework::OpKernel {
     Tensor* z = context.Output<Tensor>("Out");
     const Tensor x_matrix =
         x->dims().size() > 2
-            ? framework::ReshapeToMatrix<T>(
+            ? framework::ReshapeToMatrix(
                   *x, context.template Attr<int>("x_num_col_dims"))
             : *x;
     const Tensor y_matrix =
         y->dims().size() > 2
-            ? framework::ReshapeToMatrix<T>(
+            ? framework::ReshapeToMatrix(
                   *y, context.template Attr<int>("y_num_col_dims"))
             : *y;
 
     z->mutable_data<T>(context.GetPlace());
+    auto z_dim = z->dims();
+    if (z_dim.size() != 2) {
+      z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+    }
     math::matmul<Place, T>(context.device_context(), x_matrix, false, y_matrix,
                            false, 1, z, 0);
+    if (z_dim.size() != 2) {
+      z->Resize(z_dim);
+    }
   }
 };
 
 template <typename Place, typename T>
-class MulGradKernel : public framework::OpKernel {
+class MulGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     int x_num_col_dims = ctx.template Attr<int>("x_num_col_dims");
     int y_num_col_dims = ctx.template Attr<int>("y_num_col_dims");
     const Tensor* x = ctx.Input<Tensor>("X");
     const Tensor* y = ctx.Input<Tensor>("Y");
-    const Tensor x_matrix =
-        x->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*x, x_num_col_dims)
-                             : *x;
-    const Tensor y_matrix =
-        y->dims().size() > 2 ? framework::ReshapeToMatrix<T>(*y, y_num_col_dims)
-                             : *y;
+    const Tensor x_matrix = x->dims().size() > 2
+                                ? framework::ReshapeToMatrix(*x, x_num_col_dims)
+                                : *x;
+    const Tensor y_matrix = y->dims().size() > 2
+                                ? framework::ReshapeToMatrix(*y, y_num_col_dims)
+                                : *y;
     const Tensor* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
+    Tensor dout_mat;
+    dout_mat.ShareDataWith(*dout);
+    dout_mat.Resize({framework::flatten_to_2d(x->dims(), x_num_col_dims)[0],
+                     framework::flatten_to_2d(y->dims(), y_num_col_dims)[1]});
+
     Tensor* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     Tensor* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
     if (dx) {
       dx->mutable_data<T>(ctx.GetPlace());
-      Tensor dx_matrix = dx->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
-                                                     *dx, x_num_col_dims)
-                                               : *dx;
+      Tensor dx_matrix = dx->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dx, x_num_col_dims)
+                             : *dx;
+
       // dx = dout * y'. dx: M x K, dout : M x N, y : K x N
-      math::matmul<Place, T>(ctx.device_context(), *dout, false, y_matrix, true,
-                             1, &dx_matrix, 0);
+      math::matmul<Place, T>(ctx.device_context(), dout_mat, false, y_matrix,
+                             true, 1, &dx_matrix, 0);
     }
     if (dy) {
       dy->mutable_data<T>(ctx.GetPlace());
-      Tensor dy_matrix = dy->dims().size() > 2 ? framework::ReshapeToMatrix<T>(
-                                                     *dy, y_num_col_dims)
-                                               : *dy;
+      Tensor dy_matrix = dy->dims().size() > 2
+                             ? framework::ReshapeToMatrix(*dy, y_num_col_dims)
+                             : *dy;
       // dy = x' * dout. dy K x N, dout : M x N, x : M x K
-      math::matmul<Place, T>(ctx.device_context(), x_matrix, true, *dout, false,
-                             1, &dy_matrix, 0);
+      math::matmul<Place, T>(ctx.device_context(), x_matrix, true, dout_mat,
+                             false, 1, &dy_matrix, 0);
     }
   }
 };
diff --git a/paddle/operators/multiplex_op.cc b/paddle/operators/multiplex_op.cc
index 9896d269ccc86d8fdc3bf6375e44ef5bf3e6b9c7..4d86769026e4b3e3040bdcb3bc6dc2edea58b4b0 100644
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -23,8 +23,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Ids"), "Input(Ids) shouldn't be null.");
     PADDLE_ENFORCE(!ctx->Inputs("X").empty(),
                    "MultiInput(X) shouldn't be empty.");
@@ -50,6 +49,12 @@ class MultiplexOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", in_dim);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+  }
 };
 
 class MultiplexOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -84,8 +89,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(!ctx->Inputs("X").empty(), "Input(X) should not be null.");
     PADDLE_ENFORCE(!ctx->Outputs(framework::GradVarName("X")).empty(),
                    "Output(X@Grad) should not be null.");
@@ -99,14 +103,21 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type());
+  }
 };
 
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 
-REGISTER_OP(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker, multiplex_grad,
-            ops::MultiplexGradOp);
+REGISTER_OPERATOR(multiplex, ops::MultiplexOp, ops::MultiplexOpMaker,
+                  paddle::framework::DefaultGradOpDescMaker<false>);
+REGISTER_OPERATOR(multiplex_grad, ops::MultiplexGradOp);
 REGISTER_OP_CPU_KERNEL(
     multiplex, ops::MultiplexCPUKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(
diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu
index 505776612e7119e568493506b113661a839e5bd1..143a14fef5783f8ed085d4c4ce2afb3b190d0600 100644
--- a/paddle/operators/multiplex_op.cu
+++ b/paddle/operators/multiplex_op.cu
@@ -21,7 +21,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 
 template <typename Place, typename T>
-class MultiplexGPUKernel : public framework::OpKernel {
+class MultiplexGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<Tensor>("X");
@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace());
+    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
                       ctx.device_context())
@@ -51,7 +51,7 @@ class MultiplexGPUKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MultiplexGradGPUKernel : public framework::OpKernel {
+class MultiplexGradGPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<Tensor>(framework::GradVarName("Out"));
@@ -70,7 +70,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    index_t_cpu.CopyFrom<int32_t>(*ids, platform::CPUPlace());
+    index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context());
     auto* index = index_t_cpu.data<int32_t>();
 
     auto stream = reinterpret_cast<const platform::CUDADeviceContext&>(
diff --git a/paddle/operators/multiplex_op.h b/paddle/operators/multiplex_op.h
index 637c63a34af394f5f54997c46c00a9ff00577476..ab3cafaa324a29d6f249cf1f73db92e1364eebc8 100644
--- a/paddle/operators/multiplex_op.h
+++ b/paddle/operators/multiplex_op.h
@@ -23,7 +23,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class MultiplexCPUKernel : public framework::OpKernel {
+class MultiplexCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto ins = ctx.MultiInput<framework::Tensor>("X");
@@ -48,7 +48,7 @@ class MultiplexCPUKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class MultiplexGradCPUKernel : public framework::OpKernel {
+class MultiplexGradCPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
diff --git a/paddle/operators/name_convention.md b/paddle/operators/name_convention.md
index 379385dc5d914101c7b5c9494f9383b6cf6a9b79..5a216907950100070ba57176c382eb659effb293 100644
--- a/paddle/operators/name_convention.md
+++ b/paddle/operators/name_convention.md
@@ -11,7 +11,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
   - If an operator's Input/Output are tensors in math, not match to any meaningful words, input name should starts from `X`. e.g. `X`, `Y`, and output name should starts from `Out`. e.g. `Out`. This rule intends making operators which have few inputs/outputs unified.
 
 - Attribute.
-  - Attribute name follows the **camelCase**. e.g. `x`, `y`, `axis`, `rowwiseMatrix`. Also, attribute name prefers to meaningful English words.
+  - Attribute names follow **snake_case**, e.g. `x`, `y`, `axis`, `rowwise_matrix`. Attribute names should also be meaningful English words.
 
 - Comments.
   - Input/Output/Attr comment follow the format of **(type,default value) usage**, corresponding to which type it can be and how it will be used in the operator. e.g.  Attribute in Accumulator`"gamma" `,`(float, default 1.0) Accumulation multiplier`.
diff --git a/paddle/operators/net_op.h b/paddle/operators/net_op.h
index 2388b094d228562a4c9bfd1ad6840ef1c2068533..ebeb262d9621fa35c870b6407992f6b6d2bf7c70 100644
--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <set>
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/op_registry.h"
 
diff --git a/paddle/operators/pad_op.cc b/paddle/operators/pad_op.cc
index 04ebb14f6ee6c73f48aa2f75811a22f9b8a25006..73a0b8baff530840ddd0d4c65cd4c060ab18e401 100644
--- a/paddle/operators/pad_op.cc
+++ b/paddle/operators/pad_op.cc
@@ -23,8 +23,7 @@ class PadOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of PadOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of PadOp should not be null.");
@@ -56,8 +55,7 @@ class PadOpMaker : public framework::OpProtoAndCheckerMaker {
              "The input should be a k-D tensor(k > 0 and k < 7)");
     AddOutput("Out",
               "The output of pad op."
-              "A tensor with the same shape as X.")
-        .NotInGradient();
+              "A tensor with the same shape as X.");
     AddComment(R"DOC(
 Pad input into output, as specified by paddings and pad_value. The input should be a k-D tensor(k > 0 and k < 7). As an example:
 
@@ -98,8 +96,7 @@ class PadOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
@@ -111,11 +108,29 @@ class PadOpGrad : public framework::OperatorWithKernel {
   }
 };
 
+class PadOpGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  // Builds the pad_grad desc (X, Out@GRAD -> X@GRAD) with pad's attributes.
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> bind(new framework::OpDescBind());
+    bind->SetInput("X", Input("X"));
+    bind->SetInput(framework::GradVarName("Out"), OutputGrad("Out"));
+    bind->SetOutput(framework::GradVarName("X"), InputGrad("X"));
+    bind->SetAttrMap(Attrs());
+    bind->SetType("pad_grad");
+    return bind;
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(pad, ops::PadOp, ops::PadOpMaker, pad_grad, ops::PadOpGrad);
+
+REGISTER_OPERATOR(pad, ops::PadOp, ops::PadOpMaker, ops::PadOpGradMaker);
+REGISTER_OPERATOR(pad_grad, ops::PadOpGrad);
 REGISTER_OP_CPU_KERNEL(pad, ops::PadKernel<paddle::platform::CPUPlace, float>);
 REGISTER_OP_CPU_KERNEL(pad_grad,
                        ops::PadGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/pad_op.h b/paddle/operators/pad_op.h
index 2cc3b945ae5b2e2e93d8531c7f99e4c215d1d806..9534dbf54529e3b9ae2b6640d51fe291e9521927 100644
--- a/paddle/operators/pad_op.h
+++ b/paddle/operators/pad_op.h
@@ -47,7 +47,7 @@ void PadFunction(const framework::ExecutionContext& context) {
 }
 
 template <typename Place, typename T>
-class PadKernel : public framework::OpKernel {
+class PadKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
@@ -97,7 +97,7 @@ void PadGradFunction(const framework::ExecutionContext& context) {
 }
 
 template <typename Place, typename T>
-class PadGradKernel : public framework::OpKernel {
+class PadGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     size_t rank =
diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a326839c0f9ad14b8fd2aac596f21c7dd2539cd7
--- /dev/null
+++ b/paddle/operators/pool_op.cc
@@ -0,0 +1,219 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace paddle {
+namespace operators {
+
+int OutputSizePool(int input_size, int filter_size, int padding, int stride) {
+  // Window positions that fit along one padded axis (integer division).
+  return (input_size + 2 * padding - filter_size) / stride + 1;
+}
+
+void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
+  // Validates X, Out and the pooling attributes, then sets the dims of Out.
+  PADDLE_ENFORCE(ctx->HasInput("X"), "X(Input) of Pooling should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                 "Out(Output) of Pooling should not be null.");
+
+  auto in_x_dims = ctx->GetInputDim("X");
+  std::string pooling_type = ctx->Attrs().Get<std::string>("pooling_type");
+  std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+  std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+  std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
+  PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                 "Pooling input should be 4-D or 5-D tensor.");
+  // Global pooling: the window covers every dim of X after the first two.
+  if (ctx->Attrs().Get<bool>("global_pooling")) {
+    ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+    for (size_t i = 0; i < ksize.size(); ++i)
+      ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+  }
+
+  PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                 "Input size and pooling size should be consistent.");
+  PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                    "Strides size and pooling size should be the same.");
+  PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                    "Paddings size and pooling size should be the same.");
+  // Out keeps dims 0 and 1 of X; every later dim is resized by its window.
+  std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+  for (size_t i = 0; i < ksize.size(); ++i) {
+    output_shape.push_back(
+        OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+  }
+  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+}
+
+void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
+  PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                 "Output(X@GRAD) should not be null.");
+  ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+}
+
+// Describes pool2d to the framework: input X, output Out, pooling attributes.
+Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "X",
+      "(Tensor) The input tensor of pooling operator. "
+      "The format of input tensor is NCHW. Where N is batch size, C is the "
+      "number of channels, H and W is the height and width of feature.");
+  AddOutput("Out",
+            "(Tensor) The output tensor of pooling operator. "
+            "The format of output tensor is also NCHW. "
+            "Where N is batch size, C is "
+            "the number of channels, H and W is the height and "
+            "width of feature.");
+
+  AddAttr<std::string>("pooling_type",
+                       "Pooling_type of pooling operator. "
+                       "Str constant equal to 'max' or 'avg'.")
+      .InEnum({"max", "avg"});
+
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "The pooling window size(height, width) of pooling operator. "
+      "If global_pooling = true, ksize is ignored and need not be "
+      "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                      // TypedAttrChecker don't support vector type.)
+  AddAttr<bool>(
+      "global_pooling",
+      "Whether to use the global_pooling. "
+      "Bool constant equal to false or true. "
+      "Default false. "
+      "If global_pooling = true, ksize is ignored and need not be specified.")
+      .SetDefault(false);
+  AddAttr<std::vector<int>>("strides",
+                            "The strides(height, width) of pooling window. "
+                            "Default {1,1}.")
+      .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                            // TypedAttrChecker don't support vector type.)
+  AddAttr<std::vector<int>>("paddings",
+                            "The zero padding(height, width) size on both "
+                            "sides. Default {0,0}.")
+      .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                            // TypedAttrChecker don't support vector type.)
+
+  AddComment(R"DOC(
+The pooling2d operation calculates the output based on
+the input, pooling_type and ksize, strides, paddings parameters.
+Input(X) and output(Out) are in NCHW format. Where N is batch size, C is the
+number of channels, H and W is the height and width of feature.
+Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, H_in, W_in)
+  Output:
+       Out shape: (N, C, H_out, W_out)
+  where
+       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+)DOC");
+}
+
+// Describes pool3d to the framework: input X, output Out, pooling attributes.
+Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
+                             framework::OpAttrChecker *op_checker)
+    : OpProtoAndCheckerMaker(proto, op_checker) {
+  AddInput(
+      "X",
+      "(Tensor) The input tensor of pooling operator. "
+      "The format of input tensor is NCDHW. Where N is batch size, C is "
+      "the number of channels, D, H and W is the depth, height and width of "
+      "feature.");
+  AddOutput("Out",
+            "(Tensor) The output tensor of pooling operator. "
+            "The format of output tensor is also NCDHW. "
+            "Where N is batch size, C is "
+            "the number of channels, D, H and W is the depth, height and "
+            "width of feature.");
+
+  AddAttr<std::string>("pooling_type",
+                       "PoolingType of pooling operator. "
+                       "Str constant equal to 'max' or 'avg'.")
+      .InEnum({"max", "avg"});
+
+  AddAttr<std::vector<int>>(
+      "ksize",
+      "The pooling window size(depth, height, width) of pooling operator. "
+      "If global_pooling = true, ksize is ignored and need not be "
+      "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                      // TypedAttrChecker don't support vector type.)
+  AddAttr<bool>(
+      "global_pooling",
+      "Whether to use the global_pooling. "
+      "Bool constant equal to false or true. "
+      "Default false. "
+      "If global_pooling = true, ksize is ignored and need not be specified.")
+      .SetDefault(false);
+  AddAttr<std::vector<int>>("strides",
+                            "Strides(depth, height, width) of pooling "
+                            "operator. Default {1,1,1}.")
+      .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                               // TypedAttrChecker don't support vector type.)
+  AddAttr<std::vector<int>>(
+      "paddings",
+      "Paddings(depth, height, width) of pooling operator. "
+      "Default {0,0,0}.")
+      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                               // TypedAttrChecker don't support vector type.)
+
+  AddComment(R"DOC(
+The pooling3d operation calculates the output based on
+the input, pooling_type and ksize, strides, paddings parameters.
+Input(X) and output(Out) are in NCDHW format. Where N is batch
+size, C is the number of channels, D, H and W is the depth, height and
+width of feature. Parameters(ksize, strides, paddings) are three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, D_in, H_in, W_in)
+  Output:
+       Out shape: (N, C, D_out, H_out, W_out)
+  where
+       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+)DOC");
+}
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(pool2d, ops::PoolOp, ops::Pool2dOpMaker, pool2d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
+// pool3d reuses PoolOp, PoolOpGrad and the kernel templates of pool2d.
+REGISTER_OP(pool3d, ops::PoolOp, ops::Pool3dOpMaker, pool3d_grad,
+            ops::PoolOpGrad);
+
+REGISTER_OP_CPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/pool_op.cu b/paddle/operators/pool_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0e3b80868f7b9d1697d619889160856d65ad59a3
--- /dev/null
+++ b/paddle/operators/pool_op.cu
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_op.h"
+
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(pool2d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool2d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
+// pool3d registers the same kernel templates as pool2d.
+REGISTER_OP_GPU_KERNEL(pool3d,
+                       ops::PoolKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(pool3d_grad,
+                       ops::PoolGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/pool_op.h b/paddle/operators/pool_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ada956501918cc92a2d30ebb8d0c42453acd2839
--- /dev/null
+++ b/paddle/operators/pool_op.h
@@ -0,0 +1,173 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+class PoolOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks X/Out and the pooling attributes and sets Out's dims (pool_op.cc).
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class PoolOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Gives X@GRAD the same dims as X; defined in pool_op.cc.
+  void InferShape(framework::InferShapeContext* ctx) const override;
+};
+
+class Pool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool2dOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+class Pool3dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Pool3dOpMaker(framework::OpProto* proto,
+                framework::OpAttrChecker* op_checker);
+};
+
+// Forward kernel shared by pool2d and pool3d.
+// Reads X and the pooling attributes, picks the 2-D or 3-D functor from the
+// length of ksize, and runs it with a max or avg pooling process. Nothing is
+// computed when ksize has another length or pooling_type is neither value.
+template <typename Place, typename T>
+class PoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("global_pooling")) {
+      // Mirror PoolOp::InferShape: size ksize from the rank of X before
+      // filling it, so a mis-sized ksize attribute cannot select the wrong
+      // functor or index past the dims of X.
+      ksize.resize(static_cast<size_t>(in_x->dims().size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i) {
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+      }
+    }
+
+    // ksize has one entry per pooled dim: 2 -> pool2d, 3 -> pool3d.
+    switch (ksize.size()) {
+      case 2: {
+        if (pooling_type == "max") {
+          math::Pool2dFunctor<Place, math::MaxPool<T>, T> pool2d_forward;
+          math::MaxPool<T> pool_process;
+          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        } else if (pooling_type == "avg") {
+          math::Pool2dFunctor<Place, math::AvgPool<T>, T> pool2d_forward;
+          math::AvgPool<T> pool_process;
+          pool2d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        }
+      } break;
+      case 3: {
+        if (pooling_type == "max") {
+          math::Pool3dFunctor<Place, math::MaxPool<T>, T> pool3d_forward;
+          math::MaxPool<T> pool_process;
+          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        } else if (pooling_type == "avg") {
+          math::Pool3dFunctor<Place, math::AvgPool<T>, T> pool3d_forward;
+          math::AvgPool<T> pool_process;
+          pool3d_forward(context.device_context(), *in_x, *out, ksize, strides,
+                         paddings, pool_process);
+        }
+      } break;
+    }
+  }
+};
+
+// Backward kernel shared by pool2d and pool3d: when X@GRAD is present, it is
+// zero-filled and passed to the max/avg gradient functor picked by ksize.
+template <typename Place, typename T>
+class PoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    const Tensor* out = context.Input<Tensor>("Out");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+
+    std::string pooling_type = context.Attr<std::string>("pooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+
+    if (context.Attr<bool>("global_pooling")) {
+      // Size ksize from the rank of X first, as PoolOp::InferShape does.
+      ksize.resize(static_cast<size_t>(in_x->dims().size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
+      temp.device(context.GetEigenDevice<Place>()) =
+          temp.constant(static_cast<T>(0));
+
+      switch (ksize.size()) {
+        case 2: {
+          if (pooling_type == "max") {
+            math::MaxPool2dGradFunctor<Place, T> pool2d_backward;
+            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings);
+          } else if (pooling_type == "avg") {
+            math::Pool2dGradFunctor<Place, math::AvgPoolGrad<T>, T>
+                pool2d_backward;
+            math::AvgPoolGrad<T> pool_process;
+            pool2d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings, pool_process);
+          }
+        } break;
+        case 3: {
+          if (pooling_type == "max") {
+            math::MaxPool3dGradFunctor<Place, T> pool3d_backward;
+            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings);
+          } else if (pooling_type == "avg") {
+            math::Pool3dGradFunctor<Place, math::AvgPoolGrad<T>, T>
+                pool3d_backward;
+            math::AvgPoolGrad<T> pool_process;
+            pool3d_backward(context.device_context(), *in_x, *in_x_grad, *out,
+                            *out_grad, ksize, strides, paddings, pool_process);
+          }
+        } break;
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..29d0322a27b71fe8d335703e228969c084f5139f
--- /dev/null
+++ b/paddle/operators/pool_with_index_op.cc
@@ -0,0 +1,251 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_with_index_op.h"
+
+namespace paddle {
+namespace operators {
+
+inline int OutputSizeMaxPool(int input_size, int filter_size, int padding,
+                             int stride) {  // extent of one pooled dimension
+  PADDLE_ENFORCE_GT(stride, 0, "Pooling stride should be greater than 0.");
+  return (input_size - filter_size + 2 * padding) / stride + 1;
+}
+
+class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks X/Out/Mask and the attributes, then sets the Out and Mask dims.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "X(Input) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Out(Output) of Pooling should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Mask"),
+                   "Mask(Output) of Pooling should not be null.");
+
+    auto in_x_dims = ctx->GetInputDim("X");
+
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    // Only rank-4 and rank-5 inputs are accepted.
+    PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
+                   "Pooling input should be 4-D or 5-D tensor.");
+    // Global pooling: the window spans every dim of X after the first two.
+    if (ctx->Attrs().Get<bool>("global_pooling")) {
+      ksize.resize(static_cast<size_t>(in_x_dims.size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x_dims[i + 2]);
+    }
+
+    PADDLE_ENFORCE(in_x_dims.size() - ksize.size() == 2U,
+                   "Input size and pooling size should be consistent.");
+    PADDLE_ENFORCE_EQ(ksize.size(), strides.size(),
+                      "Strides size and pooling size should be the same.");
+    PADDLE_ENFORCE_EQ(ksize.size(), paddings.size(),
+                      "Paddings size and pooling size should be the same.");
+    // Out and Mask share one shape: X's dims 0-1 plus the pooled extents.
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(OutputSizeMaxPool(in_x_dims[i + 2], ksize[i],
+                                               paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+    ctx->SetOutputDim("Mask", framework::make_ddim(output_shape));
+  }
+};
+
+class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Requires Mask and X as inputs; X@GRAD is given the same dims as X.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Mask"), "Input(Mask) must not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxPool2dWithIndexOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of pooling operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of image.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCHW. "
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of image.");
+    AddOutput("Mask",
+              "(Tensor) The Mask tensor of pooling operator. "
+              "The format of output tensor is also NCHW. "
+              "Where N is batch size, C is the number of channels, H and W "
+              "is the height and width of image. "
+              "The value in it is the index in current feature map");
+    // Adjacent literals are concatenated, so each sentence ends in a space.
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "The pooling window size(height, width) of pooling operator. "
+        "If global_pooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                        // TypedAttrChecker don't support vector type.)
+    AddAttr<bool>(
+        "global_pooling",
+        "Whether to use the global_pooling. "
+        "Bool constant equal to false or true. "
+        "Default false. "
+        "If global_pooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>("strides",
+                              "The strides(height, width) of pooling window. "
+                              "Default {1,1}.")
+        .SetDefault({1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                              // TypedAttrChecker don't support vector type.)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "The zero padding(height, width) size on both sides. "
+        "Default {0,0}.")
+        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                              // TypedAttrChecker don't support vector type.)
+    // Operator-level description, including the output size formula.
+    AddComment(R"DOC(
+The maxPooling2d with index operation calculates the output and the mask
+based on the input and ksize, strides, paddings parameters. Input(X) and
+output(Out, Mask) are in NCHW format. Where N is batch size, C is the
+number of channels, H and W is the height and width of feature.
+Parameters(ksize, strides, paddings) are two elements.
+These two elements represent height and width, respectively.
+The input(X) size and output(Out, Mask) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, H_in, W_in)
+  Output:
+       Out shape: (N, C, H_out, W_out)
+       Mask shape: (N, C, H_out, W_out)
+  where
+       H_out = (H_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       W_out = (W_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+)DOC");
+  }
+};
+
+class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  MaxPool3dWithIndexOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of pooling operator. "
+        "The format of input tensor is NCDHW. Where N is batch size, C is "
+        "the number of channels, D, H and W is the depth, height and width of "
+        "image.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of pooling operator. "
+              "The format of output tensor is also NCDHW. "
+              "Where N is batch size, C is "
+              "the number of channels, D, H and W is the depth, height and "
+              "width of image.");
+    AddOutput("Mask",
+              "(Tensor) The Mask tensor of pooling operator. "
+              "The format of output tensor is also NCDHW. "
+              "Where N is batch size, C is the number of channels, D, H and W "
+              "is the depth, height and width of image. "
+              "The value in it is the index in current feature map");
+    // Adjacent literals are concatenated, so each sentence ends in a space.
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "The pooling window size(depth, height, width) of pooling operator. "
+        "If global_pooling = true, ksize is ignored and need not be "
+        "specified.");  // TODO(Chengduo): Add checker. (Currently,
+                        // TypedAttrChecker don't support vector type.)
+    AddAttr<bool>(
+        "global_pooling",
+        "Whether to use the global_pooling. "
+        "Bool constant equal to false or true. "
+        "Default false. "
+        "If global_pooling = true, ksize is ignored and need not be specified.")
+        .SetDefault(false);
+    AddAttr<std::vector<int>>(
+        "strides",
+        "Strides(depth, height, width) of pooling operator. "
+        "Default {1,1,1}.")
+        .SetDefault({1, 1, 1});  // TODO(Chengduo): Add checker. (Currently,
+                                 // TypedAttrChecker don't support vector type.)
+    AddAttr<std::vector<int>>(
+        "paddings",
+        "Paddings(depth, height, width) of pooling operator. "
+        "Default {0,0,0}.")
+        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
+                                 // TypedAttrChecker don't support vector type.)
+    // Operator-level description, including the output size formula.
+    AddComment(R"DOC(
+The maxpooling3d with index operation calculates the output and the mask
+based on the input and ksize, strides, paddings parameters.
+Input(X) and output(Out, Mask) are in NCDHW format. Where N is batch
+size, C is the number of channels, D, H and W is the depth, height and
+width of feature. Parameters(ksize, strides, paddings) are three elements.
+These three elements represent depth, height and width, respectively.
+The input(X) size and output(Out, Mask) size may be different.
+
+Example:
+  Input:
+       X shape: (N, C, D_in, H_in, W_in)
+  Output:
+       Out shape: (N, C, D_out, H_out, W_out)
+       Mask shape: (N, C, D_out, H_out, W_out)
+  where
+       D_out = (D_in - ksize[0] + 2 * paddings[0]) / strides[0] + 1;
+       H_out = (H_in - ksize[1] + 2 * paddings[1]) / strides[1] + 1;
+       W_out = (W_in - ksize[2] + 2 * paddings[2]) / strides[2] + 1;
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+// 2-D op: forward and grad operators, followed by their float CPU kernels.
+REGISTER_OP(max_pool2d_with_index, ops::MaxPoolWithIndexOp,
+            ops::MaxPool2dWithIndexOpMaker, max_pool2d_with_index_grad,
+            ops::MaxPoolWithIndexOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    max_pool2d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    max_pool2d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>);
+// 3-D op: same operator and kernel classes, with the 3-D proto maker.
+REGISTER_OP(max_pool3d_with_index, ops::MaxPoolWithIndexOp,
+            ops::MaxPool3dWithIndexOpMaker, max_pool3d_with_index_grad,
+            ops::MaxPoolWithIndexOpGrad);
+
+REGISTER_OP_CPU_KERNEL(
+    max_pool3d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    max_pool3d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/pool_with_index_op.cu b/paddle/operators/pool_with_index_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..287657d4b1c57f354ef050885f71261092bdc062
--- /dev/null
+++ b/paddle/operators/pool_with_index_op.cu
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/pool_with_index_op.h"
+
+namespace ops = paddle::operators;
+// Float GPU kernels for the 2-D op (forward and grad).
+REGISTER_OP_GPU_KERNEL(
+    max_pool2d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    max_pool2d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>);
+// Float GPU kernels for the 3-D op (forward and grad).
+REGISTER_OP_GPU_KERNEL(
+    max_pool3d_with_index,
+    ops::MaxPoolWithIndexKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    max_pool3d_with_index_grad,
+    ops::MaxPoolWithIndexGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/pool_with_index_op.h b/paddle/operators/pool_with_index_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..455c453efcd15bf0150bbd3de83d50729f338b4b
--- /dev/null
+++ b/paddle/operators/pool_with_index_op.h
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/pooling.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename Place, typename T>
+class MaxPoolWithIndexKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* in_x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    Tensor* mask = context.Output<Tensor>("Mask");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("global_pooling")) {
+      // As in InferShape: size ksize from X's rank, whatever length was given.
+      ksize.resize(static_cast<size_t>(in_x->dims().size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
+    }
+
+    switch (ksize.size()) {  // 2 -> 2-D functor, 3 -> 3-D functor
+      case 2: {
+        paddle::operators::math::MaxPool2dWithIndexFunctor<Place, T>
+            pool2d_forward;
+        pool2d_forward(context.device_context(), *in_x, *out, *mask, ksize,
+                       strides, paddings);
+      } break;
+      case 3: {
+        paddle::operators::math::MaxPool3dWithIndexFunctor<Place, T>
+            pool3d_forward;
+        pool3d_forward(context.device_context(), *in_x, *out, *mask, ksize,
+                       strides, paddings);
+      } break;
+    }
+  }
+};
+
+template <typename Place, typename T>
+class MaxPoolWithIndexGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* mask = context.Input<Tensor>("Mask");
+    const Tensor* out_grad =
+        context.Input<Tensor>(framework::GradVarName("Out"));
+    Tensor* in_x_grad = context.Output<Tensor>(framework::GradVarName("X"));
+    if (in_x_grad == nullptr) return;  // checked before any dereference below
+
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    if (context.Attr<bool>("global_pooling")) {
+      // As in InferShape: size ksize from the rank, whatever length was given.
+      ksize.resize(static_cast<size_t>(in_x_grad->dims().size()) - 2);
+      for (size_t i = 0; i < ksize.size(); ++i)
+        ksize[i] = static_cast<int>(in_x_grad->dims()[i + 2]);
+    }
+
+    in_x_grad->mutable_data<T>(context.GetPlace());
+    auto temp = framework::EigenVector<T>::Flatten(*in_x_grad);
+    temp.device(context.GetEigenDevice<Place>()) =
+        temp.constant(static_cast<T>(0));  // zero-fill X@GRAD first
+
+    switch (ksize.size()) {  // 2 -> 2-D functor, 3 -> 3-D functor
+      case 2: {
+        paddle::operators::math::MaxPool2dWithIndexGradFunctor<Place, T>
+            pool2d_backward;
+        pool2d_backward(context.device_context(), *in_x_grad, *out_grad, *mask,
+                        ksize, strides, paddings);
+      } break;
+      case 3: {
+        paddle::operators::math::MaxPool3dWithIndexGradFunctor<Place, T>
+            pool3d_backward;
+        pool3d_backward(context.device_context(), *in_x_grad, *out_grad, *mask,
+                        ksize, strides, paddings);
+      } break;
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/prelu_op.cc b/paddle/operators/prelu_op.cc
index 1692464f2833a59243ccc1598422180262a59282..eef2e34eaacf59b9adacb343e9a0091ebabeaea3 100644
--- a/paddle/operators/prelu_op.cc
+++ b/paddle/operators/prelu_op.cc
@@ -25,8 +25,7 @@ class PReluOp : public framework::OperatorWithKernel {
           const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
     PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
@@ -62,8 +61,7 @@ class PReluGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
diff --git a/paddle/operators/prelu_op.h b/paddle/operators/prelu_op.h
index 6b78ed295cbac060d816fb3dd27a4b80145cb1ce..5ad31c2203ae6c9bf6f48bb9ecf9a714597e7da8 100644
--- a/paddle/operators/prelu_op.h
+++ b/paddle/operators/prelu_op.h
@@ -40,7 +40,7 @@ class PReluFunctor {
 };
 
 template <typename Place, typename T>
-class PReluKernel : public framework::OpKernel {
+class PReluKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<Tensor>("X");
@@ -77,7 +77,7 @@ class PReluGradFunctor {
 };
 
 template <typename Place, typename T>
-class PReluGradKernel : public framework::OpKernel {
+class PReluGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
diff --git a/paddle/operators/proximal_gd_op.cc b/paddle/operators/proximal_gd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e4b014b9f5866ec0791cba9b3998b1734066eeeb
--- /dev/null
+++ b/paddle/operators/proximal_gd_op.cc
@@ -0,0 +1,93 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/proximal_gd_op.h"
+
+namespace paddle {
+namespace operators {
+
+class ProximalGDOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // NOTE(review): sibling ops make InferShape public; confirm protected here.
+ protected:
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of ProximalGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of ProximalGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of ProximalGDOp should not be null.");
+
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of ProximalGDOp should not be null.");
+    // Param and Grad must have identical dims; ParamOut reuses Param's dims.
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Grad"),
+                      "Two input of ProximalGD Op's dimension must be same.");
+    // LearningRate must contain exactly one element.
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+
+    ctx->SetOutputDim("ParamOut", param_dim);
+  }
+};
+
+class ProximalGDOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  ProximalGDOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated.");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1.");
+    // Single output: the updated parameter value.
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value.");
+    // Regularization strengths; both default to 0.
+    AddAttr<float>("l1",
+                   "(float, default 0.0) "
+                   "L1 regularization strength.")
+        .SetDefault(0.0f);
+    AddAttr<float>("l2",
+                   "(float, default 0.0) "
+                   "L2 regularization strength.")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+
+Optimizer that implements the proximal gradient descent algorithm.
+
+prox_param = param - learning_rate * grad
+param = sign(prox_param) / (1 + learning_rate * l2) *
+        max { |prox_param| - learning_rate * l1 , 0 }
+
+The paper that proposed Proximal Gradient Descent:
+(http://papers.nips.cc/paper/3793-efficient-learning-using-forward-backward-splitting.pdf)
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(proximal_gd, ops::ProximalGDOp,
+                             ops::ProximalGDOpMaker);  // no grad op registered
+REGISTER_OP_CPU_KERNEL(
+    proximal_gd, ops::ProximalGDOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/proximal_gd_op.cu b/paddle/operators/proximal_gd_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..26f4ebaa0f43620fee7ece2d71755be94a0e01a5
--- /dev/null
+++ b/paddle/operators/proximal_gd_op.cu
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+You may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed
+under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/proximal_gd_op.h"
+// Registers the float GPU kernel for the proximal_gd operator.
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    proximal_gd, ops::ProximalGDOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/proximal_gd_op.h b/paddle/operators/proximal_gd_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bebda0204173ec5c3ec9a7a9da6fb623171f4cea
--- /dev/null
+++ b/paddle/operators/proximal_gd_op.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class ProximalGDOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    // Prepare ParamOut's data buffer (element type T) for the current place.
+    param_out->mutable_data<T>(ctx.GetPlace());
+
+    auto grad = ctx.Input<Tensor>("Grad");
+
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+    // Flattened (1-D) Eigen views over Param, Grad and LearningRate.
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto place = ctx.GetEigenDevice<Place>();
+    // LearningRate is broadcast by grad's element count to line up with g.
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    // Gradient step first; the l1 > 0 branch then soft-thresholds by lr * l1.
+    auto prox_param = p - lr.broadcast(grad_dsize) * g;
+    if (l1 > 0) {
+      p_out.device(place) =
+          prox_param.sign() *
+          (((prox_param.abs() - (lr * l1).broadcast(grad_dsize))
+                .cwiseMax(T(0.0))) /
+           (1.0 + (lr * l2).broadcast(grad_dsize)));
+    } else {
+      p_out.device(place) =
+          prox_param / (1.0 + (lr * l2).broadcast(grad_dsize));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc
index 1ba22006f27abc963e7f161636a964863513a40c..17ef2b1d01bd37abf2ece97ed0a307c2f1bf7e6f 100644
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -24,8 +24,7 @@ class RankLossOp : public framework::OperatorWithKernel {
              const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     // input check
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null");
     PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null");
@@ -89,8 +88,7 @@ class RankLossGradOp : public framework::OperatorWithKernel {
                  const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("Left"), "Input(Left) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput("Right"), "Input(Right) shouldn't be null.");
diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h
index 7df195ff47ecfd79388385eed4bd37b8c9b45979..f184d6efcb496a1d7f38540712b6c431f816482e 100644
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class RankLossKernel : public framework::OpKernel {
+class RankLossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out_t = ctx.Output<framework::Tensor>("Out");
@@ -42,7 +42,7 @@ class RankLossKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class RankLossGradKernel : public framework::OpKernel {
+class RankLossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_left_t =
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
index 80de229c333f645fb3098b97fa076c6b77bb7ca9..40303e3adf4db7e8336ed72667fe69afa56c3f69 100644
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -30,36 +30,40 @@ using LoDTensor = framework::LoDTensor;
 
 void RecurrentAlgorithm::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     false /*infer_shape_mode*/);
-  InitMemories(step_scopes[0], false /*infer_shape_mode*/);
+  auto* input0 = scope.FindVar(arg_->inlinks[0]);
+  PADDLE_ENFORCE_NOT_NULL(input0);
+  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
+  PADDLE_ENFORCE_GT(seq_len, 0);
 
-  for (size_t step_id = 0; step_id < seq_len_; step_id++) {
-    // create output alias variables
+  CreateScopes(scope, seq_len);
+  auto& step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
+  InitMemories(step_scopes[0]);
+
+  for (size_t step_id = 0; step_id < seq_len; step_id++) {
     if (step_id > 0) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1,
-                        false /*infer_shape_mode*/);
+      rnn::LinkMemories(step_scopes, arg_->states, step_id, -1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx);
 }
 
-void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
+void RecurrentAlgorithm::CreateScopes(const Scope& scope,
+                                      size_t seq_len) const {
   // TODO(superjom) Only two scopes are needed for inference, this case will be
   // supported later.
-  auto step_scopes_var = scope.FindVar(arg_->step_scopes);
+  auto* step_scopes_var = scope.FindVar(arg_->step_scopes);
   PADDLE_ENFORCE(step_scopes_var != nullptr, "");
-  auto step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
+  auto* step_scopes = step_scopes_var->GetMutable<std::vector<Scope*>>();
 
   // Now all variables in scope must be created outside of op.
   PADDLE_ENFORCE_NOT_NULL(stepnet_);
-  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(), "stepnet_ op has no outputs");
+  PADDLE_ENFORCE(!(*stepnet_)->Outputs().empty(),
+                 "step_unit_ op has no outputs");
 
-  if (seq_len_ > step_scopes->size()) {
-    for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
+  if (seq_len > step_scopes->size()) {
+    for (size_t i = step_scopes->size(); i < seq_len; ++i) {
       auto& step_scope = scope.NewScope();
 
       // create step net's temp inputs
@@ -67,14 +71,14 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
         // the weight are located in parent scope
         for (auto& var_name : input.second) {
           if (!step_scope.FindVar(var_name)) {
-            step_scope.NewVar(var_name)->GetMutable<LoDTensor>();
+            step_scope.Var(var_name)->GetMutable<LoDTensor>();
           }
         }
       }
       // create stepnet's outputs
       for (const auto& output : (*stepnet_)->Outputs()) {
         for (auto& var_name : output.second) {
-          step_scope.NewVar(var_name);
+          step_scope.Var(var_name);
         }
       }
       step_scopes->emplace_back(&step_scope);
@@ -82,31 +86,27 @@ void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
   }
 }
 
-void RecurrentAlgorithm::InitMemories(Scope* step_scope,
-                                      bool infer_shape_mode) const {
-  for (auto& attr : arg_->memories) {
-    auto* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<LoDTensor>();
+void RecurrentAlgorithm::InitMemories(Scope* step_scope) const {
+  for (auto& attr : arg_->states) {
+    auto* pre_mem = step_scope->Var(attr.pre_var)->GetMutable<LoDTensor>();
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "memory [%s]'s boot variable [%s] not exists", attr.var,
                    attr.boot_var);
     auto* boot_mem =
         step_scope->FindVar(attr.boot_var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      pre_mem->Resize(boot_mem->dims());
-      PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
-    } else {
-      pre_mem->ShareDataWith<float>(*boot_mem);
-    }
+    pre_mem->Resize(boot_mem->dims());
+    PADDLE_ENFORCE_EQ(pre_mem->dims().size(), 2);
+    pre_mem->ShareDataWith(*boot_mem);
   }
 }
 
 const rnn::ArgumentName RecurrentOp::kArgName{
-    "step_net", "step_scopes",  "inlinks",      "outlinks",
-    "memories", "pre_memories", "boot_memories"};
+    "step_net", "step_scopes", "inputs",        "outputs",
+    "states",   "ex_states",   "initial_states"};
 
 const rnn::ArgumentName RecurrentGradientOp::kArgName{
-    "step_net", "step_scopes@GRAD", "outlinks@GRAD",     "inlinks@GRAD",
-    "memories", "pre_memories",     "boot_memories@GRAD"};
+    "step_net", "step_scopes@GRAD", "outputs@GRAD",       "inputs@GRAD",
+    "states",   "ex_states",        "initial_states@GRAD"};
 
 RecurrentOp::RecurrentOp(const std::string& type,
                          const framework::VariableNameMap& inputs,
@@ -128,7 +128,7 @@ class RecurrentAlgorithmProtoAndCheckerMaker
     AddInput(name.inlinks,
              "the inputs that need to be segmented for each step.")
         .AsDuplicable();
-    AddInput(name.boot_memories, "variables to initialize memories.")
+    AddInput(name.initial_states, "variables to initialize states.")
         .AsDuplicable();
 
     AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
@@ -136,9 +136,8 @@ class RecurrentAlgorithmProtoAndCheckerMaker
     AddOutput(name.step_scopes, "step scopes");
 
     // Attributes stored in AttributeMap
-    AddAttr<std::vector<std::string>>(name.pre_memories,
-                                      "names of pre-memories");
-    AddAttr<std::vector<std::string>>(name.memories, "names of memories");
+    AddAttr<std::vector<std::string>>(name.ex_states, "names of pre-states");
+    AddAttr<std::vector<std::string>>(name.states, "names of states");
 
     AddComment("This is a recurrent group operator.");
   }
@@ -146,36 +145,33 @@ class RecurrentAlgorithmProtoAndCheckerMaker
 
 void RecurrentGradientAlgorithm::Run(
     const Scope& scope, const platform::DeviceContext& dev_ctx) const {
-  auto step_scopes = GetStepScopes(scope);
-  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_,
-                     false /*infer_shape_mode*/);
-  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
-    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
-      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1,
-                        false /*infer_shape_mode*/);
+  auto* input0 = scope.FindVar(arg_->inlinks[0]);
+  PADDLE_ENFORCE_NOT_NULL(input0);
+  size_t seq_len = input0->GetMutable<LoDTensor>()->dims()[0];
+  auto& step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len);
+  for (int step_id = seq_len - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len - 1) {
+      rnn::LinkMemories(step_scopes, arg_->states, step_id, 1);
     }
     (*stepnet_)->Run(*step_scopes[step_id], dev_ctx);
   }
-  LinkBootMemoryGradients(step_scopes[0], false);
-  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_,
-                     false /*infer_shape_mode*/);
+  rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len, dev_ctx);
+  LinkBootMemoryGradients(step_scopes[0]);
 }
 
 void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
-    Scope* step_scope, bool infer_shape_mode) const {
-  for (auto& attr : arg_->memories) {
+    Scope* step_scope) const {
+  for (auto& attr : arg_->states) {
     PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
                    "memory variable [%s] does not exists", attr.var);
     PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
                    "boot variable [%s] does not exists", attr.boot_var);
-    auto* mem_grad = step_scope->NewVar(attr.var)->GetMutable<LoDTensor>();
+    auto* mem_grad = step_scope->Var(attr.var)->GetMutable<LoDTensor>();
     auto* boot_mem_grad =
-        step_scope->NewVar(attr.boot_var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      boot_mem_grad->Resize(mem_grad->dims());
-    } else {
-      boot_mem_grad->ShareDataWith<float>(*mem_grad);
-    }
+        step_scope->Var(attr.boot_var)->GetMutable<LoDTensor>();
+    boot_mem_grad->Resize(mem_grad->dims());
+    boot_mem_grad->ShareDataWith(*mem_grad);
   }
 }
 
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
index c6b9a5533eece9057449b5c875ddcb3cefe716f0..253d7e3284360ceaddce9ef5f8f9a3ea4793d740 100644
--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -48,7 +48,7 @@ class RecurrentAlgorithm {
    * NOTE the scopes are reused in both the forward and backward, so just
    * create once and expand its size if more steps need.
    */
-  void CreateScopes(const framework::Scope& scope) const;
+  void CreateScopes(const framework::Scope& scope, size_t seq_len) const;
 
   const std::vector<framework::Scope*>& GetStepScopes(
       const framework::Scope& scope) const {
@@ -56,12 +56,11 @@ class RecurrentAlgorithm {
                 ->GetMutable<std::vector<framework::Scope*>>();
   }
 
-  void InitMemories(framework::Scope* step_scopes, bool infer_shape_mode) const;
+  void InitMemories(framework::Scope* step_scopes) const;
 
  private:
   std::unique_ptr<framework::OperatorBase>* stepnet_;
   rnn::Argument* arg_;
-  mutable size_t seq_len_;
 };
 
 class RecurrentGradientAlgorithm {
@@ -86,8 +85,7 @@ class RecurrentGradientAlgorithm {
   void Run(const framework::Scope& scope,
            const platform::DeviceContext& dev_ctx) const;
 
-  void LinkBootMemoryGradients(framework::Scope* step_scopes,
-                               bool infer_shape_mode) const;
+  void LinkBootMemoryGradients(framework::Scope* step_scopes) const;
 
  protected:
   inline const std::vector<framework::Scope*>& GetStepScopes(
@@ -98,7 +96,6 @@ class RecurrentGradientAlgorithm {
 
  private:
   rnn::Argument* arg_;
-  mutable size_t seq_len_;
   std::unique_ptr<framework::OperatorBase>* stepnet_;
 };
 
@@ -123,6 +120,7 @@ class RecurrentOp : public framework::OperatorBase {
   void set_stepnet(std::unique_ptr<OperatorBase> net) {
     stepnet_ = std::move(net);
   }
+
   const OperatorBase& stepnet() const { return *stepnet_; }
 
   static const rnn::ArgumentName kArgName;
diff --git a/paddle/operators/reduce_op.cc b/paddle/operators/reduce_op.cc
index 3ef443d1c7f475cbd578078db02fb5e0d500d060..0599daa7688a5658ebea8902c4e15e63570539fb 100644
--- a/paddle/operators/reduce_op.cc
+++ b/paddle/operators/reduce_op.cc
@@ -13,6 +13,7 @@
    limitations under the License. */
 
 #include "paddle/operators/reduce_op.h"
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -23,8 +24,7 @@ class ReduceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of ReduceOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -57,8 +57,7 @@ class ReduceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null.");
@@ -168,36 +167,22 @@ namespace ops = paddle::operators;
 
 REGISTER_OP(reduce_sum, ops::ReduceOp, ops::ReduceSumOpMaker, reduce_sum_grad,
             ops::ReduceGradOp);
-REGISTER_OP_CPU_KERNEL(
-    reduce_sum,
-    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::SumFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_sum_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
-                                             ops::SumGradFunctor>);
 
 REGISTER_OP(reduce_mean, ops::ReduceOp, ops::ReduceMeanOpMaker,
             reduce_mean_grad, ops::ReduceGradOp);
-REGISTER_OP_CPU_KERNEL(
-    reduce_mean,
-    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::MeanFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_mean_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
-                                             ops::MeanGradFunctor>);
 
 REGISTER_OP(reduce_max, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_max_grad,
             ops::ReduceGradOp);
-REGISTER_OP_CPU_KERNEL(
-    reduce_max,
-    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::MaxFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_max_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
-                                             ops::MaxOrMinGradFunctor>);
-
-REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMaxOpMaker, reduce_min_grad,
+
+REGISTER_OP(reduce_min, ops::ReduceOp, ops::ReduceMinOpMaker, reduce_min_grad,
             ops::ReduceGradOp);
-REGISTER_OP_CPU_KERNEL(
-    reduce_min,
-    ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::MinFunctor>);
-REGISTER_OP_CPU_KERNEL(reduce_min_grad,
-                       ops::ReduceGradKernel<paddle::platform::CPUPlace, float,
-                                             ops::MaxOrMinGradFunctor>);
+
+#define REGISTER_REDUCE_CPU_KERNEL(reduce_type, functor, grad_functor)     \
+  REGISTER_OP_CPU_KERNEL(                                                  \
+      reduce_type,                                                         \
+      ops::ReduceKernel<paddle::platform::CPUPlace, float, ops::functor>); \
+  REGISTER_OP_CPU_KERNEL(reduce_type##_grad,                               \
+                         ops::ReduceGradKernel<paddle::platform::CPUPlace, \
+                                               float, ops::grad_functor>);
+// Expand the macro above once per (op, functor, grad functor) entry.
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_CPU_KERNEL);
diff --git a/paddle/operators/reduce_op.cu b/paddle/operators/reduce_op.cu
index 595127b858ea8eb41281f92e92c6467e4d90ff1a..d306e1a24096d737438d71d4d4abc35328d160cb 100644
--- a/paddle/operators/reduce_op.cu
+++ b/paddle/operators/reduce_op.cu
@@ -17,30 +17,12 @@
 
 namespace ops = paddle::operators;
 
-REGISTER_OP_GPU_KERNEL(
-    reduce_sum,
-    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::SumFunctor>);
-REGISTER_OP_GPU_KERNEL(reduce_sum_grad,
-                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
-                                             ops::SumGradFunctor>);
-
-REGISTER_OP_GPU_KERNEL(
-    reduce_mean,
-    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::MeanFunctor>);
-REGISTER_OP_GPU_KERNEL(reduce_mean_grad,
-                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
-                                             ops::MeanGradFunctor>);
-
-REGISTER_OP_GPU_KERNEL(
-    reduce_max,
-    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::MaxFunctor>);
-REGISTER_OP_GPU_KERNEL(reduce_max_grad,
-                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
-                                             ops::MaxOrMinGradFunctor>);
-
-REGISTER_OP_GPU_KERNEL(
-    reduce_min,
-    ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::MinFunctor>);
-REGISTER_OP_GPU_KERNEL(reduce_min_grad,
-                       ops::ReduceGradKernel<paddle::platform::GPUPlace, float,
-                                             ops::MaxOrMinGradFunctor>);
+#define REGISTER_REDUCE_GPU_KERNEL(reduce_type, functor, grad_functor)     \
+  REGISTER_OP_GPU_KERNEL(                                                  \
+      reduce_type,                                                         \
+      ops::ReduceKernel<paddle::platform::GPUPlace, float, ops::functor>); \
+  REGISTER_OP_GPU_KERNEL(reduce_type##_grad,                               \
+                         ops::ReduceGradKernel<paddle::platform::GPUPlace, \
+                                               float, ops::grad_functor>);
+// Expand the macro above once per (op, functor, grad functor) entry.
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_REDUCE_GPU_KERNEL);
diff --git a/paddle/operators/reduce_op.h b/paddle/operators/reduce_op.h
index 2fbf94e34f3961a9b3140fb682a7c479f3b71f4d..45043c440bc8017e97f8be00d08f1cb60d201e20 100644
--- a/paddle/operators/reduce_op.h
+++ b/paddle/operators/reduce_op.h
@@ -87,7 +87,7 @@ struct MaxOrMinGradFunctor {
 };
 
 template <typename Place, typename T, typename Functor>
-class ReduceKernel : public framework::OpKernel {
+class ReduceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
@@ -141,7 +141,7 @@ class ReduceKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T, typename Functor>
-class ReduceGradKernel : public framework::OpKernel {
+class ReduceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     int rank = context.Input<Tensor>("X")->dims().size();
@@ -198,3 +198,9 @@ class ReduceGradKernel : public framework::OpKernel {
 
 }  // namespace operators
 }  // namespace paddle
+// Invokes __macro(op name, functor, grad functor) once per reduce operator.
+#define FOR_EACH_KERNEL_FUNCTOR(__macro)                \
+  __macro(reduce_sum, SumFunctor, SumGradFunctor);      \
+  __macro(reduce_mean, MeanFunctor, MeanGradFunctor);   \
+  __macro(reduce_max, MaxFunctor, MaxOrMinGradFunctor); \
+  __macro(reduce_min, MinFunctor, MaxOrMinGradFunctor);
diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc
index a3c3fa2716ad9f6487e3eff2d98b2c76d964ddef..a8eb8d45eec214842ee756a260127b9d0aacb0f4 100644
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
@@ -25,8 +25,7 @@ class ReshapeOp : public framework::OperatorWithKernel {
             const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     // input check
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of ReshapeOp should not be null.");
@@ -93,8 +92,7 @@ class ReshapeGradOp : public framework::OperatorWithKernel {
                 const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) shouldn't be null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) shouldn't be null.");
diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h
index 873acf30782d390cdca5e7e864c76e1f743f9a7c..c89cdf8cab9f209667c5e09b521b8f6e30f202fd 100644
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
@@ -21,7 +21,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class ReshapeKernel : public framework::OpKernel {
+class ReshapeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* out = ctx.Output<framework::Tensor>("Out");
@@ -33,13 +33,13 @@ class ReshapeKernel : public framework::OpKernel {
     std::transform(shape.begin(), shape.end(), shape_int64.begin(),
                    [](int a) { return static_cast<int64_t>(a); });
     auto out_dims = framework::make_ddim(shape_int64);
-    out->CopyFrom<T>(*in, ctx.GetPlace());
+    out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context());
     out->Resize(out_dims);
   }
 };
 
 template <typename Place, typename T>
-class ReshapeGradKernel : public framework::OpKernel {
+class ReshapeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const {
     auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
@@ -47,7 +47,7 @@ class ReshapeGradKernel : public framework::OpKernel {
     d_x->mutable_data<T>(ctx.GetPlace());
 
     auto in_dims = d_x->dims();
-    d_x->CopyFrom<T>(*d_out, ctx.GetPlace());
+    d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context());
     d_x->Resize(in_dims);
   }
 };
diff --git a/paddle/operators/rmsprop_op.cc b/paddle/operators/rmsprop_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fd5567a365c4c843de3b8aec7fa77164f16644a4
--- /dev/null
+++ b/paddle/operators/rmsprop_op.cc
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/rmsprop_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RmspropOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Validates the op's inputs/outputs and propagates Param's dims to outputs.
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("MeanSquare"),
+                   "Input(MeanSquare) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Moment"),
+                   "Input(Moment) of RmspropOp should not be null.");
+    // The messages below use the output names exactly as they are declared.
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MomentOut"),
+                   "Output(MomentOut) of RmspropOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("MeanSquareOut"),
+                   "Output(MeanSquareOut) of RmspropOp should not be null.");
+    // Grad, Moment and MeanSquare must all have the same dims as Param.
+    auto param_dim = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dim, ctx->GetInputDim("Grad"),
+        "Param and grad input of RmspropOp should have the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("Moment"),
+                      "Param and Moment input of RmspropOp "
+                      "should have the same dimension.");
+    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("MeanSquare"),
+                      "Param and MeanSquare input of RmspropOp "
+                      "should have the same dimension.");
+    // LearningRate must hold exactly one element (product of its dims is 1).
+    auto lr_dim = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dim), 1,
+                      "Learning Rate should be a scalar.");
+    // Every output takes the dims of Param.
+    ctx->SetOutputDim("ParamOut", param_dim);
+    ctx->SetOutputDim("MomentOut", param_dim);
+    ctx->SetOutputDim("MeanSquareOut", param_dim);
+  }
+};
+
+class RmspropOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RmspropOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Param",
+             "(Tensor, default Tensor<float>) "
+             "Input parameter value that has to be updated");
+    AddInput("MeanSquare",
+             "(Tensor, default Tensor<float>)"
+             " The mean square value that gets updated");
+    AddInput("LearningRate",
+             "(Tensor, default Tensor<float>) "
+             "The learning rate should be a tensor of size 1");
+    AddInput("Grad",
+             "(Tensor, default Tensor<float>) "
+             "Input gradient of the parameter");
+    AddInput("Moment",
+             "(Tensor, default Tensor<float>) The moment that gets updated");
+    // Outputs: the updated counterparts of Param, Moment and MeanSquare.
+    AddOutput("ParamOut", "(Tensor) Output updated parameter value");
+    AddOutput("MomentOut", "(Tensor) Output updated moment");
+    AddOutput("MeanSquareOut", "(Tensor) Output Mean squared updated value");
+    // Attributes used by the update rule spelled out in the comment below.
+    AddAttr<float>("epsilon",
+                   "(float, default 1e-10) Constant "
+                   "for numerical stability.")
+        .SetDefault(1.0e-10f);
+    AddAttr<float>("decay",
+                   "(float, default 0.9) "
+                   "Discounting factor for coming gradient.")
+        .SetDefault(0.9f);
+    AddAttr<float>("momentum", "(float, default 0.0) Constant value")
+        .SetDefault(0.0f);
+    AddComment(R"DOC(
+
+RMSprop
+
+MeanSquareOut = decay * MeanSquare + (1 - decay) * Grad * Grad
+MomentOut = momentum * Moment +
+            LearningRate * Grad / sqrt(MeanSquareOut + epsilon)
+ParamOut = Param -  MomentOut
+
+The original slides that proposed RMSprop: Slide 29 of
+http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(rmsprop, ops::RmspropOp, ops::RmspropOpMaker);
+REGISTER_OP_CPU_KERNEL(rmsprop,
+                       ops::RmspropOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rmsprop_op.cu b/paddle/operators/rmsprop_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..52634a54816bcd5ad0ba82a56f1df95110112265
--- /dev/null
+++ b/paddle/operators/rmsprop_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/rmsprop_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(rmsprop,
+                       ops::RmspropOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/rmsprop_op.h b/paddle/operators/rmsprop_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..7bf2129010f994966d79ef11d5cec30159b47068
--- /dev/null
+++ b/paddle/operators/rmsprop_op.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T>
+class RmspropOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* param_out = ctx.Output<Tensor>("ParamOut");
+    auto* moment_out = ctx.Output<Tensor>("MomentOut");
+    auto* mean_square_out = ctx.Output<Tensor>("MeanSquareOut");
+    // Grad is kept as a Tensor pointer because numel() is read from it below.
+    auto grad = ctx.Input<Tensor>("Grad");
+    // Request writable buffers of type T for the outputs on ctx.GetPlace().
+    param_out->mutable_data<T>(ctx.GetPlace());
+    moment_out->mutable_data<T>(ctx.GetPlace());
+    mean_square_out->mutable_data<T>(ctx.GetPlace());
+    // Attributes; "decay" is called rho in the update expressions below.
+    float epsilon = ctx.Attr<float>("epsilon");
+    float rho = ctx.Attr<float>("decay");
+    float momentum = ctx.Attr<float>("momentum");
+    // Eigen vectors built from the input tensors via Flatten().
+    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
+    auto ms = EigenVector<T>::Flatten(*ctx.Input<Tensor>("MeanSquare"));
+    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto mom = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Moment"));
+    // Matching vectors for the outputs, plus the Eigen device to evaluate on.
+    auto p_out = EigenVector<T>::Flatten(*param_out);
+    auto mom_out = EigenVector<T>::Flatten(*moment_out);
+    auto ms_out = EigenVector<T>::Flatten(*mean_square_out);
+    auto place = ctx.GetEigenDevice<Place>();
+    // Broadcast factor for lr: one copy per element of Grad.
+    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+    // Order matters: mom_out reads ms_out, and p_out reads mom_out.
+    ms_out.device(place) = rho * ms + (1 - rho) * g * g;
+    mom_out.device(place) =
+        momentum * mom +
+        lr.broadcast(grad_dsize) * g / (ms_out + epsilon).sqrt();
+    p_out.device(place) = p - mom_out;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc
index a767009d2366e20d2ebd35f562b8df7d408f2d4e..ee61ea300c33722471189d06eb09f67a083d2a4d 100644
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -25,7 +25,7 @@ using LoDTensor = framework::LoDTensor;
 
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& inlinks,
-                   const size_t seq_len, bool infer_shape_mode) {
+                   const size_t seq_len) {
   PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
   for (size_t i = 0; i < inlinks.size(); ++i) {
     // global inputs
@@ -36,16 +36,14 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
     LoDTensor* input = input_var->GetMutable<LoDTensor>();
     f::DDim dims = input->dims();
     PADDLE_ENFORCE_EQ(static_cast<size_t>(dims[0]), seq_len,
-                      "all the inlinks be the same length");
+                      "all the inputs be the same length");
     f::DDim step_dims = slice_ddim(dims, 1, dims.size());
     for (size_t j = 0; j < seq_len; j++) {
       Tensor* step_input =
-          step_scopes[j]->NewVar(inlinks[i])->GetMutable<Tensor>();
-      if (!infer_shape_mode) {
-        // The input of operators of each step is Tensor here.
-        // Maybe need to modify Slice function.
-        *step_input = input->Slice<float>(j, j + 1);
-      }
+          step_scopes[j]->Var(inlinks[i])->GetMutable<Tensor>();
+      // The input of operators of each step is Tensor here.
+      // Maybe need to modify Slice function.
+      *step_input = input->Slice(j, j + 1);
       step_input->Resize(step_dims);
     }
   }
@@ -53,39 +51,35 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
 
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len, bool infer_shape_mode) {
+                   const size_t seq_len, const platform::DeviceContext& ctx) {
   for (size_t i = 0; i < outlinks.size(); i++) {
-    auto output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
+    auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]);
     PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.",
                             outlinks[i]);
     LoDTensor* output = output_var->GetMutable<LoDTensor>();
 
-    if (infer_shape_mode) {
-      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
-      PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
-      f::DDim step_dims =
-          step_scope_var->template GetMutable<LoDTensor>()->dims();
-      std::vector<int64_t> dims_vec = vectorize(step_dims);
-      dims_vec.insert(dims_vec.begin(), seq_len);
-      output->Resize(f::make_ddim(dims_vec));
-    } else {
-      output->mutable_data<float>(platform::CPUPlace());
-      for (size_t j = 0; j < seq_len; j++) {
-        LoDTensor* step_output =
-            step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
-        // TODO(luotao02) data type and platform::DeviceContext() should set
-        // correctly
-        (output->Slice<float>(j, j + 1))
-            .CopyFrom<float>(*step_output, platform::CPUPlace());
-      }
+    auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]);
+    PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]);
+    f::DDim step_dims =
+        step_scope_var->template GetMutable<LoDTensor>()->dims();
+    std::vector<int64_t> dims_vec = vectorize(step_dims);
+    dims_vec.insert(dims_vec.begin(), seq_len);
+    output->Resize(f::make_ddim(dims_vec));
+    output->mutable_data<float>(platform::CPUPlace());
+    for (size_t j = 0; j < seq_len; j++) {
+      LoDTensor* step_output =
+          step_scopes[j]->FindVar(outlinks[i])->GetMutable<LoDTensor>();
+      // TODO(luotao02) data type and platform::DeviceContext() should set
+      // correctly
+      (output->Slice(j, j + 1))
+          .CopyFrom(*step_output, platform::CPUPlace(), ctx);
     }
   }
 }
 
 void LinkMemories(const std::vector<Scope*>& scopes,
-                  const std::vector<rnn::MemoryAttr>& memories,
-                  const size_t step_id, const int offset,
-                  bool infer_shape_mode) {
+                  const std::vector<rnn::StateAttr>& memories,
+                  const size_t step_id, const int offset) {
   PADDLE_ENFORCE_LT(step_id, scopes.size(),
                     "step [%d] is out of range of step scopes' size [%d]",
                     step_id, scopes.size());
@@ -95,16 +89,13 @@ void LinkMemories(const std::vector<Scope*>& scopes,
       step_id + offset, scopes.size(),
       "offset [%d] is out of range, it must be less than (%d - %d)", offset,
       scopes.size(), step_id);
-  auto scope = scopes[step_id];
-  auto linked_scope = scopes[step_id + offset];
+  auto* scope = scopes[step_id];
+  auto* linked_scope = scopes[step_id + offset];
   for (auto& attr : memories) {
-    auto mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
-    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
-    if (infer_shape_mode) {
-      mem->Resize(linked_mem->dims());
-    } else {
-      mem->ShareDataWith<float>(*linked_mem);
-    }
+    auto* mem = scope->FindVar(attr.pre_var)->GetMutable<LoDTensor>();
+    auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable<LoDTensor>();
+    mem->Resize(linked_mem->dims());
+    mem->ShareDataWith(*linked_mem);
   }
 }
 
@@ -115,26 +106,26 @@ void InitArgument(const ArgumentName& name, Argument* arg,
   arg->inlinks = op.Inputs(name.inlinks);
   arg->outlinks = op.Outputs(name.outlinks);
 
-  auto boot_memories =
-      is_grad ? op.Outputs(name.boot_memories) : op.Inputs(name.boot_memories);
+  auto& boot_memories = is_grad ? op.Outputs(name.initial_states)
+                                : op.Inputs(name.initial_states);
   // attributes
-  auto memories = op.Attr<std::vector<std::string>>(name.memories);
-  auto pre_memories = op.Attr<std::vector<std::string>>(name.pre_memories);
+  auto& memories = op.Attr<std::vector<std::string>>(name.states);
+  auto& pre_memories = op.Attr<std::vector<std::string>>(name.ex_states);
 
   PADDLE_ENFORCE(memories.size() == boot_memories.size(),
-                 "the size of memories, boot_memories don't match:%d,%d",
+                 "the size of states, initial_states don't match:%d,%d",
                  memories.size(), boot_memories.size());
   PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
-                 "the size of pre_memories, boot_memories don't match:%d,%d",
+                 "the size of ex_states, initial_states don't match:%d,%d",
                  pre_memories.size(), boot_memories.size());
-  PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set");
+  PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set");
 
   for (size_t i = 0; i < memories.size(); ++i) {
-    rnn::MemoryAttr mem_attr;
+    rnn::StateAttr mem_attr;
     mem_attr.var = memories[i];
     mem_attr.pre_var = pre_memories[i];
     mem_attr.boot_var = boot_memories[i];
-    (arg->memories).push_back(mem_attr);
+    (arg->states).push_back(mem_attr);
   }
 }
 
diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h
index 9c777f1e9067a3e2ceb9d23f7bf7d3c73343c91f..fb0e158e07745d58c6211d33e385b324e492b95e 100644
--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -31,7 +31,7 @@ using Scope = framework::Scope;
  * boot memories in father scope. Other attributes are copied from Op's proto
  * attributes.
  */
-struct MemoryAttr {
+struct StateAttr {
   // name of current state variable
   std::string var;
   // name of previous step's state variable
@@ -46,7 +46,7 @@ struct Argument {
   std::string step_scopes;
   std::vector<std::string> inlinks;
   std::vector<std::string> outlinks;
-  std::vector<rnn::MemoryAttr> memories;
+  std::vector<rnn::StateAttr> states;
 };
 
 struct ArgumentName {
@@ -54,9 +54,9 @@ struct ArgumentName {
   std::string step_scopes;
   std::string inlinks;
   std::string outlinks;
-  std::string memories;       // the memory name
-  std::string pre_memories;   // the previous memory name
-  std::string boot_memories;  // the boot memory name
+  std::string states;          // the memory name
+  std::string ex_states;       // the previous memory name
+  std::string initial_states;  // the boot memory name
 };
 
 /**
@@ -64,18 +64,18 @@ struct ArgumentName {
  */
 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& inlinks,
-                   const size_t seq_len, bool infer_shape_mode);
+                   const size_t seq_len);
 
 /**
  * Process outputs of step nets and merge to variables.
  */
 void ConcatOutputs(const std::vector<Scope*>& step_scopes,
                    const std::vector<std::string>& outlinks,
-                   const size_t seq_len, bool infer_shape_mode);
+                   const size_t seq_len, const platform::DeviceContext& ctx);
 
 void LinkMemories(const std::vector<Scope*>& step_scopes,
-                  const std::vector<MemoryAttr>& memories, const size_t step_id,
-                  const int offset, bool infer_shape_mode);
+                  const std::vector<StateAttr>& memories, const size_t step_id,
+                  const int offset);
 
 void InitArgument(const ArgumentName& name, Argument* arg,
                   const framework::OperatorBase& op, bool is_grad = false);
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
deleted file mode 100644
index 1fcf0959dffd6a68d97dec4e2b5b509d06c0d09c..0000000000000000000000000000000000000000
--- a/paddle/operators/rowwise_add_op.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/rowwise_add_op.h"
-
-namespace paddle {
-namespace operators {
-
-using framework::Tensor;
-
-class RowwiseAddOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("b"),
-                   "Input(b) of RowwiseAddOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of RowwiseAddOp should not be null.");
-
-    auto x_dims = ctx->GetInputDim("X");
-    auto b_dims = ctx->GetInputDim("b");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), b_dims.size(),
-        "The rank of input `X` must be larger than the one of input `b`.");
-
-    int num_col_dims = x_dims.size() - b_dims.size();
-
-    PADDLE_ENFORCE_EQ(
-        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
-        "The width of two operands must be same");
-    PADDLE_ENFORCE_EQ(ctx->Outputs("Out").size(), 1,
-                      "The output size must be 1");
-    ctx->SetOutputDim("Out", x_dims);
-    ctx->ShareLoD("X", /*->*/ "Out");
-  }
-};
-
-class RowwiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  RowwiseAddOpMaker(framework::OpProto* proto,
-                    framework::OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The left input of row-wise add op, must be matrix");
-    AddInput("b", "The right input of row-wise add op, must be vector");
-    AddOutput("Out", "The output of row-wise add op");
-    AddComment(R"DOC(Row-wise Add operator
-
-for i in xrange(X.shape[0]):
-  Out = X[i] + b
-)DOC");
-  }
-};
-class RowwiseAddGradOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X should not be null");
-    PADDLE_ENFORCE(ctx->HasInput("b"), "b should not be null");
-    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
-                   "Input(Out@GRAD) should not be null");
-    auto x_dims = ctx->GetInputDim("X");
-    auto b_dims = ctx->GetInputDim("b");
-    PADDLE_ENFORCE_GT(
-        x_dims.size(), b_dims.size(),
-        "The rank of input `X` must be larger than the one of input `b`.");
-
-    int64_t num_col_dims = x_dims.size() - b_dims.size();
-    PADDLE_ENFORCE_EQ(
-        framework::slice_ddim(x_dims, num_col_dims, x_dims.size()), b_dims,
-        "The width of two operands must be same");
-    auto x_grad_name = framework::GradVarName("X");
-    auto b_grad_name = framework::GradVarName("b");
-    if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
-    }
-    if (ctx->HasOutput(b_grad_name)) {
-      ctx->SetOutputDim(b_grad_name, b_dims);
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-REGISTER_OP(rowwise_add, ops::RowwiseAddOp, ops::RowwiseAddOpMaker,
-            rowwise_add_grad, ops::RowwiseAddGradOp);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add, ops::RowwiseAddKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add_grad,
-    ops::RowwiseAddGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
deleted file mode 100644
index 35774b940926f77167b8f19597027e74d3477e5b..0000000000000000000000000000000000000000
--- a/paddle/operators/rowwise_add_op.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-
-template <typename Place, typename T>
-class RowwiseAddKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto out = context.Output<Tensor>("Out");
-    out->mutable_data<T>(context.GetPlace());
-    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-                       context.Input<Tensor>("b")->dims().size();
-    auto input =
-        EigenMatrix<T>::Reshape(*context.Input<Tensor>("X"), num_col_dims);
-    auto bias = EigenVector<T>::Flatten(*context.Input<Tensor>("b"));
-    auto output = EigenMatrix<T>::Reshape(*out, num_col_dims);
-
-    const int bias_size = bias.dimension(0);
-    const int rest_size = input.size() / bias_size;
-    Eigen::DSizes<int, 1> one_d(input.size());
-    Eigen::DSizes<int, 1> bcast(rest_size);
-    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
-        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
-  }
-};
-
-template <typename Place, typename T>
-class RowwiseAddGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
-    auto* db = context.Output<Tensor>(framework::GradVarName("b"));
-    int num_col_dims = context.Input<Tensor>("X")->dims().size() -
-                       context.Input<Tensor>("b")->dims().size();
-
-    auto out_grad = EigenMatrix<T>::Reshape(*dout, num_col_dims);
-    auto place = context.GetEigenDevice<Place>();
-
-    if (dx) {
-      dx->mutable_data<T>(context.GetPlace());
-      EigenMatrix<T>::Reshape(*dx, num_col_dims).device(place) = out_grad;
-    }
-
-    if (db) {
-      db->mutable_data<T>(context.GetPlace());
-      // https://eigen.tuxfamily.org/dox/unsupported/TensorBase_8h_source.html
-      // colwise add
-      Eigen::array<int, 1> dims{{0}}; /* dimension to reduce */
-      EigenVector<T>::Flatten(*db).device(place) = out_grad.sum(dims);
-    }
-  }
-};
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/save_restore_op.cc b/paddle/operators/save_restore_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..314e4e927924bf0442b7afe0184bf344e24c1521
--- /dev/null
+++ b/paddle/operators/save_restore_op.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+#include <fstream>
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+using framework::LoDTensor;
+
+inline static std::string VarToFileName(const std::string& folder_path,
+                                        const std::string& var_name) {
+  return folder_path + "/__" + var_name + "__";
+}
+
+class SaveOp : public framework::OperatorBase {
+ public:
+  SaveOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Writes each input variable of "X" to its own file under 'folderPath'.
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    const auto& var_names = this->Inputs("X");
+    for (const auto& name : var_names) {
+      PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
+                              "Can not find variable '%s' in the scope.", name);
+    }
+    std::string folder_path = this->Attr<std::string>("folderPath");
+    PADDLE_ENFORCE(!folder_path.empty(),
+                   "'folderPath' of SaveOp shouldn't be empty.");
+    VLOG(1) << "Save variables to folder: " << folder_path;
+    for (const auto& name : var_names) {
+      std::string file_name = VarToFileName(folder_path, name);
+      // Raw tensor bytes: binary mode prevents newline translation.
+      std::ofstream fout(file_name, std::ios::binary);
+      PADDLE_ENFORCE(fout.is_open(), "Fail to create file %s.", file_name);
+      const LoDTensor& tensor = scope.FindVar(name)->Get<LoDTensor>();
+      fout << tensor.SerializeToString();
+      fout.close();
+    }
+    VLOG(1) << "Complete saving variables. Items count: " << var_names.size();
+  }
+};
+
+class SaveOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SaveOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(tensor), the tensor count can be 1~INT_MAX, tensors names which "
+             "values will be saved.")
+        .AsDuplicable();
+    AddAttr<std::string>("folderPath", "the folderPath for save model.");
+    AddComment(R"DOC(
+Save the input tensors to a binary file based on input tensor names and absolute path.
+
+All the inputs can carry the LoD (Level of Details) information,
+or not.
+)DOC");
+  }
+};
+
+class RestoreOp : public framework::OperatorBase {
+ public:
+  RestoreOp(const std::string& type, const framework::VariableNameMap& inputs,
+            const framework::VariableNameMap& outputs,
+            const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  // Loads each output variable of "Out" from its file under 'folderPath'.
+  void Run(const framework::Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    const auto& var_names = this->Outputs("Out");
+    for (const auto& name : var_names) {
+      PADDLE_ENFORCE_NOT_NULL(scope.FindVar(name),
+                              "Can not find variable '%s' in the scope.", name);
+    }
+    std::string folder_path = this->Attr<std::string>("folderPath");
+    PADDLE_ENFORCE(!folder_path.empty(),
+                   "'folderPath' of RestoreOp shouldn't be empty.");
+    VLOG(1) << "Try loading variables from folder: " << folder_path;
+    for (const auto& name : var_names) {
+      std::string file_name = VarToFileName(folder_path, name);
+      // Binary mode: the file holds raw serialized tensor bytes.
+      std::ifstream fin(file_name, std::ios::binary);
+      PADDLE_ENFORCE(fin.is_open(), "Fail to open file %s.", file_name);
+      const size_t kBufferSize = 4096;  // equal to linux page size
+      char buffer[kBufferSize];
+      std::string cache;
+      // Also stops on a read error; polling eof() alone would loop forever.
+      while (fin.read(buffer, kBufferSize) || fin.gcount() > 0) {
+        cache.append(buffer, fin.gcount());
+      }
+      LoDTensor* tensor = scope.FindVar(name)->GetMutable<LoDTensor>();
+      tensor->DeserializeFromString(cache, dev_ctx.GetPlace());
+      fin.close();
+    }
+    VLOG(1) << "Complete loading variables.";
+  }
+};
+
+class RestoreOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RestoreOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out",
+              "(tensor), the tensor count can be 1~INT_MAX, tensors which "
+              "values will be restores.")
+        .AsDuplicable();
+    AddAttr<std::string>("folderPath", "the folderPath for model file.");
+    AddAttr<int>("data_type", "output tensor data type")
+        .SetDefault(framework::DataType::FP32);
+    AddComment(R"DOC(
+Restore the tensors from model file based on absolute path.
+
+All the tensors outputs may carry the LoD (Level of Details) information,
+or not.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OPERATOR(save, paddle::operators::SaveOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::SaveOpMaker);
+
+REGISTER_OPERATOR(restore, paddle::operators::RestoreOp,
+                  paddle::framework::EmptyGradOpMaker,
+                  paddle::operators::RestoreOpMaker);
diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc
index e92501e12834b92875f494de401672344f50e3b5..7f1a21bea72992307a05d50e7a0600ee763dd813 100644
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -25,8 +25,7 @@ class ScaleOp : public framework::OperatorWithKernel {
           const framework::AttributeMap &attrs)
       : OperatorWithKernel(type, inputs, outputs, attrs) {}
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of ScaleOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
@@ -41,8 +40,8 @@ class ScaleOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ScaleOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "The input tensor of scale operator.").NotInGradient();
-    AddOutput("Out", "The output tensor of scale operator.").NotInGradient();
+    AddInput("X", "The input tensor of scale operator.");
+    AddOutput("Out", "The output tensor of scale operator.");
     AddComment(R"DOC(Scale operator
 
 The equation is: Out = scale*X
@@ -52,21 +51,17 @@ The equation is: Out = scale*X
   }
 };
 
-// The operator to calculate gradients of a scale operator is just the scale
-// operator itself.
-// Grad(Out=scale(X)) => Grad(X) = scale(Grad(Out))
-template <typename AttrType>
-class ScaleGradOp : public NetOp {
+class ScaleGradMaker : public framework::SingleGradOpDescMaker {
  public:
-  ScaleGradOp(const std::string &type, const framework::VariableNameMap &inputs,
-              const framework::VariableNameMap &outputs,
-              const framework::AttributeMap &attrs)
-      : NetOp(type, inputs, outputs, attrs) {
-    AppendOp(framework::OpRegistry::CreateOp(
-        "scale", {{"X", {Input(framework::GradVarName("Out"))}}},
-        {{"Out", {Output(framework::GradVarName("X"))}}},
-        {{"scale", Attr<AttrType>("scale")}}));
-    CompleteAddOp(false);
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> grad_op(new framework::OpDescBind());
+    grad_op->SetType("scale");  // Grad(X) = scale(Grad(Out))
+    grad_op->SetInput("X", OutputGrad("Out"));
+    grad_op->SetOutput("Out", InputGrad("X"));
+    grad_op->SetAttr("scale", GetAttr("scale"));
+    return grad_op;  // owned since creation, so a throwing setter cannot leak
+  }
 };
 
@@ -75,7 +70,7 @@ class ScaleGradOp : public NetOp {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(scale, ops::ScaleOp, ops::ScaleOpMaker<float>, scale_grad,
-            ops::ScaleGradOp<float>);
+REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
+                  ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                        ops::ScaleKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/scale_op.h b/paddle/operators/scale_op.h
index 02fbdc52bbf89c9f2acc5eeaa1197e4ccbca9d31..dc6bc768997f4fdd049bb63bdc11252ab52fcda9 100644
--- a/paddle/operators/scale_op.h
+++ b/paddle/operators/scale_op.h
@@ -20,7 +20,7 @@
 namespace paddle {
 namespace operators {
 template <typename Place, typename T, typename AttrType = T>
-class ScaleKernel : public framework::OpKernel {
+class ScaleKernel : public framework::OpKernel<T> {
  public:
   virtual void Compute(const framework::ExecutionContext& context) const {
     auto* tensor = context.Output<framework::Tensor>("Out");
diff --git a/paddle/operators/scatter.cu.h b/paddle/operators/scatter.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..d95436be4f25b9df4aaef57ddb249ecf944f0666
--- /dev/null
+++ b/paddle/operators/scatter.cu.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void ScatterCUDAKernel(const T* params, const int* indices,
+                                  T* output, size_t index_size,
+                                  size_t slice_size) {
+  CUDA_1D_KERNEL_LOOP(i, index_size * slice_size) {
+    int indices_i = i / slice_size;
+    int slice_i = i - indices_i * slice_size;  // offset inside the slice
+    int scatter_i = indices[indices_i];
+    int out_i = scatter_i * slice_size + slice_i;
+    *(output + out_i) = *(params + i);
+  }
+}
+
+/**
+ * A thin wrapper on gpu tensor
+ * Return a new updated tensor from source tensor, scatter-assigned according to
+ * index
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void GPUScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                      const Tensor& index, Tensor* output) {
+  // ctx is cast to CUDADeviceContext below, so it must be on a GPU place.
+  PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()));
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
+
+  auto src_dims = src.dims();
+
+  // slice size: element count of one src row (every dim except the first)
+  int slice_size = 1;
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
+  // enough 512-thread blocks to give every copied element its own thread
+  int block = 512;
+  int n = slice_size * index_size;
+  int grid = (n + block - 1) / block;
+
+  ScatterCUDAKernel<T><<<
+      grid, block, 0,
+      reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream()>>>(
+      p_src, p_index, p_output, index_size, slice_size);
+}
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/scatter.h b/paddle/operators/scatter.h
index 6b542675c291607b35f180123cf42fee6a783a85..c1fb844ebd2ff7ca7dbdb8e8ac3c1fff4c0c6607 100644
--- a/paddle/operators/scatter.h
+++ b/paddle/operators/scatter.h
@@ -24,67 +24,42 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-
-// Implementation of CPU copy
-template <typename T>
-void CPUScatterUpdate(const paddle::framework::Tensor* src, const int* index,
-                      const size_t index_size,
-                      paddle::framework::Tensor* output) {
-  paddle::framework::DDim output_dims = output->dims();
-
-  for (size_t i = 0; i < index_size; ++i) {
-    int index_ = index[i];
-
-    paddle::framework::Tensor src_ = *src;
-    paddle::framework::Tensor output_ = *output;
-    if (index_size > 1) src_ = src->Slice<T>(i, i + 1);
-    if (output_dims[0] > 1) output_ = output->Slice<T>(index_, index_ + 1);
-
-    auto X = EigenVector<T>::Flatten(src_);
-    auto Y = EigenVector<T>::Flatten(output_);
-
-    Y = X + Y;
-  }
-}
-
-// Implementation of GPU scatter:
-template <typename T>
-void GPUScatterUpdate(const T* src, const int* index, const int slice_size,
-                      const int index_size, T* output);
 
 /**
  * Return a updated tensor from source tensor, scattered according to index:
- * dst[i] += src[index[i]]
+ * dst[index[i]] = src[i]
  * input[src]: type-T source Tensor
  * input[index]: type-int index Tensor (1-D)
  * return: output tensor
  */
 template <typename T>
-void ScatterUpdate(const platform::Place& place,
-                   const paddle::framework::Tensor* src,
-                   const paddle::framework::Tensor* index,
-                   paddle::framework::Tensor* output) {
+void ScatterAssign(const platform::DeviceContext& ctx, const Tensor& src,
+                   const Tensor& index, Tensor* output) {
+  PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()));
   // check index of shape 1-D
-  PADDLE_ENFORCE(index->dims().size() == 1);
-  int index_size = index->dims()[0];
+  PADDLE_ENFORCE(index.dims().size() == 1);
+  int index_size = index.dims()[0];
 
-  auto src_dims = src->dims();
+  auto src_dims = src.dims();
   auto dst_dims = output->dims();
 
+  const T* p_src = src.data<T>();
+  const int* p_index = index.data<int>();
+  T* p_output = output->data<T>();
+
   // check src shape and dst shape should match
   for (int i = 1; i < src_dims.size(); i++)
     PADDLE_ENFORCE(src_dims[i] == dst_dims[i]);
 
   // slice size
   size_t slice_size = 1;
-  for (int i = 0; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+  for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  const size_t slice_bytes = slice_size * sizeof(T);
 
-  if (platform::is_cpu_place(place)) {
-    CPUScatterUpdate<T>(src, index->data<int>(), index_size, output);
-  } else {
+  for (int i = 0; i < index_size; ++i) {
+    int index_ = p_index[i];
+    memcpy(p_output + index_ * slice_size, p_src + i * slice_size, slice_bytes);
   }
 }
 
diff --git a/paddle/operators/scatter_op.cc b/paddle/operators/scatter_op.cc
index 3fc4a39ebc5526bfed61ba667c3cdc214cdd056c..62e6c70b4513fdfab1c563b6b23f36292fb6486a 100644
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
@@ -22,8 +22,7 @@ class ScatterOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Ref"),
                    "Input(Ref) of ScatterOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Index"),
@@ -48,18 +47,29 @@ class ScatterOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", ref_dims);
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+  }
 };
 
 class ScatterGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     ctx->SetOutputDim(framework::GradVarName("Updates"),
                       ctx->GetInputDim("Updates"));
     ctx->SetOutputDim(framework::GradVarName("Ref"), ctx->GetInputDim("Ref"));
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Ref")->type());
+  }
 };
 
 class ScatterOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -87,8 +97,5 @@ Out[Index] = Ref[Index] + Updates
 namespace ops = paddle::operators;
 REGISTER_OP(scatter, ops::ScatterOp, ops::ScatterOpMaker, scatter_grad,
             ops::ScatterGradOp);
-REGISTER_OP_CPU_KERNEL(scatter,
-                       ops::ScatterOpKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(
-    scatter_grad,
-    ops::ScatterGradientOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(scatter, ops::ScatterOpKernel<float>);
+REGISTER_OP_CPU_KERNEL(scatter_grad, ops::ScatterGradientOpKernel<float>);
diff --git a/paddle/operators/scatter_op.cu b/paddle/operators/scatter_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..3b32ae2fb77a5d3d4c558742ec469c74d15eee07
--- /dev/null
+++ b/paddle/operators/scatter_op.cu
@@ -0,0 +1,63 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "gather.cu.h"
+#include "paddle/operators/gather_op.h"
+#include "scatter.cu.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class ScatterOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *Ref = ctx.Input<Tensor>("Ref");
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *Updates = ctx.Input<Tensor>("Updates");
+    auto *Out = ctx.Output<Tensor>("Out");
+
+    Out->ShareDataWith(*Ref);
+
+    GPUScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
+  }
+};
+
+template <typename T>
+class ScatterGradOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
+                   "This kernel only runs on GPU device.");
+    auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
+    auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
+    auto *Index = ctx.Input<Tensor>("Index");
+    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
+
+    // In place gradient: dRef = dO
+    dRef->ShareDataWith(*dOut);
+    dUpdates->mutable_data<T>(ctx.GetPlace());
+    // Gradient by Gather: dUpdates = dO[Index]
+    GPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(scatter, ops::ScatterOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(scatter_grad, ops::ScatterGradOpCUDAKernel<float>);
diff --git a/paddle/operators/scatter_op.h b/paddle/operators/scatter_op.h
index e9595638a86a4a4536ddad4e6f20fd80a54b1608..1a4f6f99bfe36cd0de2d4f2af3f6054571d8f188 100644
--- a/paddle/operators/scatter_op.h
+++ b/paddle/operators/scatter_op.h
@@ -23,36 +23,40 @@ namespace operators {
 
 using Tensor = framework::Tensor;
 
-template <typename Place, typename T>
-class ScatterOpKernel : public framework::OpKernel {
+template <typename T>
+class ScatterOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
     auto *Ref = ctx.Input<Tensor>("Ref");
     auto *Index = ctx.Input<Tensor>("Index");
     auto *Updates = ctx.Input<Tensor>("Updates");
     auto *Out = ctx.Output<Tensor>("Out");
 
     // In place output: Out = Ref, Out[Index] += Updates
-    Out->ShareDataWith<T>(*Ref);
+    Out->ShareDataWith(*Ref);
     // Apply ScatterUpdate: Out[index] += Updates[:]
-    ScatterUpdate<T>(ctx.GetPlace(), Updates, Index, Out);
+    ScatterAssign<T>(ctx.device_context(), *Updates, *Index, Out);
   }
 };
 
-template <typename Place, typename T>
-class ScatterGradientOpKernel : public framework::OpKernel {
+template <typename T>
+class ScatterGradientOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "This kernel only runs on CPU.");
     auto *dRef = ctx.Output<Tensor>(framework::GradVarName("Ref"));
     auto *dUpdates = ctx.Output<Tensor>(framework::GradVarName("Updates"));
     auto *Index = ctx.Input<Tensor>("Index");
     auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
 
     // In place gradient: dRef = dO
-    dRef->ShareDataWith<T>(*dOut);
+    dRef->ShareDataWith(*dOut);
     dUpdates->mutable_data<T>(ctx.GetPlace());
     // Gradient by Gather: dUpdates += dO[Index]
-    Gather<T>(ctx.GetPlace(), dOut, Index, dUpdates);
+    CPUGather<T>(ctx.device_context(), *dOut, *Index, dUpdates);
   }
 };
 
diff --git a/paddle/operators/scatter_test.cc b/paddle/operators/scatter_test.cc
index 26fdaff1460a297fa638181641991f732533fe52..00dbdacbfef7af826790472acc6caa285c259e0e 100644
--- a/paddle/operators/scatter_test.cc
+++ b/paddle/operators/scatter_test.cc
@@ -40,7 +40,9 @@ TEST(scatter, ScatterUpdate) {
 
   float* p_output = output->mutable_data<float>(make_ddim({4, 4}), CPUPlace());
 
-  ScatterUpdate<float>(CPUPlace(), src, index, output);
+  paddle::platform::CPUPlace cpu_place;  // on the stack: nothing to leak
+  paddle::platform::CPUDeviceContext ctx(cpu_place);
+  ScatterAssign<float>(ctx, *src, *index, output);
 
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], float(0));
   for (size_t i = 0; i < 4; ++i) EXPECT_EQ(output->data<float>()[i], float(0));
diff --git a/paddle/operators/sequence_concat_op.cc b/paddle/operators/sequence_concat_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1fce96cdfe20fc3ab33a3cd00e9a03833c9b94f8
--- /dev/null
+++ b/paddle/operators/sequence_concat_op.cc
@@ -0,0 +1,127 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_concat_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceConcatOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"),
+                   "Inputs(X) of SequenceConcatOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceConcatOp should not be null.");
+    const int level = ctx->Attrs().Get<int>("level");
+    const int axis = ctx->Attrs().Get<int>("axis");
+    PADDLE_ENFORCE(level == 0 || level == 1,
+                   "The sequence_concat operator only accepts sequence "
+                   "or a nested sequence as its input.");
+    auto ins_dims = ctx->GetInputsDim("X");
+    framework::DDim out_dims = ins_dims[0];  // Out starts as X[0]'s shape
+    PADDLE_ENFORCE(axis >= 0 && axis < out_dims.size(),
+                   "The axis should be in range [0, rank of X).");
+    for (size_t i = 1; i < ins_dims.size(); ++i)
+      out_dims[axis] += ins_dims[i][axis];  // sum the extents along axis
+    ctx->SetOutputDim("Out", out_dims);
+  }
+};
+
+class SequenceConcatOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceConcatOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(A vector of LoDTensor), the input is a vector of LoDTensor, "
+             "each of which is a variable-length sequence or nested sequence.")
+        .AsDuplicable();
+    AddOutput("Out",
+              "(A LoDTensor), the variable-length output of "
+              "sequence_concat Op.");
+    AddAttr<int>("axis",
+                 "(int, default 0) "
+                 "The axis which the inputs will be joined with. "
+                 "If axis is 0, the inputs will be joined with LoD index.")
+        .SetDefault(0);
+    AddAttr<int>("level",
+                 "(int, default 0) "
+                 "The level at which the inputs will be joined. "
+                 "If the level is 0, the inputs will be joined at the nested "
+                 "sequence level. "
+                 "If the level is 1, the inputs will be joined at the "
+                 "sequence level. "
+                 "The level should be less than the level number of inputs.")
+        .SetDefault(0);
+    AddComment(R"DOC(
+    The sequence_concat operator concatenates multiple LoDTensors.
+    It only supports sequence (LoD Tensor with level number is 1)
+    or a nested sequence (LoD tensor with level number is 2) as its input.
+    - Case1:
+      If the axis is other than 0(here, axis is 1 and level is 1),
+      each input should have the same LoD information and the LoD
+      information of the output keeps the same as the input.
+
+      LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+      LoD(x1) = {{0,2,4}, {0,1,2,3,4}}; Dims(x1) = (4,4,4)
+      LoD(Out) = {{0,2,4}, {0,1,2,3,4}}; Dims(Out) = (4,7,4)
+
+    - Case2:
+      If the axis is 0(here, level is 0), the inputs are concatenated along
+      time steps, and the LoD information of the output is re-computed.
+
+      LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+      LoD(x1) = {{0,3,5}, {0,1,2,3,5}}; Dims(x1) = (5,3,4)
+      LoD(Out) = {{0,5,9}, {0,1,2,3,4,5,6,7,9}}; Dims(Out) = (9,3,4)
+
+    - Case3:
+      If the axis is 0(here, level is 1).
+
+      LoD(x0) = {{0,2,4}, {0,1,2,3,4}}; Dims(x0) = (4,3,4)
+      LoD(x1) = {{0,3,5}, {0,1,3,4,5}}; Dims(x1) = (5,3,4)
+      LoD(Out) = {{0,5,9}, {0,2,5,7,9}}; Dims(Out) = (9,3,4)
+
+    NOTE: The levels of all the inputs should be the same.
+    )DOC");
+  }
+};
+
+class SequenceConcatGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Checks that Out@GRAD and X@GRAD exist, then gives X@GRAD the dims of X.
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "The gradient of Out should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName("X")),
+                   "The gradient of X should not be null.");
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_concat, ops::SequenceConcatOp, ops::SequenceConcatOpMaker,
+            sequence_concat_grad, ops::SequenceConcatGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat,
+    ops::SequenceConcatOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_concat_grad,
+    ops::SequenceConcatGradOpKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_concat_op.cu b/paddle/operators/sequence_concat_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8dc4764785871262d21a5631cc9e8b805ba84244
--- /dev/null
+++ b/paddle/operators/sequence_concat_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/sequence_concat_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_concat,
+    ops::SequenceConcatOpKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_concat_grad,
+    ops::SequenceConcatGradOpKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_concat_op.h b/paddle/operators/sequence_concat_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..6adf96120c99f9b84a1ff947058e65ac3ddff1d4
--- /dev/null
+++ b/paddle/operators/sequence_concat_op.h
@@ -0,0 +1,155 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+using LoD = framework::LoD;
+
+// LoD of the concatenated output; for axis != 0 it is simply ins[0]'s LoD.
+template <typename T>
+LoD concatLoD(const std::vector<const T*>& ins, const size_t axis,
+              const size_t level) {
+  auto out_lod = ins[0]->lod();
+  if (axis == 0UL) {
+    for (size_t i = 1; i < ins.size(); ++i) {
+      const auto& in_lod = ins[i]->lod();  // hoisted out of the inner loops
+      for (size_t j = 0; j < in_lod[0].size(); ++j) {
+        out_lod[0][j] += in_lod[0][j];
+      }
+      if (ins[0]->NumLevels() == 2) {
+        for (size_t j = 1; j < in_lod[1].size(); ++j) {
+          if (level == 0UL) {
+            out_lod[1].push_back(out_lod[1].back() + in_lod[1][j] -
+                                 in_lod[1][j - 1]);
+          } else if (level == 1UL) {
+            out_lod[1][j] += in_lod[1][j];  // offsets of input i, not ins[1]
+          }
+        }
+      }
+    }
+  }
+  return out_lod;
+}
+
+template <typename Place, typename T>
+class SequenceConcatOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    const size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    const size_t level = static_cast<size_t>(ctx.Attr<int>("level"));
+    const size_t n = ins.size();
+    // Check inputs 1..n-1 against ins[0]: LoD depth, rank and non-axis dims.
+    for (size_t i = 1; i < n; ++i) {
+      PADDLE_ENFORCE_EQ(ins[0]->NumLevels(), ins[i]->NumLevels(),
+                        "The levels of all the input LoDTensors "
+                        "should be the same.");
+      PADDLE_ENFORCE_EQ(ins[0]->dims().size(), ins[i]->dims().size(),
+                        "The dimension size of all the input LoDTensors "
+                        "should be the same.");
+
+      const size_t dims_size = ins[i]->dims().size();
+      for (size_t j = 0; j < dims_size; ++j) {
+        if (j == axis) continue;
+        PADDLE_ENFORCE_EQ(ins[0]->dims()[j], ins[i]->dims()[j],
+                          "Except for the dimension of the specified "
+                          "axis along which all the inputs are concatenated, "
+                          "dimensions of all the other axises of the input "
+                          "LoDTensors should be the same.");
+      }
+    }
+    PADDLE_ENFORCE_GT(ins[0]->NumLevels(), level,
+                      "The levels of all the input LoDTensors "
+                      "should be greater than the specify level");
+    // Allocate Out's buffer and attach the concatenated LoD to it.
+    out->mutable_data<T>(ctx.GetPlace());
+    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
+    out->set_lod(out_lod);
+    // For each segment i of Out at `level`, copy in segment i of every input.
+    auto out_lod_level = out_lod[level];
+    for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
+      Tensor out_t = out->Slice(static_cast<int>(out_lod_level[i]),
+                                static_cast<int>(out_lod_level[i + 1]));
+      auto out_stride = framework::stride(out_t.dims());
+      size_t offset = 0;
+      // `offset` is added to out_t's data pointer: where input j is written.
+      for (size_t j = 0; j < n; ++j) {
+        auto in_lod_level = ins[j]->lod()[level];
+        auto in_stride = framework::stride(ins[j]->dims());
+        Tensor in_t = ins[j]->Slice(static_cast<int>(in_lod_level[i]),
+                                    static_cast<int>(in_lod_level[i + 1]));
+        size_t axis_dim = in_t.dims()[axis];
+        StridedMemcpy<T>(ctx.device_context(), in_t.data<T>(), in_stride,
+                         in_t.dims(), out_stride, out_t.data<T>() + offset);
+        offset += axis_dim * in_stride[axis];
+      }
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SequenceConcatGradOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto ins = ctx.MultiInput<framework::LoDTensor>("X");
+    auto* out_grad =
+        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));
+    auto x_grads =
+        ctx.MultiOutput<framework::LoDTensor>(framework::GradVarName("X"));
+    size_t axis = static_cast<size_t>(ctx.Attr<int>("axis"));
+    size_t level = static_cast<size_t>(ctx.Attr<int>("level"));
+    const size_t n = x_grads.size();
+
+    // Give each Grad(X) the LoD of its forward input X and allocate its buffer
+    for (size_t i = 0; i < n; i++) {
+      x_grads[i]->set_lod(ins[i]->lod());
+      x_grads[i]->mutable_data<T>(ctx.GetPlace());
+    }
+    // Rebuild the forward output's LoD to locate the segments of Grad(Out).
+    auto out_lod = concatLoD<LoDTensor>(ins, axis, level);
+    auto out_lod_level = out_lod[level];
+
+    for (size_t i = 0; i < out_lod_level.size() - 1; ++i) {
+      Tensor out_grad_t =
+          out_grad->Slice(static_cast<int>(out_lod_level[i]),
+                          static_cast<int>(out_lod_level[i + 1]));
+      auto out_grad_stride = framework::stride(out_grad_t.dims());
+      size_t offset = 0;
+      // `offset` is added to out_grad_t's data pointer: where input j's part is
+      for (size_t j = 0; j < n; ++j) {
+        auto x_grad_lod_level = x_grads[j]->lod()[level];
+        auto x_grad_stride = framework::stride(x_grads[j]->dims());
+        Tensor x_grad_t =
+            x_grads[j]->Slice(static_cast<int>(x_grad_lod_level[i]),
+                              static_cast<int>(x_grad_lod_level[i + 1]));
+        size_t axis_dim = x_grad_t.dims()[axis];
+        StridedMemcpy<T>(ctx.device_context(), out_grad_t.data<T>() + offset,
+                         out_grad_stride, out_grad_t.dims(), x_grad_stride,
+                         x_grad_t.data<T>());
+        offset += axis_dim * out_grad_stride[axis];
+      }
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sequence_pool_op.cc b/paddle/operators/sequence_pool_op.cc
index 17685ea654715f6996e17f6228f266c3aa1ee424..e3f5d509a85537669237b8fd0ed44efe8abb6874 100644
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -21,12 +21,11 @@ class SequencePoolOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
-                   "Input(X) of SequenceAvgPoolOp should not be null.");
+                   "Input(X) of SequencePoolOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
-                   "Output(Out) of SequenceAvgPoolOp should not be null.");
+                   "Output(Out) of SequencePoolOp should not be null.");
     ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
   }
 };
@@ -36,11 +35,10 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
   SequencePoolOpMaker(framework::OpProto* proto,
                       framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X",
-             "A float LoDTensor, the variable-length input of SequencePoolOp");
-    AddOutput(
-        "Out",
-        "A float LoDTensor, the variable-length output of SequencePoolOp.");
+    AddInput("X", "(LoDTensor), the variable-length input of SequencePoolOp");
+    AddOutput("Out",
+              "(Tensor), output of SequencePoolOp, which does not contain LoD "
+              "information.");
     AddAttr<int>(
         "strategy",
         "(int, default AVERAGE) the pooling strategy of SequencePoolOp.")
@@ -49,13 +47,13 @@ class SequencePoolOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(
     SequencePoolOp pools features of all time-steps of each instance.
 
-    For a mini-batch of 3 variable lengths sentences, containing 2, 3, and 2 time-steps:
+    For a mini-batch of 3 variable-length sentences, containing 2, 3, and 2 time-steps:
 
-    Assume X is a [7,M,N] float LoDTensor, and X->lod()[0] = [0, 2, 5, 7].
+    Assume X is a [7,M,N] LoDTensor, and X->lod()[0] = [0, 2, 5, 7], 7=2+3+2.
     Besides, for the sake of simplicity, we assume M=1 and N=1,
     and the value of X = [[1, 3], [2, 4, 6], [5, 1]].
 
-    Thus, Out is a [3,1,1] float LoDTensor, but Out->lod() is nullptr.
+    Thus, Out is a [3,1,1] Tensor without LoD information.
     And for different strategy, the value of Out is as follows:
 
     - AVERAGE: [2, 4, 3], where 2=(1+3)/2, 4=(2+4+6)/3, 3=(5+1)/2
@@ -73,8 +71,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Gradient of Out should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("X"), "The input X should not be null.");
diff --git a/paddle/operators/sequence_pool_op.h b/paddle/operators/sequence_pool_op.h
index cb80586e88f8d9e31b7b91a54f5e05ac6fa73f0f..0de6cafe9ca83f09636a69b5579d19afde1c73b5 100644
--- a/paddle/operators/sequence_pool_op.h
+++ b/paddle/operators/sequence_pool_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -38,7 +39,7 @@ enum SeqPoolType {
 };
 
 template <typename Place, typename T>
-class SequencePoolKernel : public framework::OpKernel {
+class SequencePoolKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
@@ -63,9 +64,9 @@ class SequencePoolKernel : public framework::OpKernel {
     out->mutable_data<T>(context.GetPlace());
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
-      Tensor in_t = in->Slice<T>(static_cast<int>(lod_level_0[i]),
-                                 static_cast<int>(lod_level_0[i + 1]));
-      Tensor out_t = out->Slice<T>(i, i + 1);
+      Tensor in_t = in->Slice(static_cast<int>(lod_level_0[i]),
+                              static_cast<int>(lod_level_0[i + 1]));
+      Tensor out_t = out->Slice(i, i + 1);
       int64_t h = static_cast<int64_t>(lod_level_0[i + 1] - lod_level_0[i]);
       auto in_e = EigenMatrix<T>::From(in_t, framework::make_ddim({h, w}));
       auto out_e = EigenVector<T>::Flatten(out_t);
@@ -77,6 +78,16 @@ class SequencePoolKernel : public framework::OpKernel {
         case SUM:
           out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}}));
           break;
+        case SQRT:
+          out_e.device(place) = in_e.sum(Eigen::array<int, 1>({{0}})) /
+                                std::sqrt(static_cast<T>(h));
+          break;
+        case LAST:
+          out_e.device(place) = in_e.chip(h - 1, 0);
+          break;
+        case FIRST:
+          out_e.device(place) = in_e.chip(0, 0);
+          break;
         default:
           PADDLE_THROW("unsupported pooling strategy");
       }
@@ -85,7 +96,7 @@ class SequencePoolKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class SequencePoolGradKernel : public framework::OpKernel {
+class SequencePoolGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in = context.Input<LoDTensor>("X");
@@ -98,11 +109,16 @@ class SequencePoolGradKernel : public framework::OpKernel {
     int64_t w = in->numel() / dims[0];
 
     in_g->mutable_data<T>(context.GetPlace());
+    if (strategy == LAST || strategy == FIRST) {
+      // set X@Grad be zero at first when strategy is LAST/FIRST
+      math::SetConstant<Place, T> functor;
+      functor(context.device_context(), in_g, 0);
+    }
     auto place = context.GetEigenDevice<Place>();
     for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
-      auto in_g_t = in_g->Slice<T>(static_cast<int>(lod[i]),
-                                   static_cast<int>(lod[i + 1]));
-      auto out_g_t = out_g->Slice<T>(i, i + 1);
+      auto in_g_t =
+          in_g->Slice(static_cast<int>(lod[i]), static_cast<int>(lod[i + 1]));
+      auto out_g_t = out_g->Slice(i, i + 1);
       int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
       auto in_g_e = EigenMatrix<T>::From(in_g_t, {h, w});
       auto out_g_e = EigenMatrix<T>::From(out_g_t, {1, w});
@@ -115,6 +131,16 @@ class SequencePoolGradKernel : public framework::OpKernel {
         case SUM:
           in_g_e.device(place) = (out_g_e).broadcast(bcast);
           break;
+        case SQRT:
+          in_g_e.device(place) =
+              (out_g_e / std::sqrt(static_cast<T>(h))).broadcast(bcast);
+          break;
+        case LAST:
+          in_g_e.chip(h - 1, 0).device(place) = out_g_e;
+          break;
+        case FIRST:
+          in_g_e.chip(0, 0).device(place) = out_g_e;
+          break;
         default:
           PADDLE_THROW("unsupported pooling strategy");
       }
diff --git a/paddle/operators/sequence_softmax_op.cc b/paddle/operators/sequence_softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c891ab1fdcbb167453462c45b00b4632e663dd0e
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.cc
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceSoftmaxOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceSoftmaxOp should not be null.");
+    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));  // Out has X's shape
+    ctx->ShareLoD("X", /*->*/ "Out");  // and X's LoD is shared with Out
+  }
+};
+
+class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceSoftmaxOpMaker(framework::OpProto* proto,
+                         framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(LoDTensor) 1-D or 2-D input LoDTensor with the 2-nd dimension "
+             "of length 1.");
+    AddOutput("Out",
+              "(LoDTensor) 1-D or 2-D output LoDTensor with the 2-nd dimension "
+              "of length 1.");
+    AddComment(R"DOC(
+SequenceSoftmaxOp computes softmax activation among all time-steps for each
+sequence. The dimension of each time-step should be 1. Thus, the shape of
+input Tensor can be either [N, 1] or [N], where N is the sum of all sequences'
+lengths.
+
+Equation:
+    for i-th sequence in a mini-batch:
+        Out[lod[i]:lod[i+1], :] =
+            exp(X[lod[i]:lod[i+1], :]) / sum(exp(X[lod[i]:lod[i+1], :]))
+
+For example, for a mini-batch of 3 sequences with variable-length,
+each containing 2, 3, 2 time-steps, the lod of which is [0, 2, 5, 7],
+then softmax will be computed among X[0:2, :], X[2:5, :], X[5:7, :]
+and N turns out to be 7.
+)DOC");
+  }
+};
+
+class SequenceSoftmaxGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Out"),
+                   "Input(Out) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasInput(framework::GradVarName("Out")),
+        "Input(Out@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->HasOutput(framework::GradVarName("X")),
+        "Output(X@GRAD) of SequenceSoftmaxGradOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputDim("Out"),
+        ctx->GetInputDim(framework::GradVarName("Out")),
+        "Input(Out) and Input(Out@GRAD) of SequenceSoftmaxGradOp should be of "
+        "the same shape.");
+    // X@GRAD takes the shape of the forward input X.
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sequence_softmax, ops::SequenceSoftmaxOp,
+            ops::SequenceSoftmaxOpMaker, sequence_softmax_grad,
+            ops::SequenceSoftmaxGradOp);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sequence_softmax_op.cu b/paddle/operators/sequence_softmax_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f2a1e3d5e31ef21b95a51b287bdd1d4aa9221e89
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.cu
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/sequence_softmax_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    sequence_softmax,
+    ops::SequenceSoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sequence_softmax_grad,
+    ops::SequenceSoftmaxGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sequence_softmax_op.h b/paddle/operators/sequence_softmax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..3eb1e2844dff6ac94e86dcf4586bb51bc33adbec
--- /dev/null
+++ b/paddle/operators/sequence_softmax_op.h
@@ -0,0 +1,94 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/softmax.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+using LoDTensor = framework::LoDTensor;
+
+template <typename Place, typename T>
+class SequenceSoftmaxKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+    auto lod = x->lod();
+    auto dims = x->dims();
+    PADDLE_ENFORCE_GT(lod.size(), 0UL,
+                      "Input(X) of SequenceSoftmaxOp should have LoD.");
+    const size_t level = lod.size() - 1;  // last LoD level; cannot wrap now
+    PADDLE_ENFORCE_EQ(dims[0], static_cast<int64_t>(lod[level].back()),
+                      "The first dimension of Input(X) should be equal to the "
+                      "sum of all sequences' lengths.");
+    PADDLE_ENFORCE_EQ(dims[0], x->numel(),
+                      "The width of each timestep in Input(X) of "
+                      "SequenceSoftmaxOp should be 1.");
+    // Apply the softmax functor separately to each sequence's slice.
+    out->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+      Tensor x_i = x->Slice(start_pos, end_pos);
+      Tensor out_i = out->Slice(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      x_i.Resize(dims_i);
+      out_i.Resize(dims_i);
+      math::SoftmaxFunctor<Place, T>()(ctx.device_context(), &x_i, &out_i);
+    }
+  }
+};
+
+template <typename Place, typename T>
+class SequenceSoftmaxGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* out = ctx.Input<LoDTensor>("Out");
+    auto* out_grad = ctx.Input<LoDTensor>(framework::GradVarName("Out"));
+    auto* x = ctx.Input<LoDTensor>("X");
+    auto* x_grad = ctx.Output<LoDTensor>(framework::GradVarName("X"));
+    auto lod = x->lod();
+    PADDLE_ENFORCE_GT(lod.size(), 0UL,
+                      "Input(X) of SequenceSoftmaxGradOp should have LoD.");
+    const size_t level = lod.size() - 1;  // last LoD level; cannot wrap now
+    x_grad->mutable_data<T>(ctx.GetPlace());
+    for (int i = 0; i < static_cast<int>(lod[level].size()) - 1; ++i) {
+      int start_pos = static_cast<int>(lod[level][i]);
+      int end_pos = static_cast<int>(lod[level][i + 1]);
+
+      Tensor out_i = out->Slice(start_pos, end_pos);
+      Tensor out_grad_i = out_grad->Slice(start_pos, end_pos);
+      Tensor x_grad_i = x_grad->Slice(start_pos, end_pos);
+
+      // Reshape from (end_pos - start_pos) x 1UL to 1UL x (end_pos - start_pos)
+      framework::DDim dims_i = framework::make_ddim({1UL, end_pos - start_pos});
+      out_i.Resize(dims_i);
+      out_grad_i.Resize(dims_i);
+      x_grad_i.Resize(dims_i);
+      math::SoftmaxGradFunctor<Place, T>()(ctx.device_context(), &out_i,
+                                           &out_grad_i, &x_grad_i);
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
index 3bce95535cf10c0df95b503c6e362b3f0ba2e723..2acb96d1b4f5903ff6c57b10e7621c8adaf73171 100644
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -21,30 +21,34 @@ class SGDOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("param"),
-                   "Input(param) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("grad"),
-                   "Input(grad) of SGDOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("param_out"),
-                   "Output(param_out) of SGDOp should not be null.");
-
-    auto param_dim = ctx->GetInputDim("param");
-    PADDLE_ENFORCE_EQ(param_dim, ctx->GetInputDim("grad"),
-                      "Two input of SGD Op's dimension must be same.");
-    ctx->SetOutputDim("param_out", param_dim);
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Param"),
+                   "Input(Param) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                   "Input(Grad) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                   "Input(LearningRate) of SGDOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                   "Output(ParamOut) of SGDOp should not be null.");
+    // LearningRate must have one element; ParamOut takes Param's shape.
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 element");
+    auto param_dim = ctx->GetInputDim("Param");
+    // TODO(qijun): check dimensions of Param and Grad at compile
+    // and run time.
+    ctx->SetOutputDim("ParamOut", param_dim);
   }
 };
 
 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+  SGDOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("param", "input parameter");
-    AddInput("grad", "input gradient");
-    AddOutput("param_out", "output parameter");
-    AddAttr<float>("learning_rate", "learning rate of sgd");
+    AddInput("Param", "Input parameter");
+    AddInput("LearningRate", "Learning rate of SGD");
+    AddInput("Grad", "Input gradient");
+    AddOutput("ParamOut", "output parameter");
     AddComment(R"DOC(
 
 Simplest sgd algorithm.
@@ -54,6 +58,38 @@ param_out = param - learning_rate * grad;
 )DOC");
   }
 };
+
+template <typename T>
+struct SparseSGDFunctor<platform::CPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input,
+                  const framework::Tensor& learning_rate,
+                  framework::Tensor* output) {
+    auto in_height = input.height();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+    auto& in_value = input.value();
+    auto& in_rows = input.rows();
+    // Empty gradient: nothing to update (also avoids dividing by zero below).
+    if (in_rows.size() == 0) return;
+
+    int64_t in_row_numel = in_value.numel() / in_rows.size();
+    PADDLE_ENFORCE_EQ(in_row_numel, output->numel() / in_height);
+    auto* in_data = in_value.data<T>();
+    auto* out_data = output->data<T>();
+    auto* lr = learning_rate.data<T>();
+    // In-place update: output[in_rows[i]] -= lr[0] * in_value[i], row by row.
+    for (size_t i = 0; i < in_rows.size(); i++) {
+      for (int64_t j = 0; j < in_row_numel; j++) {
+        out_data[in_rows[i] * in_row_numel + j] -=
+            lr[0] * in_data[i * in_row_numel + j];
+      }
+    }
+  }
+};
+
+template struct SparseSGDFunctor<platform::CPUPlace, float>;
+
 }  // namespace operators
 }  // namespace paddle
 
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
index f5ba6d3c29f8dfbfdea4fbf2c3d5fd7f5b358666..106f9b746ba6614d8fa68b677c47ec04ed26fb81 100644
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -14,6 +14,66 @@
 
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+
+namespace {
+// One thread block per selected row, indexed by blockIdx.x (grid.y is capped
+// at 65535 blocks); the block's threads stride over that row's elements.
+template <typename T>
+__global__ void SparseSGDFunctorKernel(const T* selected_rows,
+                                       const int64_t* rows,
+                                       const T* learning_rate, T* tensor_out,
+                                       int64_t row_numel, int block_size) {
+  const int row = blockIdx.x;
+  selected_rows += row * row_numel;
+  tensor_out += rows[row] * row_numel;
+
+  for (int index = threadIdx.x; index < row_numel; index += block_size) {
+    // Since index in rows of SelectedRows can be duplicate, we have to use
+    // Atomic Operation to avoid concurrent write error.
+    paddle::platform::CudaAtomicAdd(
+        tensor_out + index, -learning_rate[0] * selected_rows[index]);
+  }
+}
+}  // namespace
+
+template <typename T>
+struct SparseSGDFunctor<platform::GPUPlace, T> {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input,
+                  const framework::Tensor& learning_rate,
+                  framework::Tensor* output) {
+    auto in_height = input.height();
+    auto out_dims = output->dims();
+    PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
+    auto& in_value = input.value();
+    auto& in_rows = input.rows();
+    // Empty gradient: nothing to update. Returning early also avoids dividing
+    // by zero below and launching a kernel with an empty grid.
+    if (in_rows.size() == 0) return;
+
+    int64_t in_row_numel = in_value.numel() / in_rows.size();
+    PADDLE_ENFORCE_EQ(in_row_numel, output->numel() / in_height);
+    auto* in_data = in_value.data<T>();
+    auto* out_data = output->data<T>();
+    int block_size = 256;
+    dim3 threads(block_size, 1);
+    dim3 grid(in_rows.size(), 1);
+    SparseSGDFunctorKernel<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(in_data, in_rows.data(), learning_rate.data<T>(),
+                              out_data, in_row_numel, block_size);
+  }
+};
+
+template struct SparseSGDFunctor<platform::GPUPlace, float>;
+
+}  // namespace operators
+}  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(sgd,
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
index f8888f9c362e1c39af42236bb3a23be37aa3ae15..78b595fc6c63d775b627f23cafa9458f1dadd4e5 100644
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -15,34 +15,53 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/framework/selected_rows.h"
 
 namespace paddle {
 namespace operators {
 
-using Tensor = framework::Tensor;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename Place, typename T>
+struct SparseSGDFunctor {
+  void operator()(const platform::DeviceContext& context,
+                  const framework::SelectedRows& input,
+                  const framework::Tensor& learning_rate,
+                  framework::Tensor* output);
+};
 
 template <typename Place, typename T>
-class SGDOpKernel : public framework::OpKernel {
+class SGDOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param = ctx.Input<Tensor>("param");
-    auto grad = ctx.Input<Tensor>("grad");
-    auto param_out = ctx.Output<Tensor>("param_out");
-    float lr = ctx.Attr<float>("learning_rate");
+    auto* param = ctx.Input<framework::Tensor>("Param");
+    auto* param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
 
-    param_out->mutable_data<T>(ctx.GetPlace());
+    auto* grad_var = ctx.InputVar("Grad");
+    // Actually, all tensors are LoDTensor except SelectedRows.
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      param_out->mutable_data<T>(ctx.GetPlace());
+      auto* grad = ctx.Input<framework::Tensor>("Grad");
 
-    auto p = EigenVector<T>::Flatten(*param);
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto o = EigenVector<T>::Flatten(*param_out);
-    auto place = ctx.GetEigenDevice<Place>();
+      auto p = framework::EigenVector<T>::Flatten(*param);
+      auto g = framework::EigenVector<T>::Flatten(*grad);
+      auto o = framework::EigenVector<T>::Flatten(*param_out);
+      auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
+      auto place = ctx.GetEigenDevice<Place>();
 
-    o.device(place) = p - lr * g;
+      Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+      o.device(place) = p - lr.broadcast(grad_dsize) * g;
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // TODO(qijun): In Sparse SGD operator, in-place update is enforced.
+      // This manual optimization brings difficulty to track data dependency.
+      // It's better to find a more elegant solution.
+      PADDLE_ENFORCE_EQ(param, param_out);
+      auto* grad = ctx.Input<framework::SelectedRows>("Grad");
+      SparseSGDFunctor<Place, T> functor;
+      functor(ctx.device_context(), *grad, *learning_rate, param_out);
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
   }
 };
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e781c8db208464cb94d94d1914e50f5aba3db2c6
--- /dev/null
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -0,0 +1,148 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+
+namespace paddle {
+namespace operators {
+
+using framework::Tensor;
+
+class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
+    // X and Labels must both be rank-2 and agree in every dimension.
+    auto logits_shape = ctx->GetInputDim("X");
+    auto target_shape = ctx->GetInputDim("Labels");
+    PADDLE_ENFORCE_EQ(logits_shape.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(target_shape.size(), 2,
+                      "Input(Labels)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(logits_shape[0], target_shape[0],
+                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(logits_shape[1], target_shape[1],
+                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    // Out has the same shape as X and shares X's LoD.
+    ctx->SetOutputDim("Out", logits_shape);
+    ctx->ShareLoD("X", /*->*/ "Out");
+  }
+};
+
+class SigmoidCrossEntropyWithLogitsGradOp
+    : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should be not null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
+                   "Input(Out@GRAD) should be not null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Output(X@GRAD) should be not null.");
+    // X, Labels and Out@GRAD must all be rank-2 tensors of the same shape.
+    auto x_dims = ctx->GetInputDim("X");
+    auto labels_dims = ctx->GetInputDim("Labels");
+    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
+    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
+                      "Input(Labels)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
+                      "Input(Out@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
+                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
+                      "The 1st dimension of Input(X) and Input(Out@Grad) "
+                      "should be equal.");
+    PADDLE_ENFORCE_EQ(x_dims[1], dout_dims[1],
+                      "The 2nd dimension of Input(X) and Input(Out@Grad) "
+                      "should be equal.");
+    // X@GRAD takes the shape of X.
+    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+  }
+};
+
+class SigmoidCrossEntropyWithLogitsOpMaker
+    : public framework::OpProtoAndCheckerMaker {
+ public:
+  SigmoidCrossEntropyWithLogitsOpMaker(framework::OpProto* proto,
+                                       framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D, "
+             "where N is the batch size and D is the number of classes. "
+             "This input is a tensor of logits computed by the previous "
+             "operator. Logits are unscaled log probabilities given as "
+             "log(p/(1-p)).");
+    AddInput("Labels",
+             "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
+             "and shape as X. This input is a tensor of probabilistic labels "
+             "for each logit.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>), a 2-D tensor with shape N x D "
+              "of elementwise logistic losses.");
+    AddComment(R"DOC(
+SigmoidCrossEntropyWithLogits Operator.
+
+This measures the elementwise probability error in discrete classification tasks
+in which each class is independent. This can be thought of as predicting labels
+for a data-point that are not mutually exclusive. For example, a news article
+can be about politics, technology or sports at the same time or none of these.
+
+The logistic loss is given as follows:
+
+       loss = -Labels * log(sigmoid(X)) - (1 - Labels) * log(1 - sigmoid(X))
+
+We know that sigmoid(X) = (1 / (1 + exp(-X))). By substituting this we get
+
+       loss = X - X * Labels + log(1 + exp(-X))
+
+For stability and to prevent overflow of exp(-X) when X < 0,
+we can reformulate the loss as follows:
+
+       loss = max(X, 0) - X * Labels + log(1 + exp(-abs(X)))
+
+Both the input `X` and `Labels` can carry the LoD (Level of Details) information.
+However the output only shares the LoD with input `X`.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(sigmoid_cross_entropy_with_logits,
+            ops::SigmoidCrossEntropyWithLogitsOp,
+            ops::SigmoidCrossEntropyWithLogitsOpMaker,
+            sigmoid_cross_entropy_with_logits_grad,
+            ops::SigmoidCrossEntropyWithLogitsGradOp);
+REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits,
+                       ops::SigmoidCrossEntropyWithLogitsKernel<
+                           paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                           paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..32a39956a14a206373b7b4c141dad19577d171f0
--- /dev/null
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cu
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/sigmoid_cross_entropy_with_logits_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits,
+                       ops::SigmoidCrossEntropyWithLogitsKernel<
+                           paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(sigmoid_cross_entropy_with_logits_grad,
+                       ops::SigmoidCrossEntropyWithLogitsGradKernel<
+                           paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..41c619f181c878f08959a8ca461c60af5ffdff2a
--- /dev/null
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+// Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))), computed elementwise.
+template <typename Place, typename T>
+class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *Labels =
+        context.Input<framework::Tensor>("Labels");
+    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
+    Out->mutable_data<T>(context.GetPlace());
+    // Flatten every tensor to a 1-D Eigen view before combining them.
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto place = context.GetEigenDevice<Place>();
+
+    // term1 = max(x, 0)
+    auto term1 = x.cwiseMax(static_cast<T>(0));
+    // term2 = x * labels
+    auto term2 = x * labels;
+    // term3 = log(1 + exp(-abs(x)))
+    auto term3 = (static_cast<T>(1) + (-(x.abs())).exp()).log();
+
+    out.device(place) = term1 - term2 + term3;
+  }
+};
+
+// dX = dOut * (sigmoid(X) - Labels), computed elementwise.
+template <typename Place, typename T>
+class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &context) const override {
+    const framework::Tensor *X = context.Input<framework::Tensor>("X");
+    const framework::Tensor *Labels =
+        context.Input<framework::Tensor>("Labels");
+    const framework::Tensor *dOut =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor *dX =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    dX->mutable_data<T>(context.GetPlace());
+
+    auto x = framework::EigenVector<T>::Flatten(*X);
+    auto labels = framework::EigenVector<T>::Flatten(*Labels);
+    auto dout = framework::EigenVector<T>::Flatten(*dOut);
+    auto dx = framework::EigenVector<T>::Flatten(*dX);
+    auto place = context.GetEigenDevice<Place>();
+    // sigmoid(x) = 1 / (1 + exp(-x))
+    auto sigmoid_x = static_cast<T>(1) / (static_cast<T>(1) + (-x).exp());
+    dx.device(place) = dout * (sigmoid_x - labels);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc
index 2d197e3b1b763fa87939623d47728aab3bff7cd1..758481943d463f22eb6c6e0be9a99ad99161da5b 100644
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -21,8 +21,7 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
 
@@ -63,11 +62,13 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("InsideWeight",
              "Optional input tensor of smooth l1 loss op with the same shape "
              "as X. If provided, the result of (X - Y) will be multiplied "
-             "by this tensor element by element.");
+             "by this tensor element by element.")
+        .AsDispensable();
     AddInput("OutsideWeight",
              "Optinal input of smooth l1 loss op with the same shape as X."
              "If provided, the output smooth l1 loss will be multiplied by "
-             "this tensor element by element.");
+             "this tensor element by element.")
+        .AsDispensable();
     AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
         .AsIntermediate();
     AddOutput("Out", "Smooth l1 loss.");
@@ -93,8 +94,7 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     auto in_dims = ctx->GetInputDim("X");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
 
diff --git a/paddle/operators/smooth_l1_loss_op.h b/paddle/operators/smooth_l1_loss_op.h
index 0604fb5e1c2f17c702208520a1d23bd5c3c65b5d..39d0070b6c8909b8f433de48038240e851d9d6cf 100644
--- a/paddle/operators/smooth_l1_loss_op.h
+++ b/paddle/operators/smooth_l1_loss_op.h
@@ -45,7 +45,7 @@ struct SmoothL1LossForward {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class SmoothL1LossKernel : public framework::OpKernel {
+class SmoothL1LossKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
@@ -115,7 +115,7 @@ struct SmoothL1LossBackward {
 };
 
 template <typename Place, typename T, typename AttrType = T>
-class SmoothL1LossGradKernel : public framework::OpKernel {
+class SmoothL1LossGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("InsideWeight");
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index e353afee3e10247fbd5c7f4282c366cd1bc39552..00fd0b32a9b3c0dd9fedf7b7621b1f15e5c4ce93 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -21,8 +21,7 @@ class SoftmaxOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SoftmaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Y"),
@@ -68,8 +67,7 @@ class SoftmaxOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")),
                    "Input(Y@GRAD) should be not null.");
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 7220f486be055e1b841a06b15f519717c54f575c..2c08853f4f615bfe95f51aa20776ddddcdaa8f61 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -26,46 +26,31 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto X = context.Input<Tensor>("X");
-    auto Y = context.Output<Tensor>("Y");
+    auto* X = context.Input<Tensor>("X");
+    auto* Y = context.Output<Tensor>("Y");
 
     // allocate memory on device.
     Y->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxFunctor<Place, T>()(context, X, Y);
+    math::SoftmaxFunctor<Place, T>()(context.device_context(), X, Y);
   }
 };
 
 template <typename Place, typename T>
-class SoftmaxGradKernel : public framework::OpKernel {
+class SoftmaxGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    auto Y = context.Input<Tensor>("Y");
-    auto dY = context.Input<Tensor>(framework::GradVarName("Y"));
-    auto dX = context.Output<Tensor>(framework::GradVarName("X"));
-    dX->mutable_data<T>(context.GetPlace());
-
-    const int batch_size = Y->dims()[0];
-    const int class_num = Y->dims()[1];
-
-    Eigen::DSizes<int, 1> along_class(1);
-    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
-    Eigen::DSizes<int, 2> one_by_class(1, class_num);
+    auto* Y = context.Input<Tensor>("Y");
+    auto* dY = context.Input<Tensor>(framework::GradVarName("Y"));
+    auto* dX = context.Output<Tensor>(framework::GradVarName("X"));
 
-    auto Y_eigen = EigenMatrix<T>::From(*Y);
-    auto dY_eigen = EigenMatrix<T>::From(*dY);
-    auto dX_eigen = EigenMatrix<T>::From(*dX);
-    auto place = context.GetEigenDevice<Place>();
+    // allocate memory on device.
+    dX->mutable_data<T>(context.GetPlace());
 
-    auto dot = (Y_eigen * dY_eigen)
-                   .sum(along_class)
-                   .eval()
-                   .reshape(batch_by_one)
-                   .broadcast(one_by_class);
-    dX_eigen.device(place) = (dY_eigen - dot) * Y_eigen;
+    math::SoftmaxGradFunctor<Place, T>()(context.device_context(), Y, dY, dX);
   }
 };
 
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cc b/paddle/operators/softmax_with_cross_entropy_op.cc
index e2299b254458cdd42dee4683561d4d5c81653fb1..942fbb42df8bb90b86bd097832a15b320a857750 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -13,6 +13,8 @@
    limitations under the License. */
 
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
+#include <paddle/function/TensorType.h>
+#include <iostream>
 
 namespace paddle {
 namespace operators {
@@ -26,15 +28,14 @@ class SoftmaxWithCrossEntropyOpMaker
     AddInput("Logits",
              "(Tensor, default: Tensor<float>), The unscaled log probabilities "
              "which is a 2-D tensor with shape [N x K]. N is the batch_size, "
-             "and K is the class number.")
-        .NotInGradient();
-    AddInput(
-        "Label",
-        "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
-        "tensor. "
-        "If softLable is set to 0, Label is a Tensor<int> with shape [N x 1]. "
-        "If softLable is set to 1, Label is a Tensor<float/double> "
-        "with shape [N x K].");
+             "and K is the class number.");
+    AddInput("Label",
+             "(Tensor, default: Tensor<int>), The ground truth which is a 2-D "
+             "tensor. "
+             "If soft_label is set to 0, Label is a Tensor<int> with shape [N "
+             "x 1]. "
+             "If soft_label is set to 1, Label is a Tensor<float/double> "
+             "with shape [N x K].");
     AddOutput(
         "Softmax",
         "(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
@@ -45,7 +46,7 @@ class SoftmaxWithCrossEntropyOpMaker
               "(Tensor, default: Tensor<float>), A 2-D tensor. The cross "
               "entropy loss with shape [N x 1].");
     AddAttr<bool>(
-        "softLabel",
+        "soft_label",
         "(bool, default: false), A flag to indicate whether to interpretate "
         "the given labels as soft labels.")
         .SetDefault(false);
@@ -81,8 +82,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Logits"),
                    "Input(Logits) should be not null.");
     PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
@@ -99,13 +99,13 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
                       "The labels should be a 2-D tensor.");
 
-    if (ctx->Attrs().Get<bool>("softLabel")) {
+    if (ctx->Attrs().Get<bool>("soft_label")) {
       PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1],
-                        "If Attr(softLabel) == true, the 2nd dimension of "
+                        "If Attr(soft_label) == true, the 2nd dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
-                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "If Attr(soft_label) == false, the 2nd dimension of "
                         "Input(Label) should be 1.");
     }
 
@@ -115,14 +115,19 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
     ctx->ShareLoD("Logits", /*->*/ "Softmax");
     ctx->ShareLoD("Logits", /*->*/ "Loss");
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(ctx.Input<Tensor>("Logits")->type());
+  }
 };
 
 class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
                    "Input(Loss@Grad) should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Softmax"),
@@ -136,19 +141,45 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
                       "The labels should be a 2-D tensor.");
 
-    if (ctx->Attrs().Get<bool>("softLabel")) {
+    if (ctx->Attrs().Get<bool>("soft_label")) {
       PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1],
-                        "When Attr(softLabel) == true, the 2nd dimension of "
+                        "When Attr(soft_label) == true, the 2nd dimension of "
                         "Input(X) and Input(Label) should be equal.");
     } else {
       PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
-                        "When Attr(softLabel) == false, the 2nd dimension of "
+                        "When Attr(soft_label) == false, the 2nd dimension of "
                         "Input(Label) should be 1.");
     }
 
     ctx->SetOutputDim(framework::GradVarName("Logits"),
                       ctx->GetInputDim("Softmax"));
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::ToDataType(
+        ctx.Input<Tensor>(framework::GradVarName("Loss"))->type());
+  }
+};
+
+// Describes the single softmax_with_cross_entropy_grad op for the backward pass.
+class SoftmaxGradMaker : public framework::SingleGradOpDescMaker {
+ public:
+  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
+ protected:
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
+    std::unique_ptr<framework::OpDescBind> op(new framework::OpDescBind());
+    op->SetType("softmax_with_cross_entropy_grad");
+    op->SetInput("Label", Input("Label"));
+    op->SetInput("Softmax", Output("Softmax"));
+    op->SetInput("Loss", Output("Loss"));
+    op->SetInput(framework::GradVarName("Softmax"), OutputGrad("Softmax"));
+    op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss"));
+    op->SetOutput(framework::GradVarName("Logits"), InputGrad("Logits"));
+    op->SetAttrMap(Attrs());
+    return op;
+  }
 };
 
 }  // namespace operators
@@ -156,10 +187,10 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
 
 namespace ops = paddle::operators;
 
-REGISTER_OP(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
-            ops::SoftmaxWithCrossEntropyOpMaker,
-            softmax_with_cross_entropy_grad,
-            ops::SoftmaxWithCrossEntropyOpGrad);
+REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp,
+                  ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker);
+REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
+                  ops::SoftmaxWithCrossEntropyOpGrad);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy,
                        ops::SoftmaxWithCrossEntropyKernel<float>);
 REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad,
diff --git a/paddle/operators/softmax_with_cross_entropy_op.cu b/paddle/operators/softmax_with_cross_entropy_op.cu
index 1cf4296dccf68aece6fdfb7910a9c68449633b76..68ac2b0ea36dda55ac1161eecb80f03178b4f303 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.cu
+++ b/paddle/operators/softmax_with_cross_entropy_op.cu
@@ -53,7 +53,7 @@ __global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
 }  // namespace
 
 template <typename T>
-class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
@@ -66,14 +66,16 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxFunctor<platform::GPUPlace, T>()(context, logits, softmax);
+    math::SoftmaxFunctor<platform::GPUPlace, T>()(context.device_context(),
+                                                  logits, softmax);
     math::CrossEntropyFunctor<platform::GPUPlace, T>()(
-        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+        context.device_context(), loss, softmax, labels,
+        context.Attr<bool>("soft_label"));
   }
 };
 
 template <typename T>
-class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
@@ -83,7 +85,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel {
         context.Input<Tensor>(framework::GradVarName("Loss"))->data<T>();
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith<T>(*context.Input<Tensor>("Softmax"));
+    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
     T* logit_grad_data = logit_grad->data<T>();
 
     const int batch_size = logit_grad->dims()[0];
@@ -91,7 +93,7 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel {
     int block = 512;
     int grid = (batch_size * class_num + block - 1) / block;
 
-    if (context.Attr<bool>("softLabel")) {
+    if (context.Attr<bool>("soft_label")) {
       const T* label_data = labels->data<T>();
       SoftCrossEntropyGradientKernel<T><<<
           grid, block, 0, reinterpret_cast<const platform::CUDADeviceContext&>(
diff --git a/paddle/operators/softmax_with_cross_entropy_op.h b/paddle/operators/softmax_with_cross_entropy_op.h
index bf792c1f59e2e43a98c93bddbc2aa63d646dee6f..01027cf63fc1010a226346609d583af0b400ecbb 100644
--- a/paddle/operators/softmax_with_cross_entropy_op.h
+++ b/paddle/operators/softmax_with_cross_entropy_op.h
@@ -27,7 +27,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
-class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     PADDLE_ENFORCE(platform::is_cpu_place(context.GetPlace()),
@@ -40,14 +40,16 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
 
-    math::SoftmaxFunctor<platform::CPUPlace, T>()(context, logits, softmax);
+    math::SoftmaxFunctor<platform::CPUPlace, T>()(context.device_context(),
+                                                  logits, softmax);
     math::CrossEntropyFunctor<platform::CPUPlace, T>()(
-        context, loss, softmax, labels, context.Attr<bool>("softLabel"));
+        context.device_context(), loss, softmax, labels,
+        context.Attr<bool>("soft_label"));
   }
 };
 
 template <typename T>
-class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel {
+class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* out_grad =
@@ -55,10 +57,10 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel {
     const Tensor* labels = context.Input<Tensor>("Label");
     Tensor* logit_grad =
         context.Output<Tensor>(framework::GradVarName("Logits"));
-    logit_grad->ShareDataWith<T>(*context.Input<Tensor>("Softmax"));
+    logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
 
     const int class_num = logit_grad->dims()[1];
-    if (context.Attr<bool>("softLabel")) {
+    if (context.Attr<bool>("soft_label")) {
       auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
       auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
       auto lbl_mat = EigenMatrix<T>::From(*labels);
diff --git a/paddle/operators/split_op.cc b/paddle/operators/split_op.cc
index 5f4b5539affef6fe1d3c4d15fff77d983b5e107f..4a6c50f7970208b0f4141aa057bd0db715fb6152 100644
--- a/paddle/operators/split_op.cc
+++ b/paddle/operators/split_op.cc
@@ -23,8 +23,7 @@ class SplitOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SplitOp should not be null.");
     PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL,
diff --git a/paddle/operators/split_op.h b/paddle/operators/split_op.h
index 8ab8e0ee4fea621b34da73507c53846100d61a17..fa26e5f677b18c84b45dd583004d02cab4c1d375 100644
--- a/paddle/operators/split_op.h
+++ b/paddle/operators/split_op.h
@@ -22,7 +22,7 @@ namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class SplitOpKernel : public framework::OpKernel {
+class SplitOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
diff --git a/paddle/operators/squared_l2_distance_op.cc b/paddle/operators/squared_l2_distance_op.cc
index 5a0cb596008a98aacf5e7b5ff70307ea1b8508e6..e360c19b47eae7fc32ae66f9e4e3873bff211b04 100644
--- a/paddle/operators/squared_l2_distance_op.cc
+++ b/paddle/operators/squared_l2_distance_op.cc
@@ -21,8 +21,7 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SquaredL2DistanceOp should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Y"),
@@ -85,8 +84,7 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Gradient of Out should not be null");
     auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
diff --git a/paddle/operators/squared_l2_distance_op.h b/paddle/operators/squared_l2_distance_op.h
index 097ac04fc09a10b3b624f491a847e281e41a802c..259ef4029646914f83a112b9c6d7fdf8401483f6 100644
--- a/paddle/operators/squared_l2_distance_op.h
+++ b/paddle/operators/squared_l2_distance_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SquaredL2DistanceKernel : public framework::OpKernel {
+class SquaredL2DistanceKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("X");
@@ -68,7 +68,7 @@ class SquaredL2DistanceKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class SquaredL2DistanceGradKernel : public framework::OpKernel {
+class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* in0 = context.Input<Tensor>("sub_result");
diff --git a/paddle/operators/strided_memcpy_test.cc b/paddle/operators/strided_memcpy_test.cc
index 05882a88738cfc9cc23480efe0afe504008377ca..68f064eaee5851333ddf9767b7138da83a28503d 100644
--- a/paddle/operators/strided_memcpy_test.cc
+++ b/paddle/operators/strided_memcpy_test.cc
@@ -72,7 +72,7 @@ TEST(StridedMemcpy, CPUConcat) {
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(StridedMemcpy, GPUCrop) {
   // clang-format off
   int src[] = {
@@ -157,4 +157,4 @@ TEST(StridedMemcpy, GPUConcat) {
 
 #endif
 }  // namespace operators
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/operators/sum_op.cc b/paddle/operators/sum_op.cc
index 8f62a9f4db8d39edc11949df513aebf4fa257d45..5214a8413e8f7b957015985496fe8fb4b4f8b323 100644
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -11,6 +11,7 @@ limitations under the License. */
 
 #include "paddle/operators/sum_op.h"
 #include <vector>
+#include "paddle/operators/net_op.h"
 
 namespace paddle {
 namespace operators {
@@ -20,19 +21,19 @@ class SumOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
     auto x_dims = ctx->GetInputsDim("X");
-    PADDLE_ENFORCE(!x_dims.empty(), "Input(X) of SumOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SumOp should not be null.");
 
-    auto in_dim = x_dims[0];
     size_t N = x_dims.size();
     PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
+
+    auto in_dim = x_dims[0];
     for (size_t i = 1; i < N; i++) {
       auto dim = x_dims[i];
-      PADDLE_ENFORCE(in_dim == dim, "Input tensors must have same shape");
+      PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape");
     }
     ctx->SetOutputDim("Out", in_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
@@ -54,21 +55,26 @@ or not. But the output only shares the LoD with the first input.
   }
 };
 
-class SumGradOp : public framework::OperatorWithKernel {
+class SumGradMaker : public framework::GradOpDescMakerBase {
  public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
+  using framework::GradOpDescMakerBase::GradOpDescMakerBase;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
-    auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto x_grad_names = ctx->Outputs(framework::GradVarName("X"));
-    size_t x_length = x_grad_names.size();
-    std::vector<framework::DDim> x_grad_dims;
-    x_grad_dims.reserve(x_length);
-    for (size_t i = 0; i < x_length; ++i) {
-      x_grad_dims.push_back(out_grad_dims);
-    }
-    ctx->SetOutputsDim(framework::GradVarName("X"), x_grad_dims);
+  // One "scale" op (attr scale = 1.0f) per X gradient, each fed Out's gradient.
+  std::vector<std::unique_ptr<framework::OpDescBind>> operator()()
+      const override {
+    auto in_grads = InputGrad("X");
+    auto out_grad = OutputGrad("Out");
+    std::vector<std::unique_ptr<framework::OpDescBind>> ops;
+    ops.reserve(in_grads.size());
+    for (const std::string& in_grad : in_grads) {
+      ops.emplace_back(new framework::OpDescBind());
+      framework::OpDescBind* op = ops.back().get();
+      op->SetType("scale");
+      op->SetInput("X", out_grad);
+      op->SetOutput("Out", {in_grad});
+      op->SetAttr("scale", 1.0f);
+    }
+    return ops;
   }
 };
 
@@ -76,7 +82,6 @@ class SumGradOp : public framework::OperatorWithKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(sum, ops::SumOp, ops::SumOpMaker, sum_grad, ops::SumGradOp);
+
+REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradMaker);
 REGISTER_OP_CPU_KERNEL(sum, ops::SumKernel<paddle::platform::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/sum_op.cu b/paddle/operators/sum_op.cu
index a465cf3659ba7c51338abadfc62962fb6755a39d..b1896d3cd87f47bd2573287ee37b1b72ae9ec6e8 100644
--- a/paddle/operators/sum_op.cu
+++ b/paddle/operators/sum_op.cu
@@ -14,5 +14,3 @@ limitations under the License. */
 
 namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(sum, ops::SumKernel<paddle::platform::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sum_grad,
-                       ops::SumGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h
index 0b1e9ebaa38d455fb5e3ce8c1a39cbbcdad9a940..91e5da8b40d452db8715990cdbe2731b3aea44b9 100644
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -22,7 +22,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class SumKernel : public framework::OpKernel {
+class SumKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto ins = context.MultiInput<Tensor>("X");
@@ -42,24 +42,5 @@ class SumKernel : public framework::OpKernel {
   }
 };
 
-template <typename Place, typename T>
-class SumGradKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    auto* input = context.Input<Tensor>(framework::GradVarName("Out"));
-    auto outs = context.MultiOutput<Tensor>(framework::GradVarName("X"));
-    for (auto out : outs) {
-      out->mutable_data<T>(context.GetPlace());
-    }
-
-    auto place = context.GetEigenDevice<Place>();
-    auto in = EigenVector<T>::Flatten(*input);
-    for (auto out : outs) {
-      auto result = EigenVector<T>::Flatten(*out);
-      result.device(place) = in;
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/top_k_op.cc b/paddle/operators/top_k_op.cc
index 5f22bf1df8720b60aba7cd75896d88cd1ad77635..d5c2c91a5fb0f639ea84d13e27de8271218da54f 100644
--- a/paddle/operators/top_k_op.cc
+++ b/paddle/operators/top_k_op.cc
@@ -21,8 +21,7 @@ class TopkOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase *ctx) const override {
+  void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of TopkOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
diff --git a/paddle/operators/top_k_op.cu b/paddle/operators/top_k_op.cu
index 53fe505b77bfac8a33803f082f8e935d3ed403b6..7be6932f1e301d06e0e232367a38bfa673ff45be 100644
--- a/paddle/operators/top_k_op.cu
+++ b/paddle/operators/top_k_op.cu
@@ -279,7 +279,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int* indices,
 }
 
 template <typename T>
-class TopkOpCUDAKernel : public framework::OpKernel {
+class TopkOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()),
diff --git a/paddle/operators/top_k_op.h b/paddle/operators/top_k_op.h
index ef66acc1d569282a42be64b7a5e90f3fbdb20690..4b248faa120bcfd20e70d288cce2d485d3e6371e 100644
--- a/paddle/operators/top_k_op.h
+++ b/paddle/operators/top_k_op.h
@@ -28,7 +28,7 @@ template <typename T, int MajorType = Eigen::RowMajor,
 using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename Place, typename T>
-class TopkKernel : public framework::OpKernel {
+class TopkKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     // Get the top k elements of each row of input tensor
diff --git a/paddle/operators/transpose_op.cc b/paddle/operators/transpose_op.cc
index 0672f9342dac00ecc3f358450a9a203327cbb51f..d785e57c830439ad80005d9a3d4bb77faf1ae1b9 100644
--- a/paddle/operators/transpose_op.cc
+++ b/paddle/operators/transpose_op.cc
@@ -23,8 +23,7 @@ class TransposeOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
     auto x_dims = ctx->GetInputDim("X");
@@ -92,8 +91,7 @@ class TransposeOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
     PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                    "Input(Out@GRAD) should not be null");
diff --git a/paddle/operators/transpose_op.h b/paddle/operators/transpose_op.h
index ea299dce72ad340b0a65ee50582dc156b5ad7abb..aaa3f47ab5545accd4d1108e0ad6f5a3062186d0 100644
--- a/paddle/operators/transpose_op.h
+++ b/paddle/operators/transpose_op.h
@@ -38,7 +38,7 @@ void EigenTranspose(const framework::ExecutionContext& context,
 }
 
 template <typename Place, typename T>
-class TransposeKernel : public framework::OpKernel {
+class TransposeKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* x = context.Input<framework::Tensor>("X");
@@ -73,7 +73,7 @@ class TransposeKernel : public framework::OpKernel {
 };
 
 template <typename Place, typename T>
-class TransposeGradKernel : public framework::OpKernel {
+class TransposeGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* out_grad =
diff --git a/paddle/operators/uniform_random_op.cc b/paddle/operators/uniform_random_op.cc
index 2771df56086ff261728af84edcdf01cda3e45e9f..39b53948e3cc58ff1d0ab481143b066b1a2fae16 100644
--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
@@ -21,7 +21,7 @@ namespace operators {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename T>
-class CPUUniformRandomKernel : public framework::OpKernel {
+class CPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* tensor = ctx.Output<framework::Tensor>("Out");
@@ -46,22 +46,27 @@ class UniformRandomOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
- protected:
-  void InferShape(framework::InferShapeContextBase* ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of UniformRandomOp should not be null.");
 
     PADDLE_ENFORCE(
         ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
         "uniform_random's min must less then max");
-    auto dims = Attr<std::vector<int>>("dims");
+    auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
     std::vector<int64_t> temp;
-    temp.reserve(dims.size());
-    for (auto dim : dims) {
+    temp.reserve(shape.size());
+    for (auto dim : shape) {
       temp.push_back(static_cast<int64_t>(dim));
     }
     ctx->SetOutputDim("Out", framework::make_ddim(temp));
   }
+
+ protected:
+  framework::DataType IndicateDataType(
+      const framework::ExecutionContext& ctx) const override {
+    return static_cast<framework::DataType>(ctx.Attr<int>("data_type"));
+  }
 };
 
 class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -73,13 +78,15 @@ class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
     AddComment(R"DOC(Uniform random operator.
 Used to initialize tensor with uniform random generator.
 )DOC");
-    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
+    AddAttr<std::vector<int>>("shape", "the dimension of random tensor");
     AddAttr<float>("min", "Minimum value of uniform random").SetDefault(-1.0f);
     AddAttr<float>("max", "Maximun value of uniform random").SetDefault(1.0f);
     AddAttr<int>("seed",
                  "Random seed of uniform random. "
                  "0 means generate a seed by system")
         .SetDefault(0);
+    AddAttr<int>("data_type", "output tensor data type")
+        .SetDefault(framework::DataType::FP32);
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index 6614b53b3f990d10c82633f3c1f079acea0cd827..5612ce9eb1c644d6271b4a9bb949f685848e05c0 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -40,7 +40,7 @@ struct UniformGenerator {
 // Use std::random and thrust::random(thrust is a std library in CUDA) to
 // implement uniform random.
 template <typename T>
-class GPUUniformRandomKernel : public framework::OpKernel {
+class GPUUniformRandomKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     auto* tensor = context.Output<framework::Tensor>("Out");
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index 6eec5d846fa5ef6b25e7646200dad1d452dda806..34913c405075ed72af30ed056f74e8b4d7482488 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -25,19 +25,17 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) {
   }
 }
 
-const char* AdadeltaOptimizer::SerializeState(int* state_len) {
+std::string AdadeltaOptimizer::SerializeState() {
   AdadeltaOptimizerState state;
   state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  std::string lr_str = this->lr_policy_->SerializeState();
   state.mutable_lr_state()->ParseFromString(lr_str);
 
   TensorToProto(*parameter_, state.mutable_parameter());
   TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
   TensorToProto(*accum_delta_, state.mutable_accum_delta());
   TensorToProto(*update_delta_, state.mutable_update_delta());
-  auto str = state.SerializeAsString();
-  *state_len += str.size();
-  return str.c_str();
+  return state.SerializeAsString();
 }
 
 void AdadeltaOptimizer::DeserializeState(const std::string& str) {
diff --git a/paddle/optimizer/adadelta_optimizer.h b/paddle/optimizer/adadelta_optimizer.h
index 1d5eab097f57d049855dd171a1aa6f74c48ae0e7..bc634ee46d60abc9ffc4a31abac5c2f8edaf7aba 100644
--- a/paddle/optimizer/adadelta_optimizer.h
+++ b/paddle/optimizer/adadelta_optimizer.h
@@ -23,7 +23,7 @@ public:
     if (update_delta_) delete update_delta_;
   }
   void Update(const Tensor *gradient);
-  const char *SerializeState(int *state_len);
+  std::string SerializeState();
   void DeserializeState(const std::string &state);
 
 private:
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index 5b92610ac547ee11cedf2e49e4d7f1db4b2da646..d915ffb8705eaa96bc96b8071a2c534d4d472273 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -17,17 +17,15 @@ void AdagradOptimizer::Update(const Tensor* gradient) {
                 learning_rate * decay_ * param[i];
   }
 }
-const char* AdagradOptimizer::SerializeState(int* state_len) {
+std::string AdagradOptimizer::SerializeState() {
   AdagradOptimizerState state;
   state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  std::string lr_str = this->lr_policy_->SerializeState();
   state.mutable_lr_state()->ParseFromString(lr_str);
 
   TensorToProto(*parameter_, state.mutable_parameter());
   TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
-  auto str = state.SerializeAsString();
-  *state_len += str.size();
-  return str.c_str();
+  return state.SerializeAsString();
 }
 
 void AdagradOptimizer::DeserializeState(const std::string& str) {
diff --git a/paddle/optimizer/adagrad_optimizer.h b/paddle/optimizer/adagrad_optimizer.h
index 15d0a965ad0c6967e73b14b465168fa66eb8fba3..b2935f8aff87f710f508c5c5757dd36526ca63f9 100644
--- a/paddle/optimizer/adagrad_optimizer.h
+++ b/paddle/optimizer/adagrad_optimizer.h
@@ -19,7 +19,7 @@ public:
     if (accum_gradient_) delete accum_gradient_;
   }
   void Update(const Tensor *gradient);
-  const char *SerializeState(int *state_len);
+  std::string SerializeState();
   void DeserializeState(const std::string &state);
 
 private:
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index 1ebb6b1e0f7b4edcbac1b28319fd4de576f85f6a..18e5896a22dc8a3c6292293fffc36ca9e3737b4c 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -22,18 +22,16 @@ void AdamOptimizer::Update(const Tensor *gradient) {
   }
 }
 
-const char *AdamOptimizer::SerializeState(int *state_len) {
+std::string AdamOptimizer::SerializeState() {
   AdamOptimizerState state;
-  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  std::string lr_str = this->lr_policy_->SerializeState();
   state.mutable_lr_state()->ParseFromString(lr_str);
   state.set_num_sample_passed(num_sample_passed_);
 
   TensorToProto(*parameter_, state.mutable_parameter());
   TensorToProto(*momentums_, state.mutable_momentums());
   TensorToProto(*velocitys_, state.mutable_velocitys());
-  auto str = state.SerializeAsString();
-  *state_len += str.size();
-  return str.c_str();
+  return state.SerializeAsString();
 }
 
 void AdamOptimizer::DeserializeState(const std::string &str) {
diff --git a/paddle/optimizer/adam_optimizer.h b/paddle/optimizer/adam_optimizer.h
index 0ea4c8bb8470504282b4d6c12039791ce896e401..d25cdc0731f65e9875d2fbf67783cce62d88af60 100644
--- a/paddle/optimizer/adam_optimizer.h
+++ b/paddle/optimizer/adam_optimizer.h
@@ -25,7 +25,7 @@ public:
     if (velocitys_) delete velocitys_;
   }
   void Update(const Tensor *gradient);
-  const char *SerializeState(int *state_len);
+  std::string SerializeState();
   void DeserializeState(const std::string &state);
 
 private:
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index 036c376e10f465c2866a230caf9224f4af5478bc..bbb1ee48214cecdc6b6cd2a400cc9d12d5e8b64a 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -10,7 +10,7 @@ class LrPolicy {
 public:
   virtual ~LrPolicy() {}
   virtual double LearningRate(const uint64_t num_sample_passed) = 0;
-  virtual const char *SerializeState(int *state_len) = 0;
+  virtual std::string SerializeState() = 0;
   virtual void DeserializeState(const std::string &state) = 0;
 };
 
@@ -21,12 +21,10 @@ public:
   double LearningRate(const uint64_t num_sample_passed) {
     return learning_rate_;
   }
-  const char *SerializeState(int *state_len) {
+  std::string SerializeState() {
     LrPolicyState state;
     state.set_learning_rate(learning_rate_);
-    auto str = state.SerializeAsString();
-    *state_len = str.size();
-    return str.c_str();
+    return state.SerializeAsString();
   }
   void DeserializeState(const std::string &str) {
     LrPolicyState state;
@@ -46,14 +44,12 @@ public:
     return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
                     lr_decay_b_);
   }
-  const char *SerializeState(int *state_len) {
+  std::string SerializeState() {
     LrPolicyState state;
     state.set_learning_rate(learning_rate_);
     state.set_lr_decay_a(lr_decay_a_);
     state.set_lr_decay_b(lr_decay_b_);
-    auto str = state.SerializeAsString();
-    *state_len = str.size();
-    return str.c_str();
+    return state.SerializeAsString();
   }
   void DeserializeState(const std::string &str) {
     LrPolicyState state;
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index eb7125adee769c97e16986cabf06ea389bf4c143..a2af139d012433214b825bd68289708098b76da8 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -1,4 +1,7 @@
 #include "optimizer.h"
+#include <glog/logging.h>
+#include <cstdlib>
+#include <cstring>
 #include <string>
 
 #include "parameter_optimizer.h"
@@ -78,7 +81,22 @@ int paddle_optimizer_get_weights(paddle_optimizer* o, void** param_buffer) {
 }
 
 int paddle_optimizer_get_state(paddle_optimizer* o, const char** state) {
-  int state_len = 0;
-  *state = o->impl->SerializeState(&state_len);
+  // Serializes the optimizer state into a freshly malloc'ed buffer stored in
+  // *state and returns its length in bytes. The buffer is not NUL-terminated
+  // and is never freed here, so the caller is expected to free() it.
+  // *state is set to nullptr when the serialized state is empty.
+  const std::string s = o->impl->SerializeState();
+  const int state_len = static_cast<int>(s.size());
+
+  *state = nullptr;
+  if (state_len > 0) {
+    char* buf = static_cast<char*>(std::malloc(state_len));
+    // Abort with a diagnostic instead of letting memcpy write through NULL.
+    CHECK(buf != nullptr) << "malloc failed for " << state_len
+                          << " bytes of optimizer state";
+    std::memcpy(buf, s.data(), state_len);
+    *state = buf;
+  }
+
   return state_len;
 }
diff --git a/paddle/optimizer/parameter_optimizer.cc b/paddle/optimizer/parameter_optimizer.cc
index f6218037925649e741d17f49af972ce2d50f8d3d..db0714635f9366b0404019688daf4708b4a0052f 100644
--- a/paddle/optimizer/parameter_optimizer.cc
+++ b/paddle/optimizer/parameter_optimizer.cc
@@ -32,6 +32,7 @@ ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
       Tensor *parameter,
       const OptimizerConfig &config) -> ParameterOptimizer * {
     if (config.optimizer() == OptimizerConfig::SGD) {
+      LOG(INFO) << "creating SGD optimizer";
       return new SGDOptimizer(parameter,
                               lr,
                               config.sgd().momentum(),
@@ -39,6 +40,7 @@ ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
                               config.sgd().nesterov());
     }
     if (config.optimizer() == OptimizerConfig::Adadelta) {
+      LOG(INFO) << "creating Adadelta optimizer";
       return new AdadeltaOptimizer(parameter,
                                    lr,
                                    config.adadelta().rho(),
@@ -46,10 +48,12 @@ ParameterOptimizer *ParameterOptimizer::Create(const std::string &config_proto,
                                    config.adadelta().decay());
     }
     if (config.optimizer() == OptimizerConfig::Adagrad) {
+      LOG(INFO) << "creating Adagrad optimizer";
       return new AdagradOptimizer(
           parameter, lr, config.adagrad().epsilon(), config.adagrad().decay());
     }
     if (config.optimizer() == OptimizerConfig::Adam) {
+      LOG(INFO) << "creating Adam optimizer";
       return new AdamOptimizer(parameter,
                                lr,
                                config.adam().beta_1(),
diff --git a/paddle/optimizer/parameter_optimizer.h b/paddle/optimizer/parameter_optimizer.h
index d89c9abb791f947172078d4dce5b1c366852591b..8319f84e1b820adf5cc0006045f2e13dffa91797 100644
--- a/paddle/optimizer/parameter_optimizer.h
+++ b/paddle/optimizer/parameter_optimizer.h
@@ -28,7 +28,7 @@ public:
                                     Tensor *parameter);
   virtual void Update(const Tensor *gradient) = 0;
   virtual float *get_weight(int *param_size) const;
-  virtual const char *SerializeState(int *state_len) = 0;
+  virtual std::string SerializeState() = 0;
   virtual void DeserializeState(const std::string &state) = 0;
 
 protected:
diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp
index edf4ae37a9beee2911d23dd1ab23e67a18065b1b..c88fa11748716693355042d1784b33d7cfb616f1 100644
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
@@ -85,6 +85,7 @@ public:
     for (size_t i = 0; i < opts_.size(); ++i) {
       int s = 0;
       float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(s, kSize);
       for (size_t j = 0; j < kSize; ++j) {
         EXPECT_EQ(newp[j], (*p)[j]);
       }
@@ -99,10 +100,20 @@ public:
   }
 
   void TestCheckPoint() {
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
     for (size_t i = 0; i < opts_.size(); ++i) {
-      int state_len = 0;
-      std::string state = opts_[i]->SerializeState(&state_len);
+      auto state = opts_[i]->SerializeState();
+      opts_[i]->DeserializeState(state);
+      auto state1 = opts_[i]->SerializeState();
       opts_[i]->DeserializeState(state);
+      EXPECT_EQ(state, state1);
+
+      int s = 0;
+      float* newp = (float*)opts_[i]->get_weight(&s);
+      EXPECT_EQ(s, kSize);
+      for (size_t j = 0; j < kSize; ++j) {
+        EXPECT_EQ(newp[j], (*p)[j]);
+      }
     }
   }
 
diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp
index e4d97cbdba545c4ba5adf5b30efd3fc9f3f744ee..4c416f55ee0bd70f9ec6e288b08a5399d8b2bf39 100644
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
@@ -21,7 +21,22 @@ TEST(TensorToProto, Case1) {
   paddle::optimizer::Tensor t(3), t1(3);
   for (size_t i = 0; i < t.size(); ++i) {
     t[i] = i;
-    t1[i] = 0;
+    t1[i] = 10;
+  }
+
+  paddle::TensorProto proto;
+  paddle::optimizer::TensorToProto(t, &proto);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
+  for (size_t i = 0; i < t1.size(); ++i) {
+    EXPECT_EQ(t1[i], t[i]);
+  }
+}
+
+TEST(TensorToProto, Case2) {
+  paddle::optimizer::Tensor t(1), t1(1);
+  for (size_t i = 0; i < t.size(); ++i) {
+    t[i] = i;
+    t1[i] = 10;
   }
 
   paddle::TensorProto proto;
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc
index 15418faa840c19e776f293700ee886991754fb04..bf2540ecb092437e57a5970264559dc3c6ab4167 100644
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -27,16 +27,14 @@ void SGDOptimizer::Update(const Tensor *gradient) {
   }
 }
 
-const char *SGDOptimizer::SerializeState(int *state_len) {
+std::string SGDOptimizer::SerializeState() {
   SGDOptimizerState state;
   state.set_num_sample_passed(num_sample_passed_);
-  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  std::string lr_str = this->lr_policy_->SerializeState();
   state.mutable_lr_state()->ParseFromString(lr_str);
   TensorToProto(*parameter_, state.mutable_parameter());
   if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
-  auto str = state.SerializeAsString();
-  *state_len += str.size();
-  return str.c_str();
+  return state.SerializeAsString();
 }
 
 void SGDOptimizer::DeserializeState(const std::string &str) {
diff --git a/paddle/optimizer/sgd_optimizer.h b/paddle/optimizer/sgd_optimizer.h
index b74a902e1aa40a7831b36ab826d72372a3588bcf..6e1a0f0d3f9ecfeb51ccb355d65985a2e6388fb0 100644
--- a/paddle/optimizer/sgd_optimizer.h
+++ b/paddle/optimizer/sgd_optimizer.h
@@ -23,7 +23,7 @@ public:
     if (momentums_) delete momentums_;
   }
   void Update(const Tensor* gradient);
-  const char* SerializeState(int* state_len);
+  std::string SerializeState();
   void DeserializeState(const std::string& state);
 
 private:
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 895e8d6a63d1fad0ee7a6f5647402435d418b2f1..f157188a4f736319ea187052b90a17f8be9e9edb 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -265,6 +265,10 @@ public:
     addParameterType(PARAMETER_SECOND_MOMENTUM);
   }
 
+  virtual void startBatch(int64_t numSamplesProcessed) {
+    learningRate_ = calcLearningRate(numSamplesProcessed, pass_);
+  }
+
   virtual void finishBatch() { ++step_; }
 
   virtual void update(const VectorPtr vecs[],
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index daf519b91d623d4369774dc4e37dcb7b1733666b..eb850b658583f2256629d63fdb64248dbf249937 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -25,3 +25,4 @@ nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_
 
 nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
 nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
+nv_test(nccl_test SRCS nccl_test.cu DEPS dynload_cuda gpu_info device_context)
diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h
index 2841d2a2dbec5c17ef098a06c976ca01247820f5..0c5719ef5162546578253e383209b1893c0cd71f 100644
--- a/paddle/platform/cudnn_helper.h
+++ b/paddle/platform/cudnn_helper.h
@@ -71,23 +71,32 @@ class ScopedTensorDescriptor {
 
   inline cudnnTensorDescriptor_t descriptor(const cudnnTensorFormat_t format,
                                             const cudnnDataType_t type,
-                                            const std::vector<int>& dims) {
-    // the format is not used now, but it maybe useful feature
+                                            const std::vector<int>& dims,
+                                            const int groups = 1) {
+    // NOTE: `format` is accepted but not used by this overload yet.
     std::vector<int> strides(dims.size());
     strides[dims.size() - 1] = 1;
     for (int i = dims.size() - 2; i >= 0; i--) {
       strides[i] = dims[i + 1] * strides[i + 1];
     }
+    // If groups > 1, dim 1 is divided by groups (strides keep the full dims).
+    // FIXME(typhoonzero): Assume using NCHW order, i.e. dim 1 is channels.
+    std::vector<int> dims_with_group(dims.begin(), dims.end());  // copy
+    if (groups > 1) {
+      dims_with_group[1] = dims_with_group[1] / groups;
+    }
     PADDLE_ENFORCE(dynload::cudnnSetTensorNdDescriptor(
-        desc_, type, dims.size(), dims.data(), strides.data()));
+        desc_, type, dims_with_group.size(), dims_with_group.data(),
+        strides.data()));
     return desc_;
   }
 
   template <typename T>
   inline cudnnTensorDescriptor_t descriptor(const DataLayout& order,
-                                            const std::vector<int>& dims) {
-    return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type,
-                      dims);
+                                            const std::vector<int>& dims,
+                                            const int groups = 1) {
+    return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type, dims,
+                      groups);
   }
 
  private:
@@ -106,18 +115,29 @@ class ScopedFilterDescriptor {
 
   inline cudnnFilterDescriptor_t descriptor(const cudnnTensorFormat_t format,
                                             const cudnnDataType_t type,
-                                            const std::vector<int>& kernel) {
-    // filter layout: output input spatial_dim_y spatial_dim_x
+                                            const std::vector<int>& kernel,
+                                            const int groups = 1) {
+    // Filter layout is MCHW: M is the number of output image channels,
+    // C is the number of input image channels, and H and W are the
+    // height and width of the filter.
+    std::vector<int> kernel_with_group(kernel.begin(), kernel.end());
+    if (groups > 1) {
+      // Describe a single group: M /= groups.
+      kernel_with_group[0] /= groups;
+      // NOTE: C is not divided here; it is asserted elsewhere to be C/groups.
+    }
     PADDLE_ENFORCE(dynload::cudnnSetFilterNdDescriptor(
-        desc_, type, format, kernel.size(), kernel.data()));
+        desc_, type, format, kernel_with_group.size(),
+        kernel_with_group.data()));
     return desc_;
   }
 
   template <typename T>
   inline cudnnFilterDescriptor_t descriptor(const DataLayout& order,
-                                            const std::vector<int>& kernel) {
+                                            const std::vector<int>& kernel,
+                                            const int groups = 1) {
     return descriptor(GetCudnnTensorFormat(order), CudnnDataType<T>::type,
-                      kernel);
+                      kernel, groups);
   }
 
  private:
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 93b472b41c8a4c3a2bfada9d4fbf0e9e1b0cc736..36450e926891342f37424447703781a33c1190ae 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -16,8 +16,8 @@ namespace paddle {
 namespace platform {
 
 template <>
-Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
-    const {
+Eigen::DefaultDevice* DeviceContext::GetEigenDevice<
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
   return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
 }
 
@@ -35,7 +35,13 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
+
+template <>
+Eigen::GpuDevice*
+DeviceContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
+}
 
 class EigenCudaStreamDevice : public Eigen::StreamInterface {
  public:
@@ -90,11 +96,6 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
   mutable unsigned int* semaphore_;
 };
 
-template <>
-Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
-  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
-}
-
 CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
   SetDeviceId(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
@@ -135,7 +136,7 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
 
 cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
-#endif  // PADDLE_ONLY_CPU
+#endif
 
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index f6a39a8e26c301296aac0af7f4e8b2c6c97ece24..ef5f19214d9ccb23b9c946bee28cb764122bd7cd 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/gpu_info.h"
@@ -27,13 +27,23 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
+template <typename T>
+struct EigenDeviceConverter;
+
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
   virtual Place GetPlace() const = 0;
 
-  template <typename DeviceType>
-  DeviceType* get_eigen_device() const;
+  template <typename PlaceType,
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType* GetEigenDevice() const;
 
   virtual void Wait() const {}
 };
@@ -51,7 +61,12 @@ class CPUDeviceContext : public DeviceContext {
   std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
 };
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+
 class EigenCudaStreamDevice;
 
 class CUDADeviceContext : public DeviceContext {
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index 5883a55272f0f24c94d48bc43c62ddb7bef15465..8bf5174c4a5579f6f5602dd38e5a87ed3ef444a7 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -20,11 +20,11 @@ TEST(Device, Init) {
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::GPUPlace;
 
-  int count = paddle::platform::GetDeviceCount();
+  int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
     DeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
     Eigen::GpuDevice* gpu_device =
-        device_context->template get_eigen_device<Eigen::GpuDevice>();
+        device_context->template GetEigenDevice<GPUPlace>();
     ASSERT_NE(nullptr, gpu_device);
     delete device_context;
   }
@@ -34,7 +34,7 @@ TEST(Device, CUDADeviceContext) {
   using paddle::platform::CUDADeviceContext;
   using paddle::platform::GPUPlace;
 
-  int count = paddle::platform::GetDeviceCount();
+  int count = paddle::platform::GetCUDADeviceCount();
   for (int i = 0; i < count; i++) {
     CUDADeviceContext* device_context = new CUDADeviceContext(GPUPlace(i));
     Eigen::GpuDevice* gpu_device = device_context->eigen_device();
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index ceb66f84b6b01892cbaf61c79a47ae60d2589164..bb3fec1be9e811c26cc6851314e960e96fc366b3 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,2 +1,3 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
-nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc DEPS dynamic_loader)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
+        DEPS dynamic_loader nccl)
diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h
index 9d8343c0b5e200b390ccda760f09816959952e9d..6b64539b0a9a4d535a53447fbcc0e458f3ac9129 100644
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -77,6 +77,10 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemmBatched);            \
   __macro(cublasCgemmBatched);            \
   __macro(cublasZgemmBatched);            \
+  __macro(cublasSgemmStridedBatched);     \
+  __macro(cublasDgemmStridedBatched);     \
+  __macro(cublasCgemmStridedBatched);     \
+  __macro(cublasZgemmStridedBatched);     \
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc
index ae9a0a982c73de05821579d22b7f9ad99f24a92b..6feba42c0d9d618d27da12e6a6752058b296995e 100644
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
@@ -35,6 +35,11 @@ DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so.");
 
+DEFINE_string(nccl_dir, "",
+              "Specify path for loading the nccl library (libnccl.so, or "
+              "libnccl.dylib on macOS). If default, dlopen will search nccl "
+              "from LD_LIBRARY_PATH");
+
 namespace paddle {
 namespace platform {
 namespace dynload {
@@ -157,6 +162,14 @@ void GetLapackDsoHandle(void** dso_handle) {
 #endif
 }
 
+void GetNCCLDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", dso_handle);
+#endif
+}
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/dynamic_loader.h b/paddle/platform/dynload/dynamic_loader.h
index a99b05443feb909f10b2c56f4d8bdf3c6fa11e3f..c0e5452e5ae723ec314ebafde86a6ff63980be00 100644
--- a/paddle/platform/dynload/dynamic_loader.h
+++ b/paddle/platform/dynload/dynamic_loader.h
@@ -58,6 +58,14 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  */
 void GetLapackDsoHandle(void** dso_handle);
 
+/**
+ * @brief    load the DSO of NVIDIA NCCL (libnccl)
+ *
+ * @param    **dso_handle   output; receives the handle of the loaded DSO
+ *
+ */
+void GetNCCLDsoHandle(void** dso_handle);
+
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.cc b/paddle/platform/dynload/nccl.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f92b8d94d56047b7d3fb43b15e3c06575c8d57b
--- /dev/null
+++ b/paddle/platform/dynload/nccl.cc
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/platform/dynload/nccl.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag nccl_dso_flag;
+void *nccl_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+NCCL_RAND_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/nccl.h b/paddle/platform/dynload/nccl.h
new file mode 100644
index 0000000000000000000000000000000000000000..0618c7414fd1235e81ee9d92a3a07b53d6ad6ebc
--- /dev/null
+++ b/paddle/platform/dynload/nccl.h
@@ -0,0 +1,83 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <dlfcn.h>
+#include <nccl.h>
+#include <mutex>
+#include "paddle/platform/dynload/dynamic_loader.h"
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+// State for the one-time lookup of the NCCL library handle; both variables
+// are defined in nccl.cc.
+extern std::once_flag nccl_dso_flag;
+extern void* nccl_dso_handle;
+
+// DECLARE_DYNAMIC_LOAD_NCCL_WRAP(name) declares a functor type DynLoad__name
+// and an extern instance called `name` inside this namespace, so that callers
+// write dynload::ncclXxx(...) instead of calling the NCCL routine directly.
+#ifdef PADDLE_USE_DSO
+// DSO build: obtain the library handle once via GetNCCLDsoHandle, then resolve
+// the symbol with dlsym on every call. NOTE(review): dlsym is not checked.
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                    \
+  struct DynLoad__##__name {                                      \
+    template <typename... Args>                                   \
+    auto operator()(Args... args) -> decltype(__name(args...)) {  \
+      using nccl_func = decltype(__name(args...)) (*)(Args...);   \
+      std::call_once(nccl_dso_flag,                               \
+                     paddle::platform::dynload::GetNCCLDsoHandle, \
+                     &nccl_dso_handle);                           \
+      void* p_##__name = dlsym(nccl_dso_handle, #__name);         \
+      return reinterpret_cast<nccl_func>(p_##__name)(args...);    \
+    }                                                             \
+  };                                                              \
+  extern DynLoad__##__name __name
+#else
+// Direct-link build: forward straight to the NCCL routine. The return type is
+// deduced per routine because not all of them return ncclResult_t (e.g.
+// ncclGetErrorString returns a C string).
+#define DECLARE_DYNAMIC_LOAD_NCCL_WRAP(__name)                    \
+  struct DynLoad__##__name {                                      \
+    template <typename... Args>                                   \
+    auto operator()(Args... args) -> decltype(__name(args...)) {  \
+      return __name(args...);                                     \
+    }                                                             \
+  };                                                              \
+  extern DynLoad__##__name __name
+#endif
+
+// The NCCL routines that get a dynload wrapper; applies __macro to each name.
+#define NCCL_RAND_ROUTINE_EACH(__macro) \
+  __macro(ncclCommInitAll);             \
+  __macro(ncclGetUniqueId);             \
+  __macro(ncclCommInitRank);            \
+  __macro(ncclCommDestroy);             \
+  __macro(ncclCommCount);               \
+  __macro(ncclCommCuDevice);            \
+  __macro(ncclCommUserRank);            \
+  __macro(ncclAllReduce);               \
+  __macro(ncclBcast);                   \
+  __macro(ncclAllGather);               \
+  __macro(ncclReduce);                  \
+  __macro(ncclGetErrorString);
+
+NCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NCCL_WRAP)
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
index b523ef03c0053622bfda5b4bf07515c1b480b4af..bfe708748a62ff9ac5d151bc652142e1f4925c83 100644
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -29,11 +29,14 @@ limitations under the License. */
 #include <cxxabi.h>  // for __cxa_demangle
 #endif
 
-#ifndef PADDLE_ONLY_CPU
+#include <glog/logging.h>
+
+#ifdef PADDLE_WITH_CUDA
 
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/dynload/nccl.h"
 
 #include <cublas_v2.h>
 #include <cudnn.h>
@@ -41,7 +44,7 @@ limitations under the License. */
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
 
-#endif  // PADDLE_ONLY_CPU
+#endif
 
 namespace paddle {
 namespace platform {
@@ -113,7 +116,7 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   }
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 template <typename... Args>
 inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
@@ -172,6 +175,17 @@ inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
   throw std::runtime_error(err + string::Sprintf(args...));
 }
 
+// Throws std::runtime_error carrying the NCCL error string followed by the
+// formatted message, unless `stat` is ncclSuccess.
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    ncclResult_t stat, const Args&... args) {
+  if (stat != ncclSuccess) {
+    throw std::runtime_error(platform::dynload::ncclGetErrorString(stat) +
+                             string::Sprintf(args...));
+  }
+}
+
 #endif  // PADDLE_ONLY_CPU
 
 template <typename T>
@@ -185,7 +199,7 @@ inline void throw_on_error(T e) {
         std::make_exception_ptr(                                       \
             std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
         __FILE__, __LINE__);                                           \
-  } while (0)
+  } while (false)
 
 #define PADDLE_ENFORCE(...)                                             \
   do {                                                                  \
@@ -195,7 +209,7 @@ inline void throw_on_error(T e) {
       throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
                                               __FILE__, __LINE__);      \
     }                                                                   \
-  } while (0)
+  } while (false)
 
 /*
  * Some enforce helpers here, usage:
diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc
index 80bdee3d9dfbe38ef707a6ba60cdb7f7b99714de..8206a055eabf4abf584962e921610d5029e2f571 100644
--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -213,4 +213,4 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
 TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
   Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
   ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
-}
\ No newline at end of file
+}
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
index be381a4e26cf0eb41f5b3de88bd03ad8901683cc..0cab5ffc5609bbd6fd08c74329d8370fb95f8102 100644
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -26,11 +26,11 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
 namespace paddle {
 namespace platform {
 
-int GetDeviceCount() {
+int GetCUDADeviceCount() {
   int count;
   PADDLE_ENFORCE(
       cudaGetDeviceCount(&count),
-      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
+      "cudaGetDeviceCount failed in paddle::platform::GetCUDADeviceCount");
   return count;
 }
 
@@ -43,6 +43,8 @@ int GetCurrentDeviceId() {
 }
 
 void SetDeviceId(int id) {
+  // TODO(qijun): find a better way to cache the cuda device count
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
   PADDLE_ENFORCE(cudaSetDevice(id),
                  "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
index f0c825bd9b0bc41396b8fdb95f0b4337cbe3db02..37665b97d764fbcfe0964127d230b1d28d90b687 100644
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 #include <cuda_runtime.h>
 #include <stddef.h>
@@ -28,7 +28,7 @@ const std::string kEnvFractionGpuMemoryToUse =
     "PADDLE_FRACTION_GPU_MEMORY_TO_USE";
 
 //! Get the total number of GPU devices in system.
-int GetDeviceCount();
+int GetCUDADeviceCount();
 
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();
@@ -63,4 +63,4 @@ void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
 }  // namespace platform
 }  // namespace paddle
 
-#endif  // PADDLE_ONLY_CPU
+#endif
diff --git a/paddle/platform/hostdevice.h b/paddle/platform/hostdevice.h
index e7de86b7b2f75d206e730ec409bbee5d0a08942e..eb2df291cceef553d6422e6166e1fef2c63e2a47 100644
--- a/paddle/platform/hostdevice.h
+++ b/paddle/platform/hostdevice.h
@@ -2,8 +2,10 @@
 
 #ifdef __CUDACC__
 #define HOSTDEVICE __host__ __device__
+#define DEVICE __device__
 #define HOST __host__
 #else
 #define HOSTDEVICE
+#define DEVICE
 #define HOST
 #endif
diff --git a/paddle/platform/macros.h b/paddle/platform/macros.h
index 4a04a38c0c6f905639004dea2f4416ecc57c8620..feae7bdd77e3a0d02f33fb33991648408f542d0e 100644
--- a/paddle/platform/macros.h
+++ b/paddle/platform/macros.h
@@ -16,8 +16,10 @@ limitations under the License. */
 
 // Disable the copy and assignment operator for a class.
 #ifndef DISABLE_COPY_AND_ASSIGN
-#define DISABLE_COPY_AND_ASSIGN(classname) \
- private:                                  \
-  classname(const classname&) = delete;    \
-  classname& operator=(const classname&) = delete
+#define DISABLE_COPY_AND_ASSIGN(classname)         \
+ private:                                          \
+  classname(const classname&) = delete;            \
+  classname(const classname&&) = delete;           \
+  classname& operator=(const classname&) = delete; \
+  classname& operator=(const classname&&) = delete
 #endif
diff --git a/paddle/platform/nccl_test.cu b/paddle/platform/nccl_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ab8b96f7263aed83407866fedf9e529ce0affe3f
--- /dev/null
+++ b/paddle/platform/nccl_test.cu
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/dynload/nccl.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/gpu_info.h"
+
+#include <thrust/device_vector.h>
+#include <memory>
+#include <vector>
+
+// Number of CUDA devices; assigned in main() before the tests are run.
+static int dev_count = 0;
+
+namespace paddle {
+namespace platform {
+
+// Smoke test: initialise dev_count communicators with a single
+// ncclCommInitAll call, then destroy each of them again.
+TEST(NCCL, init) {
+  std::vector<ncclComm_t> comms(dev_count);
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  for (auto& comm : comms) {
+    dynload::ncclCommDestroy(comm);
+  }
+}
+
+// Per-device fixture for the all_reduce test: a send buffer holding
+// 0, 1, ..., size - 1, a receive buffer of equal length, and a device context.
+template <typename T>
+struct PerThreadData {
+  thrust::device_vector<T> send_buff;
+  thrust::device_vector<T> recv_buff;
+  CUDADeviceContext dev_ctx;
+
+  // Raw device pointers of the two buffers.
+  T* SendBuff() { return thrust::raw_pointer_cast(send_buff.data()); }
+  T* RecvBuff() { return thrust::raw_pointer_cast(recv_buff.data()); }
+
+  PerThreadData(int gpu_id, size_t size) : dev_ctx(GPUPlace(gpu_id)) {
+    send_buff.resize(size);
+    for (size_t k = 0; k != size; ++k) send_buff[k] = static_cast<T>(k);
+    recv_buff.resize(size);
+  }
+};
+
+// Number of elements in each send/receive buffer of the all_reduce test.
+static constexpr int ELEM_COUNT = 10000;
+
+// Runs ncclAllReduce (ncclSum over doubles) across all dev_count devices and
+// checks that every device ends up with j * dev_count at index j.
+TEST(NCCL, all_reduce) {
+  std::vector<ncclComm_t> comms(dev_count);
+  VLOG(1) << "Initializing ncclComm";
+  PADDLE_ENFORCE(dynload::ncclCommInitAll(comms.data(), dev_count, nullptr));
+  VLOG(1) << "ncclComm initialized";
+
+  // Build one fixture per device, with that device selected first.
+  VLOG(1) << "Creating thread data";
+  std::vector<std::unique_ptr<PerThreadData<double>>> per_dev;
+  per_dev.reserve(dev_count);
+  for (int dev = 0; dev < dev_count; ++dev) {
+    VLOG(1) << "Creating thread data for device " << dev;
+    SetDeviceId(dev);
+    per_dev.emplace_back(new PerThreadData<double>(dev, ELEM_COUNT));
+  }
+  VLOG(1) << "Thread data created";
+
+  // Copy every send buffer back to the host and verify it holds 0, 1, 2, ...
+  VLOG(1) << "Check send_buf data";
+  for (int dev = 0; dev < dev_count; ++dev) {
+    VLOG(1) << "Check on device " << dev;
+    SetDeviceId(dev);
+    thrust::host_vector<double> tmp = per_dev[dev]->send_buff;
+    for (size_t j = 0; j < tmp.size(); ++j) {
+      ASSERT_NEAR(static_cast<double>(j), tmp[j], 1e-5);
+    }
+  }
+
+  // Issue one all-reduce per device, using the stream of its device context.
+  VLOG(1) << "Invoking ncclAllReduce";
+  for (int dev = 0; dev < dev_count; ++dev) {
+    VLOG(1) << "Invoking ncclAllReduce with device " << dev;
+    SetDeviceId(dev);
+    PerThreadData<double>& d = *per_dev[dev];
+    PADDLE_ENFORCE(dynload::ncclAllReduce(d.SendBuff(), d.RecvBuff(),
+                                          ELEM_COUNT, ncclDouble, ncclSum,
+                                          comms[dev], d.dev_ctx.stream()));
+    VLOG(1) << "Invoked ncclAllReduce for device " << dev;
+  }
+  VLOG(1) << "Invoked ncclAllReduce";
+
+  // Call Wait() on every device context before reading any result back.
+  VLOG(1) << "Sync devices";
+  for (int dev = 0; dev < dev_count; ++dev) {
+    VLOG(1) << "Sync device " << dev;
+    SetDeviceId(dev);
+    per_dev[dev]->dev_ctx.Wait();
+  }
+  VLOG(1) << "device synced";
+
+  // Each device contributed j at index j, so the sum must be j * dev_count.
+  for (int dev = 0; dev < dev_count; ++dev) {
+    SetDeviceId(dev);
+    VLOG(1) << "Checking vector on device " << dev;
+    thrust::host_vector<double> tmp = per_dev[dev]->recv_buff;
+    for (size_t j = 0; j < tmp.size(); ++j) {
+      const double elem = static_cast<double>(j) * dev_count;
+      ASSERT_NEAR(tmp[j], elem, 1e-4);
+    }
+  }
+
+  for (auto& comm : comms) {
+    dynload::ncclCommDestroy(comm);
+  }
+}
+}  // namespace platform
+}  // namespace paddle
+
+// Runs the tests only when more than one CUDA device is reported; otherwise
+// logs a warning and returns 0 without running any test.
+int main(int argc, char** argv) {
+  dev_count = paddle::platform::GetCUDADeviceCount();
+  if (dev_count > 1) {
+    testing::InitGoogleTest(&argc, argv);
+    return RUN_ALL_TESTS();
+  }
+  LOG(WARNING)
+      << "Cannot test multi-gpu nccl, because the CUDA device count is "
+      << dev_count;
+  return 0;
+}
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index b31515e1f028acac885a506ff1c20479407a05e3..856e54df89c1c18ade040957188a2fbda0901473 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -47,7 +47,7 @@ bool is_cpu_place(const Place &p) {
 }
 
 bool places_are_same_class(const Place &p1, const Place &p2) {
-  return is_gpu_place(p1) == is_gpu_place(p2);
+  return p1.which() == p2.which();
 }
 
 std::ostream &operator<<(std::ostream &os, const Place &p) {
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index 1117476bb37f1b0f3876c55e610803d5ee2558ce..5370360a7de26e409a1545182a12d3df1f37658b 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include <iostream>
+
 #include "paddle/platform/variant.h"
 
 namespace paddle {
@@ -34,6 +35,7 @@ struct GPUPlace {
   GPUPlace() : GPUPlace(0) {}
   explicit GPUPlace(int d) : device(d) {}
 
+  inline int GetDeviceId() const { return device; }
   // needed for variant equality comparison
   inline bool operator==(const GPUPlace &o) const { return device == o.device; }
   inline bool operator!=(const GPUPlace &o) const { return !(*this == o); }
@@ -46,8 +48,18 @@ struct IsGPUPlace : public boost::static_visitor<bool> {
   bool operator()(const GPUPlace &gpu) const { return true; }
 };
 
+// Upper bound on the number of Place types, given as a bit length: the number
+// of place types must be less than or equal to 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT).
+#define NUM_PLACE_TYPE_LIMIT_IN_BIT 4
+
 typedef boost::variant<GPUPlace, CPUPlace> Place;
 
+// Statically check that the number of place types is less than or equal to
+// 2^(NUM_PLACE_TYPE_LIMIT_IN_BIT).
+BOOST_MPL_ASSERT((boost::mpl::less_equal<
+                  Place::types::size,
+                  boost::mpl::long_<1 << NUM_PLACE_TYPE_LIMIT_IN_BIT>>));
+
 void set_place(const Place &);
 const Place &get_place();
 
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
index c2257af1b5dd1a1e284979bf17e1a947072baa85..619897ca19eb2e6f4dbfd9160edf8c4bc58c89a9 100644
--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
@@ -16,7 +16,7 @@
 
 #include <boost/config.hpp>
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 
 // Because boost's variadic templates has bug on nvcc, boost will disable
 // variadic template support when GPU enabled on nvcc.
@@ -29,4 +29,6 @@
 #endif
 #endif
 
+#include <boost/mpl/comparison.hpp>
+#include <boost/mpl/less_equal.hpp>
 #include <boost/variant.hpp>
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 4203f2616456244df616ee2109436ab7caef9741..0e8e5a83a47bee3436450e6bf7db5e26dc037016 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -49,6 +49,11 @@ DEFINE_int32(sock_recv_buf_size,
              1024 * 1024 * 40,
              "restrict sock recv buff size");
 
+/// sock_listen_queue_size sets the maximum number of pending TCP connections.
+DEFINE_int32(sock_listen_queue_size,
+             1024,
+             "listen queue size when pserver listen a TCP port");
+
 namespace paddle {
 
 /**
@@ -129,7 +134,7 @@ SocketServer::SocketServer(const std::string &addr, int port, int rdmaCpu)
   if (rdmaCpu == -1) {
     tcpRdma_ = F_TCP;
     socket_ = 0;
-    maxPendingConnections_ = 100;
+    maxPendingConnections_ = FLAGS_sock_listen_queue_size;
   } else {
     tcpRdma_ = F_RDMA;
     rdmaCpu_ = rdmaCpu;
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 6f6c9e596cfb7a2547d5b6c5de69381eb9c29132..b43461d61bab21747e85090bbf7af21a87a670c6 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -215,7 +215,7 @@ int main(int argc, char** argv) {
 
   uint64_t dataSize = FLAGS_dim * sizeof(real);
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   GpuVector gpuParam(FLAGS_dim);
   GpuVector gpuGrad(FLAGS_dim);
 #else
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 04236fda2fb62b928b5c06ff38acfd3eb7217b08..ad8ffed9c1c8e4bdef27689ab21950db6b5cf0a2 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -99,7 +99,7 @@ TEST(ProtoServer, regular) {
 }
 
 TEST(ProtoServer, extended) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   ProtoClient* client;
   if (FLAGS_rdma_tcp == "rdma")
     client = new ProtoClient(FLAGS_server_addr, FLAGS_port, F_RDMA);
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
index 18ecbd1aa34c82d63ae7f8ec1bd8f81b35eee30b..d7cd738828a10b431370c92026b89d62add1275e 100644
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -1,6 +1,8 @@
 if(WITH_PYTHON)
   cc_library(paddle_pybind SHARED
     SRCS pybind.cc exception.cc protobuf.cc
-    DEPS pybind python backward proto_desc
+    DEPS pybind python backward proto_desc tensor_array paddle_memory executor
     ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
+
+cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array)
diff --git a/paddle/pybind/print_operators_doc.cc b/paddle/pybind/print_operators_doc.cc
new file mode 100644
index 0000000000000000000000000000000000000000..24f2a9383f7a069f1a8c7ed2bf3da46720470efa
--- /dev/null
+++ b/paddle/pybind/print_operators_doc.cc
@@ -0,0 +1,148 @@
+#include <iostream>
+#include <sstream>  // std::stringstream
+#include <string>
+
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/pybind/pybind.h"
+
+// Escapes `s` so it can be placed inside a double-quoted JSON string: double
+// quote, backslash, newline and tab become \", \\, \n and \t; carriage returns
+// are dropped; every other character is copied unchanged.
+std::string Escape(const std::string& s) {
+  std::string r;
+  for (size_t i = 0; i < s.size(); i++) {
+    switch (s[i]) {
+      case '\"':
+        r += "\\\"";
+        break;
+      case '\\':
+        r += "\\\\";
+        break;
+      case '\n':
+        r += "\\n";
+        break;
+      case '\t':
+        r += "\\t";
+        break;
+      case '\r':
+        break;  // carriage returns are dropped
+      default:
+        r += s[i];
+        break;
+    }
+  }
+  return r;
+}
+
+// Maps an attribute type enumerator to the name printed in the "type" field;
+// returns "UNKNOWN" for a value that matches no case.
+std::string AttrType(paddle::framework::AttrType at) {
+  switch (at) {
+    case paddle::framework::INT:
+      return "int";
+    case paddle::framework::FLOAT:
+      return "float";
+    case paddle::framework::STRING:
+      return "string";
+    case paddle::framework::BOOLEAN:
+      return "bool";
+    case paddle::framework::INTS:
+      return "int array";
+    case paddle::framework::FLOATS:
+      return "float array";
+    case paddle::framework::STRINGS:
+      return "string array";
+    case paddle::framework::BOOLEANS:
+      return "bool array";
+    case paddle::framework::BLOCK:
+      return "block id";
+  }
+  return "UNKNOWN";  // not possible
+}
+
+// Appends `v` to `ss` as a JSON object followed by a trailing comma. The
+// duplicable/intermediate values are streamed as-is, without std::boolalpha.
+void PrintVar(const paddle::framework::OpProto::Var& v, std::stringstream& ss) {
+  ss << " { "
+     << "\n"
+     << "   \"name\" : \"" << Escape(v.name()) << "\",\n"
+     << "   \"comment\" : \"" << Escape(v.comment()) << "\",\n"
+     << "   \"duplicable\" : " << v.duplicable() << ",\n"
+     << "   \"intermediate\" : " << v.intermediate() << "\n"
+     << " },";
+}
+
+// Appends `a` to `ss` as a JSON object followed by a trailing comma.
+void PrintAttr(const paddle::framework::OpProto::Attr& a,
+               std::stringstream& ss) {
+  ss << " { "
+     << "\n"
+     << "   \"name\" : \"" << Escape(a.name()) << "\",\n"
+     << "   \"type\" : \"" << AttrType(a.type()) << "\",\n"
+     << "   \"comment\" : \"" << Escape(a.comment()) << "\",\n"
+     << "   \"generated\" : " << a.generated() << "\n"
+     << " },";
+}
+
+// Logs "Processing <type>" to stderr and, if `opinfo` carries an OpProto,
+// appends it to `ss` as a JSON object (type, comment, inputs, outputs, attrs)
+// followed by a trailing comma. Each seekp(-1) steps back one character so
+// the next write overwrites the last element's trailing comma; for an empty
+// list it overwrites the "\n" written after "[ " instead.
+void PrintOpProto(const std::string& type,
+                  const paddle::framework::OpInfo& opinfo,
+                  std::stringstream& ss) {
+  std::cerr << "Processing " << type << "\n";
+
+  const paddle::framework::OpProto* p = opinfo.proto_;
+  if (p == nullptr) {
+    return;  // It is possible that an operator doesn't have OpProto.
+  }
+
+  ss << "{\n"
+     << " \"type\" : \"" << Escape(p->type()) << "\",\n"
+     << " \"comment\" : \"" << Escape(p->comment()) << "\",\n";
+
+  ss << " \"inputs\" : [ "
+     << "\n";
+  for (int i = 0; i < p->inputs_size(); i++) {
+    PrintVar(p->inputs(i), ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << " ], "
+     << "\n";
+
+  ss << " \"outputs\" : [ "
+     << "\n";
+  for (int i = 0; i < p->outputs_size(); i++) {
+    PrintVar(p->outputs(i), ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << " ], "
+     << "\n";
+
+  ss << " \"attrs\" : [ "
+     << "\n";
+  for (int i = 0; i < p->attrs_size(); i++) {
+    PrintAttr(p->attrs(i), ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << " ] "
+     << "\n";
+
+  ss << "},";
+}
+
+// Prints the proto of every entry in OpInfoMap to stdout as one JSON array.
+// Entries whose OpInfo has no proto contribute nothing to the array.
+int main() {
+  std::stringstream ss;
+  ss << "[\n";
+  for (auto& iter : paddle::framework::OpInfoMap::Instance().map()) {
+    PrintOpProto(iter.first, iter.second, ss);
+  }
+  ss.seekp(-1, ss.cur);  // remove the trailing comma
+  ss << "]\n";
+  std::cout << ss.str();
+}
diff --git a/paddle/pybind/protobuf.cc b/paddle/pybind/protobuf.cc
index 218821b35bb6947181fedc56e002ad0285f6307d..6bf6eb9fd404a7fa16f2b169dd18f34f0a4e324c 100644
--- a/paddle/pybind/protobuf.cc
+++ b/paddle/pybind/protobuf.cc
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/pybind/protobuf.h"
 #include <deque>
 #include <iostream>
+#include "paddle/framework/backward.h"
 #include "paddle/framework/block_desc.h"
 #include "paddle/framework/op_desc.h"
 #include "paddle/framework/program_desc.h"
@@ -99,26 +100,43 @@ using namespace paddle::framework;  // NOLINT
 // Bind Methods
 void BindProgramDesc(py::module &m) {
   py::class_<ProgramDescBind>(m, "ProgramDesc", "")
-      .def_static("instance",
-                  []() -> ProgramDescBind * {
-                    return &ProgramDescBind::Instance(&GetProgramDesc());
-                  },
-                  py::return_value_policy::reference)
-      .def_static("__create_program_desc__",
-                  []() -> ProgramDescBind * {
-                    // Only used for unit-test
-                    auto *prog_desc = new ProgramDesc;
-                    auto *block = prog_desc->mutable_blocks()->Add();
-                    block->set_idx(0);
-                    block->set_parent_idx(-1);
-                    return &ProgramDescBind::Instance(prog_desc);
-                  },
-                  py::return_value_policy::reference)
+      .def(py::init<>())
+      .def("__init__",
+           [](ProgramDescBind &self, const ProgramDescBind &other) {
+             new (&self) ProgramDescBind(other);
+           })
       .def("append_block", &ProgramDescBind::AppendBlock,
            py::return_value_policy::reference)
+      .def("append_backward",
+           [](ProgramDescBind &program_desc, const VarDescBind &target,
+              const std::unordered_set<std::string> &no_grad_vars) {
+             ParamGradInfoMap param_grad_map =
+                 AppendBackward(program_desc, target, no_grad_vars);
+             std::unordered_map<
+                 std::string, std::tuple<std::string /* grad_var_name */,
+                                         int /* block_idx */, int /* op_idx */>>
+                 retv;
+             for (auto it = param_grad_map.begin(); it != param_grad_map.end();
+                  ++it) {
+               const auto &grad_info = it->second;
+               retv[it->first] = std::make_tuple(
+                   grad_info.name_, grad_info.block_idx_, grad_info.op_idx_);
+             }
+             return retv;
+           })
       .def("block", &ProgramDescBind::Block, py::return_value_policy::reference)
-      .def("__str__", &ProgramDescBind::DebugString)
-      .def("num_blocks", &ProgramDescBind::Size);
+      .def("num_blocks", &ProgramDescBind::Size)
+      .def("serialize_to_string",
+           [](ProgramDescBind &program_desc) -> py::bytes {
+             const ProgramDesc *desc = program_desc.Proto();
+             PADDLE_ENFORCE(desc->IsInitialized(),
+                            "ProgramDesc has not been initialized.");
+             std::string res;
+             PADDLE_ENFORCE(
+                 desc->SerializeToString(&res),
+                 "Serialize ProgramDesc Error. This could be a bug of Paddle.");
+             return res;
+           });
 }
 
 void BindBlockDesc(py::module &m) {
@@ -129,22 +147,37 @@ void BindBlockDesc(py::module &m) {
            py::return_value_policy::reference)
       .def("prepend_op", &BlockDescBind::PrependOp,
            py::return_value_policy::reference)
-      .def("new_var",
+      .def("var",
            [](BlockDescBind &self, py::bytes byte_name) {
              std::string name = byte_name;
-             return self.NewVar(name);
+             return self.Var(name);
            },
            py::return_value_policy::reference)
-      .def("var",
+      .def("has_var",
            [](BlockDescBind &self, py::bytes byte_name) {
              std::string name = byte_name;
-             return self.Var(name);
+             return self.HasVar(name);
+           })
+      .def("find_var",
+           [](BlockDescBind &self, py::bytes byte_name) {
+             std::string name = byte_name;
+             return self.FindVar(name);
            },
            py::return_value_policy::reference)
       .def("all_vars", &BlockDescBind::AllVars,
            py::return_value_policy::reference)
-      .def("all_ops", &BlockDescBind::AllOps,
-           py::return_value_policy::reference);
+      .def("op_size", &BlockDescBind::OpSize)
+      .def("op", &BlockDescBind::Op, py::return_value_policy::reference)
+      .def("serialize_to_string", [](BlockDescBind &block_desc) -> py::bytes {
+        const BlockDesc *desc = block_desc.Proto();
+        PADDLE_ENFORCE(desc->IsInitialized(),
+                       "BlockDesc has not been initialized.");
+        std::string res;
+        PADDLE_ENFORCE(
+            desc->SerializeToString(&res),
+            "Serialize BlockDesc Error. This could be a bug of Paddle.");
+        return res;
+      });
 }
 
 void BindVarDsec(py::module &m) {
@@ -157,7 +190,8 @@ void BindVarDsec(py::module &m) {
       .value("FP32", DataType::FP32)
       .value("FP64", DataType::FP64);
 
-  py::class_<VarDescBind>(m, "VarDesc", "")
+  py::class_<VarDescBind> var_desc(m, "VarDesc", "");
+  var_desc
       .def("name",
            [](const VarDescBind &self) {
              py::bytes name = self.Name();
@@ -167,7 +201,31 @@ void BindVarDsec(py::module &m) {
       .def("set_shape", &VarDescBind::SetShape)
       .def("set_data_type", &VarDescBind::SetDataType)
       .def("shape", &VarDescBind::Shape, py::return_value_policy::reference)
-      .def("data_type", &VarDescBind::GetDataType);
+      .def("data_type", &VarDescBind::GetDataType)
+      .def("lod_level", &VarDescBind::GetLodLevel)
+      .def("set_lod_level", &VarDescBind::SetLoDLevel)
+      .def("type", &VarDescBind::GetType)
+      .def("set_type", &VarDescBind::SetType)
+      .def("serialize_to_string",
+           [](VarDescBind &var_desc) -> py::bytes {
+             const VarDesc *desc = var_desc.Proto();
+             PADDLE_ENFORCE(desc->IsInitialized(),
+                            "VarDesc has not been initialized.");
+             std::string res;
+             PADDLE_ENFORCE(
+                 desc->SerializeToString(&res),
+                 "Serialize VarDesc Error. This could be a bug of Paddle.");
+             return res;
+           })
+      .def("persistable", &VarDescBind::Persistable)
+      .def("set_persistable", &VarDescBind::SetPersistable);
+
+  py::enum_<VarDesc::VarType>(var_desc, "VarType", "")
+      .value("LOD_TENSOR", VarDesc::LOD_TENSOR)
+      .value("SELECTED_ROWS", VarDesc::SELECTED_ROWS)
+      .value("FEED_MINIBATCH", VarDesc::FEED_MINIBATCH)
+      .value("FETCH_LIST", VarDesc::FETCH_LIST)
+      .value("STEP_SCOPES", VarDesc::STEP_SCOPES);
 }
 
 void BindOpDesc(py::module &m) {
@@ -191,15 +249,26 @@ void BindOpDesc(py::module &m) {
       .def("output", &OpDescBind::Output)
       .def("output_names", &OpDescBind::OutputNames)
       .def("set_output", &OpDescBind::SetOutput)
-      .def("__str__", &OpDescBind::DebugString)
-      .def("__repr__", &OpDescBind::DebugString)
       .def("has_attr", &OpDescBind::HasAttr)
       .def("attr_type", &OpDescBind::GetAttrType)
       .def("attr_names", &OpDescBind::AttrNames)
       .def("set_attr", &OpDescBind::SetAttr)
       .def("attr", &OpDescBind::GetAttr)
       .def("set_block_attr", &OpDescBind::SetBlockAttr)
-      .def("get_block_attr", &OpDescBind::GetBlockAttr);
+      .def("block_attr", &OpDescBind::GetBlockAttr)
+      .def("check_attrs", &OpDescBind::CheckAttrs)
+      .def("infer_shape", &OpDescBind::InferShape)
+      .def("infer_var_type", &OpDescBind::InferVarType)
+      .def("serialize_to_string", [](OpDescBind &op_desc) -> py::bytes {
+        const OpDesc *desc = op_desc.Proto();
+        PADDLE_ENFORCE(desc->IsInitialized(),
+                       "OpDesc has not been initialized.");
+        std::string res;
+        PADDLE_ENFORCE(
+            desc->SerializeToString(&res),
+            "Serialize OpDesc Error. This could be a bug of Paddle.");
+        return res;
+      });
 }
 
 }  // namespace pybind
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index d85bf6c7faa5f65c7b39682f7639fe269bdfa345..b6e44fdbad6e2817e3077901f58177adc4bb0c71 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,8 +15,14 @@ limitations under the License. */
 #include "paddle/pybind/protobuf.h"
 
 #include "paddle/framework/backward.h"
+#include "paddle/framework/executor.h"
+#include "paddle/framework/feed_fetch_method.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/tensor_array.h"
 #include "paddle/operators/cond_op.h"
+#include "paddle/operators/dynamic_recurrent_op.h"
 #include "paddle/operators/net_op.h"
 #include "paddle/operators/recurrent_op.h"
 #include "paddle/platform/enforce.h"
@@ -34,7 +40,7 @@ static size_t UniqueIntegerGenerator() {
 }
 
 bool IsCompileGPU() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
@@ -77,20 +83,20 @@ PYBIND11_PLUGIN(core) {
            })
       .def("set", PyCPUTensorSetFromArray<float>)
       .def("set", PyCPUTensorSetFromArray<int>)
-#ifndef PADDLE_ONLY_CPU
+      .def("set", PyCPUTensorSetFromArray<double>)
+      .def("set", PyCPUTensorSetFromArray<int64_t>)
+#ifdef PADDLE_WITH_CUDA
       .def("set", PyCUDATensorSetFromArray<float>)
       .def("set", PyCUDATensorSetFromArray<int>)
+      .def("set", PyCUDATensorSetFromArray<double>)
+      .def("set", PyCUDATensorSetFromArray<int64_t>)
 #endif
       .def("shape", [](Tensor &self) { return vectorize(self.dims()); })
-      .def("set_float_element",
-           [](Tensor &self, size_t offset, float f) {
-             // TODO(yuyang18): Only support GPU now.
-             self.data<float>()[offset] = f;
-           })
-      .def("get_float_element", [](Tensor &self, size_t offset) -> float {
-        // TODO(yuyang18): Only support GPU now.
-        return self.data<float>()[offset];
-      });
+      .def("set_float_element", TensorSetElement<float>)
+      .def("get_float_element", TensorGetElement<float>)
+      .def("set_double_element", TensorSetElement<double>)
+      .def("get_double_element", TensorGetElement<double>)
+      .def("dtype", [](Tensor &self) { return ToDataType(self.type()); });
 
   py::class_<LoDTensor, Tensor>(m, "LoDTensor")
       .def_buffer(
@@ -98,7 +104,7 @@ PYBIND11_PLUGIN(core) {
       .def(
           "__init__",
           [](LoDTensor &instance, const std::vector<std::vector<size_t>> &lod) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
             new (&instance) LoDTensor(lod);
 #else
              LoD new_lod;
@@ -107,9 +113,10 @@ PYBIND11_PLUGIN(core) {
              new (&instance) LoDTensor(new_lod);
 #endif
           })
+      .def("__init__", [](LoDTensor &instance) { new (&instance) LoDTensor(); })
       .def("set_lod",
            [](LoDTensor &self, const std::vector<std::vector<size_t>> &lod) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
              self.set_lod(lod);
 #else
              LoD new_lod;
@@ -119,7 +126,7 @@ PYBIND11_PLUGIN(core) {
 #endif
            })
       .def("lod", [](LoDTensor &self) -> std::vector<std::vector<size_t>> {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
         return self.lod();
 #else
            auto lod = self.lod();
@@ -137,6 +144,40 @@ PYBIND11_PLUGIN(core) {
 #endif
       });
 
+  py::class_<SelectedRows>(m, "SelectedRows")
+      .def("__init__",
+           [](SelectedRows &instance) { new (&instance) SelectedRows(); })
+      .def("__init__",
+           [](SelectedRows &instance, const std::vector<int64_t> rows,
+              const int64_t &height) {
+             new (&instance) SelectedRows(rows, height);
+           })
+      .def("get_tensor",
+           [](SelectedRows &self) { return self.mutable_value(); },
+           py::return_value_policy::reference)
+      .def("set_height", &SelectedRows::set_height)
+      .def("height", &SelectedRows::height)
+      .def("set_rows",
+           [](SelectedRows &self, std::vector<int64_t> rows) {
+#ifndef PADDLE_WITH_CUDA
+             self.set_rows(rows);
+#else
+        Vector<int64_t> new_rows(rows);
+        self.set_rows(new_rows);
+#endif
+           })
+      .def("rows", [](SelectedRows &self) {
+#ifndef PADDLE_WITH_CUDA
+        return self.rows();
+#else
+         auto rows = self.rows();
+         std::vector<int64_t> new_rows;
+         new_rows.reserve(rows.size());
+         std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows));
+         return new_rows;
+#endif
+      });
+
   py::class_<Variable>(m, "Variable", R"DOC(Variable Class.
 
 All parameter, weight, gradient are variables in Paddle.
@@ -145,11 +186,23 @@ All parameter, weight, gradient are variables in Paddle.
       .def("set_int",
            [](Variable &var, int val) -> void { *var.GetMutable<int>() = val; })
       .def("get_int", [](const Variable &var) -> int { return var.Get<int>(); })
+      .def("is_float", [](const Variable &var) { return var.IsType<float>(); })
+      .def("set_float",
+           [](Variable &var, float val) -> void {
+             *var.GetMutable<float>() = val;
+           })
+      .def("get_float",
+           [](const Variable &var) -> float { return var.Get<float>(); })
       .def("get_tensor",
            [](Variable &self) -> LoDTensor * {
              return self.GetMutable<LoDTensor>();
            },
            py::return_value_policy::reference)
+      .def("get_selected_rows",
+           [](Variable &self) -> SelectedRows * {
+             return self.GetMutable<SelectedRows>();
+           },
+           py::return_value_policy::reference)
       .def("get_net",
            [](Variable &self) -> operators::NetOp * {
              return self.GetMutable<operators::NetOp>();
@@ -157,9 +210,9 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference);
 
   py::class_<Scope>(m, "Scope", "")
-      .def("new_var",
+      .def("var",
            [](Scope &self, const std::string &name) -> Variable * {
-             return self.NewVar(name);
+             return self.Var(name);
            },
            py::return_value_policy::reference)
       .def("find_var", &Scope::FindVar, py::return_value_policy::reference)
@@ -172,15 +225,16 @@ All parameter, weight, gradient are variables in Paddle.
   //! Python str. If you want a str object, you should cast them in Python.
   m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
     std::vector<py::bytes> ret_values;
-
-    OpInfoMap::Instance().IterAllInfo([&ret_values](const std::string &type,
-                                                    const OpInfo &info) {
-      if (!info.HasOpProtoAndChecker()) return;
-      std::string str;
-      PADDLE_ENFORCE(info.Proto().SerializeToString(&str),
-                     "Serialize OpProto Error. This could be a bug of Paddle.");
-      ret_values.emplace_back(str);
-    });
+    for (auto &iter : OpInfoMap::Instance().map()) {
+      auto &info = iter.second;
+      if (info.HasOpProtoAndChecker()) {
+        std::string str;
+        PADDLE_ENFORCE(
+            info.Proto().SerializeToString(&str),
+            "Serialize OpProto Error. This could be a bug of Paddle.");
+        ret_values.emplace_back(str);
+      }
+    }
     return ret_values;
   });
   m.def_submodule(
@@ -198,7 +252,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_static("create",
                   [](paddle::platform::GPUPlace& place)
                       -> paddle::platform::DeviceContext* {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
                     PADDLE_THROW("GPUPlace is not supported in CPU device.");
 #else
                     return new paddle::platform::CUDADeviceContext(place);
@@ -214,6 +268,17 @@ All parameter, weight, gradient are variables in Paddle.
       .def(py::init<>())
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
+  py::class_<platform::Place>(m, "Place")
+      .def(py::init<>())
+      .def("set_place",
+           [](platform::Place &self, const platform::CPUPlace &cpu_place) {
+             self = cpu_place;
+           })
+      .def("set_place",
+           [](platform::Place &self, const platform::GPUPlace &gpu_place) {
+             self = gpu_place;
+           });
+
   py::class_<OperatorBase>(m, "Operator")
       .def_static("create",
                   [](py::bytes protobin) {
@@ -223,7 +288,7 @@ All parameter, weight, gradient are variables in Paddle.
                     PADDLE_ENFORCE(desc.IsInitialized(),
                                    "User OpDesc is not initialized, reason %s",
                                    desc.InitializationErrorString());
-                    return OpRegistry::CreateOp(desc);
+                    return OpRegistry::CreateOp(desc, nullptr);
                   })
       .def("backward",
            [](const OperatorBase &forwardOp,
@@ -266,6 +331,56 @@ All parameter, weight, gradient are variables in Paddle.
         self->CompleteAddOp();
       });
 
+  py::class_<framework::TensorArray>(m, "TensorArray")
+      .def("__init__",
+           [](TensorArray &instance) { new (&instance) TensorArray(); })
+      .def("read",
+           [](TensorArray &self, size_t index) { return self.Read(index); })
+      .def("write", [](TensorArray &self, size_t index,
+                       LoDTensor &value) { self.Write(index, value); })
+      .def("write_shared",
+           [](TensorArray &self, size_t index, const LoDTensor &value) {
+             self.WriteShared(index, value);
+           })
+      .def("size", [](TensorArray &self) { return self.size(); })
+      .def("pack",
+           [](TensorArray &self, size_t level,
+              const std::vector<std::vector<size_t>> &meta_info,
+              const std::vector<std::vector<size_t>> &lod) {
+             std::vector<DySeqMeta> meta;
+             for (auto &info : meta_info) {
+               PADDLE_ENFORCE_EQ(info.size(), 3UL);
+               meta.emplace_back(info[0], info[1], info[2]);
+             }
+#ifndef PADDLE_WITH_CUDA
+             return self.Pack(level, meta, lod);
+#else
+             LoD new_lod;
+             new_lod.reserve(lod.size());
+             std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod));
+             return self.Pack(level, meta, new_lod);
+#endif
+           })
+      .def("unpack",
+           [](TensorArray &self, const LoDTensor &source, int level,
+              bool length_descend) {
+             auto metas = self.Unpack(source, level, length_descend);
+             std::vector<std::vector<size_t>> meta_info;
+             for (auto meta : metas) {
+               meta_info.emplace_back(
+                   std::vector<size_t>({meta.begin, meta.end, meta.ori_idx}));
+             }
+             return meta_info;
+           })
+      .def("stack", [](TensorArray &self) { return self.Stack(); })
+      .def("unstack",
+           [](TensorArray &self, const LoDTensor &source) {
+             return self.Unstack(source);
+           })
+      .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) {
+        return self.UnstackShared(source);
+      });
+
   // recurrent_op
   py::class_<operators::RecurrentOp, OperatorBase>(m, "RecurrentOp")
       .def_static(
@@ -277,7 +392,7 @@ All parameter, weight, gradient are variables in Paddle.
             PADDLE_ENFORCE(desc.IsInitialized(),
                            "User OpDesc is not initialized, reason %s",
                            desc.InitializationErrorString());
-            auto rnn_op = OpRegistry::CreateOp(desc);
+            auto rnn_op = OpRegistry::CreateOp(desc, nullptr);
             return static_cast<operators::RecurrentOp *>(rnn_op.release());
           })
       .def("set_stepnet", [](operators::RecurrentOp &self,
@@ -285,6 +400,33 @@ All parameter, weight, gradient are variables in Paddle.
         self.set_stepnet(net.Clone());
       });
 
+  py::class_<operators::DynamicRecurrentOp, OperatorBase>(m,
+                                                          "DynamicRecurrentOp")
+      .def_static("create",
+                  [](py::bytes protobin) -> operators::DynamicRecurrentOp * {
+                    OpDesc desc;
+                    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                                   "Cannot parse user input to OpDesc");
+                    PADDLE_ENFORCE(desc.IsInitialized(),
+                                   "User OpDesc is not initialized, reason %s",
+                                   desc.InitializationErrorString());
+                    auto rnn_op = OpRegistry::CreateOp(desc, nullptr);
+                    return static_cast<operators::DynamicRecurrentOp *>(
+                        rnn_op.release());
+                  })
+      .def("set_step_unit",
+           [](operators::DynamicRecurrentOp &self, const operators::NetOp &net)
+               -> void { self.rnn.SetStepUnit(net.Clone()); })
+      .def("get_state",
+           [](operators::DynamicRecurrentOp &self, const std::string &name)
+               -> const TensorArray & { return self.rnn.state(name); })
+      .def("get_step_input",
+           [](operators::DynamicRecurrentOp &self, const std::string &name)
+               -> const TensorArray & { return self.rnn.step_input(name); })
+      .def("get_step_output",
+           [](operators::DynamicRecurrentOp &self, const std::string &name)
+               -> const TensorArray & { return self.rnn.step_output(name); });
+
   // cond_op
   py::class_<operators::CondOp, OperatorBase>(m, "CondOp")
       .def_static("create",
@@ -295,7 +437,7 @@ All parameter, weight, gradient are variables in Paddle.
                     PADDLE_ENFORCE(desc.IsInitialized(),
                                    "User OpDesc is not initialized, reason %s",
                                    desc.InitializationErrorString());
-                    auto cond_op = OpRegistry::CreateOp(desc);
+                    auto cond_op = OpRegistry::CreateOp(desc, nullptr);
                     return static_cast<operators::CondOp *>(cond_op.release());
                   })
       .def("set_truenet",
@@ -307,15 +449,26 @@ All parameter, weight, gradient are variables in Paddle.
              self.set_falsenet(net.Clone());
            });
 
+  py::class_<framework::Executor>(m, "Executor")
+      .def(py::init<std::vector<platform::Place> &>())
+      .def("run", [](Executor &self, ProgramDescBind *program_bind,
+                     Scope *scope, int block_id) {
+        self.Run(*program_bind->Proto(), scope, block_id);
+      });
+
   m.def("unique_integer", UniqueIntegerGenerator);
 
   m.def("is_compile_gpu", IsCompileGPU);
+  m.def("set_feed_variable", framework::SetFeedVariable);
+  m.def("get_fetch_variable", framework::GetFetchVariable);
 
   BindProgramDesc(m);
   BindBlockDesc(m);
   BindVarDsec(m);
   BindOpDesc(m);
 
+  m.def("op_support_gpu", OpSupportGPU);
+
   return m.ptr();
 }
 }  // namespace pybind
diff --git a/paddle/pybind/tensor_py.h b/paddle/pybind/tensor_py.h
index f0d5a6f9ff963ecd80d0c261daff56bff50663d4..85f9f22733c97ef209e6c25dbcfbac492ac5c746 100644
--- a/paddle/pybind/tensor_py.h
+++ b/paddle/pybind/tensor_py.h
@@ -42,7 +42,7 @@ template <size_t I, typename... ARGS>
 struct CastToPyBufferImpl<true, I, ARGS...> {
   using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
   py::buffer_info operator()(framework::Tensor &tensor) {
-    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.type()) {
       auto dim_vec = framework::vectorize(tensor.dims());
       std::vector<size_t> dims_outside;
       std::vector<size_t> strides;
@@ -56,13 +56,24 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
         prod *= dims_outside[i - 1];
       }
       framework::Tensor dst_tensor;
-      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
-        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
-      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+      if (paddle::platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+        auto *src_ptr = static_cast<const void *>(tensor.data<CUR_TYPE>());
+        auto *dst_ptr = static_cast<void *>(dst_tensor.mutable_data<CUR_TYPE>(
+            tensor.dims(), platform::CPUPlace()));
+        // TODO(qijun): Here we use the default CUDA stream to copy a GPU Tensor
+        // to a numpy array. It's better to manage CUDA streams uniformly.
+        paddle::platform::GpuMemcpySync(dst_ptr, src_ptr,
+                                        sizeof(CUR_TYPE) * tensor.numel(),
+                                        cudaMemcpyDeviceToHost);
+#else
+        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#endif
+      } else if (paddle::platform::is_cpu_place(tensor.place())) {
         dst_tensor = tensor;
       }
       return py::buffer_info(
-          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.place()),
           sizeof(CUR_TYPE), py::format_descriptor<CUR_TYPE>::format(),
           (size_t)framework::arity(dst_tensor.dims()), dims_outside, strides);
     } else {
@@ -73,10 +84,23 @@ struct CastToPyBufferImpl<true, I, ARGS...> {
 };
 }  // namespace details
 inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
-  auto buffer_info = details::CastToPyBufferImpl<true, 0, float, int>()(tensor);
+  auto buffer_info =
+      details::CastToPyBufferImpl<true, 0, float, int, double>()(tensor);
   return buffer_info;
 }
 
+template <typename T>
+T TensorGetElement(framework::Tensor &self, size_t offset) {
+  PADDLE_ENFORCE(platform::is_cpu_place(self.place()));  // CPU tensors only
+  return *(self.data<T>() + offset);  // `offset` is not range-checked
+}
+
+template <typename T>
+void TensorSetElement(framework::Tensor &self, size_t offset, T elem) {
+  PADDLE_ENFORCE(platform::is_cpu_place(self.place()));  // CPU tensors only
+  *(self.data<T>() + offset) = elem;  // `offset` is not range-checked
+}
+
 template <typename T>
 void PyCPUTensorSetFromArray(
     framework::Tensor &self,
@@ -93,7 +117,7 @@ void PyCPUTensorSetFromArray(
   std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 template <typename T>
 void PyCUDATensorSetFromArray(
     framework::Tensor &self,
@@ -107,6 +131,8 @@ void PyCUDATensorSetFromArray(
 
   self.Resize(framework::make_ddim(dims));
   auto *dst = self.mutable_data<T>(place);
+  // TODO(qijun): Here we use the default CUDA stream to copy a Python numpy
+  // array to a GPU Tensor. It's better to manage CUDA streams uniformly.
   paddle::platform::GpuMemcpySync(dst, array.data(), sizeof(T) * array.size(),
                                   cudaMemcpyHostToDevice);
 }
diff --git a/paddle/scripts/cluster_train_v2/fabric/conf.py b/paddle/scripts/cluster_train_v2/fabric/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..e96503d093a4317df7bb006043eb42098f51b6f5
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/conf.py
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+HOSTS = [  # user@host entries, one per cluster node
+    "root@10.1.9.7",
+    "root@10.1.18.7",
+    "root@10.1.32.9",
+]
+'''
+workspace configuration
+'''
+# root directory for the workspace; can be any directory under a real user account
+ROOT_DIR = "/root"
+'''
+network configuration
+'''
+# network interface (NIC) used by the parameter servers (pserver)
+PADDLE_NIC = "eth0"
+# pserver port number
+PADDLE_PORT = 7164
+# number of pserver ports
+PADDLE_PORTS_NUM = 1
+# number of pserver sparse ports
+PADDLE_PORTS_NUM_FOR_SPARSE = 1
+# whether the trainers use the GPU (note: stored as a string, not a bool)
+PADDLE_USE_GPU = "False"
+# environment setting applied to all processes in the cluster job
+LD_LIBRARY_PATH = "/usr/local/cuda/lib64:/usr/lib64"
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6606c01265af1fa8009e67906a3dbbe5c95ebc0d
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/Dockerfile
@@ -0,0 +1,11 @@
+FROM docker.paddlepaddlehub.com/paddle:0.10.0rc2
+RUN apt-get update && apt-get install -y openssh-server
+RUN mkdir /var/run/sshd
+
+RUN echo 'root:root' |chpasswd
+
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0784b2d1b8785796f94fff1607643218564fc126
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/docker_cluster/ssh_servers.yaml
@@ -0,0 +1,23 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: ssh-servers
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        app: ssh-servers
+    spec:
+      containers:
+      - name: ssh-servers
+        image: docker.paddlepaddlehub.com/paddlessh
+        resources:
+          limits:
+            cpu: 500m
+            memory: 1Gi
+          requests:
+            cpu: 500m
+            memory: 1Gi
+        ports:
+        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/fabric/run.sh b/paddle/scripts/cluster_train_v2/fabric/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..f6324bcb136803ebc30e69bcdaa2f8725cb0ccba
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/fabric/run.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+
+python paddle.py \
+  --job_dispatch_package="/root/wuyi/fabric_submit/workspace" \
+  --dot_period=10 \
+  --ports_num_for_sparse=1 \
+  --log_period=50 \
+  --num_passes=5 \
+  --trainer_count=2 \
+  --saving_period=1 \
+  --local=0 \
+  --config=./trainer_config.py \
+  --save_dir=./output \
+  --use_gpu=0
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..1a2d19e823541750830fcaa25f65b2f8e1ea2b49
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/Dockerfile
@@ -0,0 +1,43 @@
+# Build this image:  docker build -t mpi .
+#
+
+FROM paddledev/paddle:0.10.0rc3
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update -y && \
+    apt-get upgrade -y && \
+    apt-get install -y openssh-server zip unzip vim sudo \
+gcc gfortran openmpi-checkpoint binutils wget curl git openmpi-bin openmpi-common libopenmpi-dev && \
+pip install mpi4py numpy virtualenv scipy matplotlib lxml sqlalchemy suds ipython obspy && \
+mkdir /var/run/sshd && \
+echo 'root:tutorial' | chpasswd && \
+sed -i 's/PermitRootLogin without-password/PermitRootLogin yes/' /etc/ssh/sshd_config && \
+# SSH login fix. Otherwise user is kicked off after login
+sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd && \
+echo "export VISIBLE=now" >> /etc/profile && \
+adduser --disabled-password --gecos "" tutorial && \
+echo "tutorial ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers && \
+mkdir /home/tutorial/.ssh/
+
+ENV HOME /home/tutorial
+ENV NOTVISIBLE "in users profile"
+
+# ------------------------------------------------------------
+# Set-Up SSH with our Github deploy key
+# ------------------------------------------------------------
+
+ADD ssh/config /home/tutorial/.ssh/config
+ADD ssh/id_rsa.mpi /home/tutorial/.ssh/id_rsa
+ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/id_rsa.pub
+ADD ssh/id_rsa.mpi.pub /home/tutorial/.ssh/authorized_keys
+
+#---------------------------------------------------------------
+# LD_LIBRARY_PATH: set with ENV; a `RUN export` is lost when its build step ends
+#---------------------------------------------------------------
+
+ENV LD_LIBRARY_PATH /usr/lib/openmpi/lib/
+
+WORKDIR /home/tutorial
+EXPOSE 22
+CMD ["/usr/sbin/sshd", "-D"]
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..34835e5eb8d7cb92ad3cf7758a47c9e565a7dcf6
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/head.yaml
@@ -0,0 +1,25 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: mpi-header
+  labels:
+    app: mpi-header
+spec:
+  replicas: 1
+  template:
+    metadata:
+      labels:
+        app: mpi-header
+    spec:
+      containers:
+      - image: typhoon1986/paddle-openmpi
+        name : mpi-header
+        resources:
+          limits:
+            cpu: 500m
+            memory: 2Gi
+          requests:
+            cpu: 500m
+            memory: 2Gi
+        ports:
+        - containerPort: 22
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..2fd5cb4d44a25efac68dd8c9195dea9fd8f84a26
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/mpi-nodes.yaml
@@ -0,0 +1,26 @@
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: mpi-nodes
+  labels:
+    app: mpi-nodes
+spec:
+  replicas: 3
+  template:
+    metadata:
+      labels:
+        app: mpi-nodes
+    spec:
+      containers:
+      - image: typhoon1986/paddle-openmpi
+        name : mpi-nodes
+        resources:
+          limits:
+            cpu: 500m
+            memory: 2Gi
+          requests:
+            cpu: 500m
+            memory: 2Gi
+        ports:
+        - containerPort: 22
+        imagePullPolicy: Always
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
new file mode 100644
index 0000000000000000000000000000000000000000..a9ecad07c39e4a9d6f0572d6cbf77795d99681f2
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/config
@@ -0,0 +1 @@
+StrictHostKeyChecking no
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
new file mode 100644
index 0000000000000000000000000000000000000000..23768343edf5258cf525523d471f67071a24f5de
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi
@@ -0,0 +1,27 @@
+-----BEGIN RSA PRIVATE KEY-----
+MIIEogIBAAKCAQEA7PWLZmgdJ508dD15T6+xqGDvL9Ehzo9SgsnN6xJ+qpUvvOi4
+1axW0AqR4MnPTg/uuvk+x4tUpuufOW4w22UTGjsdvmIVWa9ujLtcRiN3YPY+SU+Y
+O5FfqKg7r/hBn+/GMcSoffwSs7vVgmhBBnp/mJh2O1cOAFZEe98/47mbg3/kHBAk
+36NOQktaU3l48B38EhBTnjWfcEGm1HcTRPFxXV5Wiko6ZhKFEuHcTVKng4ROtUqE
+mgHyI0aB7TAxg4na0ejItsYWEPWGeDOw6ms/4MwylxNosWzHFPW9p4zgLCLNr+b6
+bDDfYKjXZflAuTQtQhLmJUwD9uuYLAijpSE2fQIDAQABAoIBADgcgRET8Gt0CV/B
+OtvKz/f+VEVvcWD3gWNlJDTZIVOFllNWjIZUlA4ZoqenQkbK8Q4nfV1FOht4yjCQ
+TlN1oMtiWk297i5Zo4UBzPzy4w774I39oh/g8dT/WXr2/5s+7SDV38xNh6Q2A34o
+79T35wUcfUrZ93/O7dKjb/6d8hx2FMha0wVKqY4lmG1lQE3bbx3kakec0PdvU5kO
+YHKlpqj3pMR7CpMa+4yL/iXFwWYmnK+uu+zw7JR7PwvH1CzrnvW438wjQ1QmYbSx
+mHHOE89X67Lsl5hn81qYWBhpwAlBwi1qscsE0cV9GcFyKqWFqZsj5coM9u3CRfvy
+lrWe1OUCgYEA+LBUFEd3Hxs4sFiYElJ8R9SAs1udaqPvAl01hTEijJLfYlMMVs/y
+rgNN7j22zjDak2f8QdyMJZX7EZdRmdYcHO0csYOwbYvalzcnwk+U3mxmdD3r4xSo
+DSvkJ70fogAqUlcVIg2re6fCmZVJQTvMQYTVEM8zQomJRt/Lb2esSfsCgYEA8+zv
+44aToe8uqiDs4w8guRW7LCDkTw4z4IVo9JUibIaPjaAs5bZEBXSB43EEywXCR75H
+fML0rU1PVvKh1rqcvZdVzm+XMWVr3asPk0sapaiHaTcmyZvJRDxxqbLFp0zRP1T6
+cCtXNFdHWU4KiuKrUi6cDyOKchpfkSZa4seiT+cCgYB+n4FgBfdQPlMB70oW4irn
+g/q32CjxuGCk6oKqu5bkzo+xB6obtavSEFqouIGQwO056tNVUY+GP7Rjg5GH663K
+yKw4cl3tmS0Gm43B8TVSfw03mKO3rrfWZQe5eCFYIg9qd26KNT2gK435FzsCXQkm
+PxUhhu6JrW/ZR2/U3Iur6wKBgADrWLAb1ryagSuE+j+U1AO+kDkHWrTtkcZ72jxp
+v3p3O11GSEUJXdJDcSXhTCpTuDq6/dv7hB6PFwh126RKicKxKlKf2wsFndV1Cpb8
+hnovW2tLGOtTmfuW2rrQAKyzvmolsNfxYd/BoHQ2thV16z1hDZeFA8WQUeHjKh6G
+sBbrAoGATdtQlaUxx4izua6k02ihkxx/cRYwDl2N8UDvDBHokS7vJFMX8b8NpsGg
+zMElnqSpu/pe/0UG7N2MtPF6uyMcX8AZzzcsRkiMkDvWJzYt8Jpf+Eyd/uryF+Yv
+yrXaOEY83tm6x/fny5ZaZmk8lNth7bfWywuTMkZLX3fYpWtIeE4=
+-----END RSA PRIVATE KEY-----
diff --git a/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
new file mode 100644
index 0000000000000000000000000000000000000000..015f2b42e71920e00de090cbb1108d9a12ed5f0c
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/docker_cluster/ssh/id_rsa.mpi.pub
@@ -0,0 +1 @@
+ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDs9YtmaB0nnTx0PXlPr7GoYO8v0SHOj1KCyc3rEn6qlS+86LjVrFbQCpHgyc9OD+66+T7Hi1Sm6585bjDbZRMaOx2+YhVZr26Mu1xGI3dg9j5JT5g7kV+oqDuv+EGf78YxxKh9/BKzu9WCaEEGen+YmHY7Vw4AVkR73z/juZuDf+QcECTfo05CS1pTeXjwHfwSEFOeNZ9wQabUdxNE8XFdXlaKSjpmEoUS4dxNUqeDhE61SoSaAfIjRoHtMDGDidrR6Mi2xhYQ9YZ4M7Dqaz/gzDKXE2ixbMcU9b2njOAsIs2v5vpsMN9gqNdl+UC5NC1CEuYlTAP265gsCKOlITZ9 oweidner@peahi
diff --git a/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
new file mode 100644
index 0000000000000000000000000000000000000000..c645495448f9844de5ae9024b6a0f41452522765
--- /dev/null
+++ b/paddle/scripts/cluster_train_v2/openmpi/start_mpi_train.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# General trainning configurations
+
+NICS=eth0
+PADDLE_INIT_PORT=7164
+PADDLE_INIT_PORTS_NUM=1
+PADDLE_INIT_PORTS_NUM_FOR_SPARSE=1
+PADDLE_INIT_PSERVERS=$(cat machines | sed -e ':a' -e 'N' -e '$!ba' -e 's/\n/,/g')
+PADDLE_INIT_USE_GPU=False
+
+PADDLE_INIT_NUM_GRADIENT_SERVERS=${OMPI_COMM_WORLD_SIZE}
+PADDLE_INIT_TRAINER_ID=${OMPI_COMM_WORLD_RANK}
+PADDLE_CLUSTER_TRAIN=True
+
+env
+
+# start pserver
+stdbuf -oL nohup paddle pserver --port=$PADDLE_INIT_PORT --ports_num=$PADDLE_INIT_PORTS_NUM \
+  --ports_num_for_sparse=$PADDLE_INIT_PORTS_NUM_FOR_SPARSE --nics=$NICS \
+  --comment=paddle_cluster_pserver \
+  --num_gradient_servers=$PADDLE_INIT_NUM_GRADIENT_SERVERS &> logs/pserver.log &
+
+# start trainer
+# NOTE: train.py will use the above environment variables as configuration
+python train.py &> logs/train.log
+
+# kill background pservers when train finishes
+ps -ef | grep pserver | awk '{print $2}' | xargs kill
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 2ac455d771bf78377ce4ee7d921393d3b3958e3c..a08716c5a559def54bb7b989f250b489f6a805a2 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -141,10 +141,17 @@ RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
 EOF
 fi
 
+# Install the NCCL development package only when building the GPU image.
+NCCL_DEPS=""
+if [[ ${WITH_GPU} == "ON" ]]; then
+  NCCL_DEPS="apt-get install -y libnccl-dev &&"
+fi
+
 cat >> /paddle/build/Dockerfile <<EOF
 ADD python/dist/*.whl /
 # run paddle version to install python packages first
 RUN apt-get update &&\
+    ${NCCL_DEPS}\
     apt-get install -y wget python-pip && pip install -U pip && \
     pip install /*.whl; apt-get install -f -y && \
     apt-get clean -y && \
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 26f9c0fcd4e045f5d603fc4e4b16691a418823ca..5c4b5a2495182ea5d2b3341cff650dfb4d8b0c0f 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -18,7 +18,7 @@ function version(){
         echo "PaddlePaddle @PADDLE_VERSION@, compiled with"
         echo "    with_avx: @WITH_AVX@"
         echo "    with_gpu: @WITH_GPU@"
-        echo "    with_mkldnn: @WITH_MKLDNN"
+        echo "    with_mkldnn: @WITH_MKLDNN@"
         echo "    with_mklml: @WITH_MKLML@"
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
index cb483b0ffc0a1d99978508bc16464a7716d2bac2..e71d243efa2041cc0624b8273e1bfabaa03ce106 100755
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
@@ -11,7 +11,13 @@ set -e
 # install glide
 curl https://glide.sh/get | bash
 eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
-go get -u github.com/alecthomas/gometalinter
+
+# set up go environment for running gometalinter
+mkdir -p $GOPATH/src/github.com/PaddlePaddle/
+ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
+cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
+
+go get github.com/alecthomas/gometalinter
 gometalinter --install
 
 cd $TRAVIS_BUILD_DIR
@@ -19,10 +25,7 @@ export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
 
-# set up go environment for running gometalinter
-mkdir -p $GOPATH/src/github.com/PaddlePaddle/
-ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
-cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
+
 
 if ! pre-commit run -a ; then
     git diff
diff --git a/paddle/string/to_string_test.cc b/paddle/string/to_string_test.cc
index 542c771a98ec8ae187cd4f821ed1ee4373427041..971484dd0c073762e99f3926576eb21b96197769 100644
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
@@ -36,4 +36,4 @@ TEST(to_string, user_defined) {
   using namespace paddle::string;
   UserDefinedClass instance;
   ASSERT_EQ(kOutputString, to_string(instance));
-}
\ No newline at end of file
+}
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 18ae6cc9382a444d549824c8f8d57f962f17d427..a70673ffec8812d86b9a0c13f15ef0b378dcf3ce 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -29,8 +29,9 @@ using namespace std;     // NOLINT
 int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
+
   string confFile = FLAGS_config_file;
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   FLAGS_use_gpu = false;
 #endif
   auto config = std::make_shared<TrainerConfigHelper>(confFile);
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 066837ca959e46dbe3b39c661aa1bab11cbf2734..5ebbb99c94bce45d295ae0bf585f2cf864bfc4d4 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -39,15 +39,18 @@ add_test(NAME test_CompareTwoNets
 
 ################ test_CompareMKLDNNandCPU ######################
 if(WITH_MKLDNN)
-  add_unittest_without_exec(test_CompareMKLDNNandCPU
-      test_CompareTwoNets.cpp)
-  add_test(NAME test_CompareMKLDNNandCPU
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-          ${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU
-              --config_file_a=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_a=True
-              --config_file_b=trainer/tests/sample_trainer_config_simple_net.conf --use_mkldnn_b=False
-              --use_gpu=False
-      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+  function(gen_command VAR_NAME CONFIG_FILE)  # stores the MKLDNN-vs-CPU test command for CONFIG_FILE in ${VAR_NAME}
+    set(${VAR_NAME} "${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh" "-d" "${PADDLE_SOURCE_DIR}/python/"
+        "${CMAKE_CURRENT_BINARY_DIR}/test_CompareMKLDNNandCPU --use_gpu=False"
+        "--config_file_a=trainer/tests/${CONFIG_FILE} --use_mkldnn_a=True"
+        "--config_file_b=trainer/tests/${CONFIG_FILE} --use_mkldnn_b=False"
+        "WORKING_DIRECTORY" "${PADDLE_SOURCE_DIR}/paddle/" PARENT_SCOPE)  # hand the list back to the caller
+  endfunction()
+  add_unittest_without_exec(test_CompareMKLDNNandCPU test_CompareTwoNets.cpp)
+  gen_command(compare_simple_net "sample_trainer_config_simple_net.conf")
+  gen_command(compare_branch_net "sample_trainer_config_branch_net.conf")
+  add_test(NAME test_CompareMKLDNNandCPU_simple_net COMMAND ${compare_simple_net})
+  add_test(NAME test_CompareMKLDNNandCPU_branch_net COMMAND ${compare_branch_net})
 endif()
 
 ############### test_CompareTwoOpts ###################
diff --git a/paddle/trainer/tests/sample_trainer_config_branch_net.conf b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
new file mode 100644
index 0000000000000000000000000000000000000000..a073708a184d6392a4eead69272e684013f1c751
--- /dev/null
+++ b/paddle/trainer/tests/sample_trainer_config_branch_net.conf
@@ -0,0 +1,103 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+################################### Data Configuration ###################################
+TrainData(ProtoData(files = "trainer/tests/mnist.list"))
+################################### Algorithm Configuration ###################################
+settings(batch_size = 128,
+         learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
+################################### Network Configuration ###################################
+data = data_layer(name ="input", size=784)
+
+tmp = img_conv_layer(input=data,  # stem: 3x3 conv, 32 filters, on the 1-channel input
+            num_channels=1,
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+a1 = img_conv_layer(input=tmp,  # branch a1: 1x1 conv on the stem output
+            filter_size=1,
+            num_filters=32,
+            padding=0,
+            shared_biases=True,
+            act=ReluActivation())
+
+a2 = img_conv_layer(input=tmp,  # branch a2: 3x3 conv on the same stem output
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+tmp = addto_layer(input=[a1, a2],  # merge branches a1 and a2 with addto_layer
+            act=ReluActivation(),
+            bias_attr=False)
+
+tmp = img_pool_layer(input=tmp,
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=AvgPooling())
+
+b1 = img_conv_layer(input=tmp,  # branch b1: 3x3 conv (32 filters), then max pooling
+            filter_size=3,
+            num_filters=32,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+b1 = img_pool_layer(input=b1,
+            pool_size=3,
+            stride=2,
+            padding=0,
+            pool_type=MaxPooling())
+
+b2 = img_conv_layer(input=tmp,  # branch b2: 3x3 conv (64 filters), then max pooling
+            filter_size=3,
+            num_filters=64,
+            padding=1,
+            shared_biases=True,
+            act=ReluActivation())
+
+b2 = img_pool_layer(input=b2,
+            pool_size=5,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = concat_layer(input=[b1, b2])  # join branches b1 and b2
+
+tmp = img_pool_layer(input=tmp,
+            num_channels=96,  # presumably 32 (b1) + 64 (b2) concatenated channels -- confirm
+            pool_size=3,
+            stride=2,
+            padding=1,
+            pool_type=MaxPooling())
+
+tmp = fc_layer(input=tmp, size=64,
+            bias_attr=False,
+            act=TanhActivation())
+
+output = fc_layer(input=tmp, size=10,  # 10-way softmax output
+            bias_attr=True,
+            act=SoftmaxActivation())
+
+lbl = data_layer(name ="label", size=10)
+
+cost = classification_cost(input=output, label=lbl)
+outputs(cost)
diff --git a/paddle/trainer/tests/sample_trainer_config_simple_net.conf b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
index 77f78161535c49da4ef7fc1563cff58c021aecef..2ba71884d0953dc721808732fde12e695c6a757d 100644
--- a/paddle/trainer/tests/sample_trainer_config_simple_net.conf
+++ b/paddle/trainer/tests/sample_trainer_config_simple_net.conf
@@ -17,7 +17,7 @@ from paddle.trainer_config_helpers import *
 ################################### Data Configuration ###################################
 TrainData(ProtoData(files = "trainer/tests/mnist.list"))
 ################################### Algorithm Configuration ###################################
-settings(batch_size = 1000,
+settings(batch_size = 128,
          learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
 ################################### Network Configuration ###################################
 data = data_layer(name ="input", size=784)
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index e855a8fe2e09aa0f16a73f3e7bcc2f32921092f8..f3a964acb69be059a43470f7b68910a3b6cecaab 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -146,7 +146,7 @@ void compareGradient(comData& comDataCpu, comData& comDataGpu) {
 }
 
 int main(int argc, char** argv) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   exit(0);
 #endif
   paddle::initMain(argc, argv);
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 813275518e411d6e963e23df634541f771096e0f..5f1834bd730375fc10762fc19788d0c693f8e752 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -174,7 +174,7 @@ TEST(compareSparse, multiGradientMachine) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
@@ -198,7 +198,7 @@ TEST(compareSparse, NeuralNetwork) {
     FLAGS_local = local;
     FLAGS_ports_num_for_sparse = 5;
     for (bool useGpu : {false, true}) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
       if (useGpu) continue;
 #endif
       FLAGS_parallel_nn = useGpu;
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 264bc46ebcd0aa17fd605e537fcb2c316ef31162..425b3d10a38086463784ba2a18db1293efe96e92 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -51,7 +51,7 @@ void checkGradientTest(const string& configFile,
 
 TEST(checkGradient, cpu) { checkGradientTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkGradient, gpu) { checkGradientTest(configFile1, true, false); }
 
 TEST(checkGradient, multiGpu) {
@@ -97,7 +97,7 @@ TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
   checkGradientTest(configFile3, false, false);
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   checkGradientTest(configFile3, true, true);
 #endif
 }
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 00ba61377aeff17d82e03f7560c0d71b3570d14f..b2a93d4d5eea37ad716b59427f2aa4409d2f537d 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -79,7 +79,7 @@ void trainerOnePassTest(const string& configFile,
 // 1. test trainer (cpu, gpu).
 TEST(trainerOnePass, cpu) { trainerOnePassTest(configFile1, false, false); }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(trainerOnePass, gpu) { trainerOnePassTest(configFile1, true, false); }
 
 TEST(trainerOnePass, gpu2) { trainerOnePassTest(configFile1, true, false, 2); }
@@ -94,7 +94,7 @@ TEST(trainerOnePass, parallel) {
 #endif
 
 // 2. test average_window.
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(average_window, gpu) {
   trainerOnePassTest(configFile1, true, false, 4, 0.01);
 }
@@ -266,7 +266,7 @@ TEST(checkRemoteUpdater, cpuTrainerOldUpdater) {
   checkRemoteParameterUpdaterTest(configFile1, false, false, 1, true);
 }
 
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
 TEST(checkRemoteUpdater, gpuTrainer) {
   checkRemoteParameterUpdaterTest(configFile1, true, false);
 }
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index 1322e77178a4f5674f41943f886a17be8337bd75..a8fbe31c2b1e228107dfc19483444409bfcbf788 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -113,7 +113,7 @@ void testGeneration(const string& configFile,
 #ifndef PADDLE_TYPE_DOUBLE
 
 TEST(RecurrentGradientMachine, test_generation) {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   const auto useGpuConfs = {false};
 #else
   const auto useGpuConfs = {true, false};
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index ab1c181c62cdbee8cc5f804ec9aaf63ac5464ad6..8f100f02e90bcbc7fdcf6f053aec6f95cfb09c1a 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "Flags.h"
 
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
 DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index 22ce2534d3468ded36221810aa61c15b37f13f3d..9579881ea3b92abab0189631184bab515afb67a3 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -218,7 +218,7 @@ protected:
  * *d2* is peer device to enable direct access to by the d1 device.
  */
 inline void enablePeerAccess(int d1, int d2) {
-#ifndef PADDLE_ONLY_CPU
+#ifdef PADDLE_WITH_CUDA
   if (hl_device_can_access_peer(d1, d2)) {
     SetDevice dev(d1);
     hl_device_enable_peer_access(d2);
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index f53d6420bbbdf66f8f355af95c6b11c30a3bfab9..004d62451cddfee8fbd687938086e04ecb2332a9 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -48,7 +48,7 @@ void printVersion(std::ostream& os);
  * @return return true if paddle compiled with GPU
  */
 constexpr bool isWithGpu() {
-#ifdef PADDLE_ONLY_CPU
+#ifndef PADDLE_WITH_CUDA
   return false;
 #else
   return true;
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 6212c2e60a8ed94ecc1d6e58535a2b3d365e3eb8..5d898d860cfc6dc26eaf5a81d8aed6d757ed5831 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -1,4 +1,10 @@
-file(GLOB proto_filenames . *.proto)
+if (MOBILE_INFERENCE)
+    file(GLOB proto_filenames . ModelConfig.proto ParameterConfig.proto
+         TrainerConfig.proto DataConfig.proto)
+else()
+    file(GLOB proto_filenames . *.proto)
+endif()
+
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 proto_library(paddle_proto SRCS ${proto_filenames})
 
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 248da4ae8d1fb24652625ae8fc9ef314a028b912..05635833bf1645f78f5ba15caee3e9b8da9f5544 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -175,7 +175,7 @@ def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
 
 dense_vector = dense_slot
 sparse_binary_vector = sparse_non_value_slot
-sparse_vector = sparse_value_slot
+sparse_float_vector = sparse_value_slot
 integer_value = index_slot
 
 # dense_array can be used for variable-length input feature.
@@ -216,7 +216,7 @@ def sparse_binary_vector_sub_sequence(dim):
     return sparse_binary_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def sparse_vector_sequence(dim):
+def sparse_float_vector_sequence(dim):
     """
     Data type of a sequence of sparse vector, which most elements are zero,
     others could be any float value.
@@ -226,11 +226,11 @@ def sparse_vector_sequence(dim):
     :return: An input type object
     :rtype: InputType
     """
-    return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
+    return sparse_float_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
-def sparse_vector_sub_sequence(dim):
-    return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
+def sparse_float_vector_sub_sequence(dim):
+    return sparse_float_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
 def integer_value_sequence(value_range):
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 098a51ab8791290d3e0ffa2c3703c724dd2387b9..09c92d3513e86a7657880c01736f5f41f53cfcf6 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -559,6 +559,9 @@ class IdentityOffsetProjection(Projection):
                                                        **xargs)
         self.proj_conf.offset = offset
 
+    def calc_output_size(self, input_layer_config):
+        return 0  # depends on the outside MixedLayer
+
     def calc_parameter_size(self, input_size, output_size):
         return 0
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 74025d2a7bb68f87afd24bb4b70ec425ba0dcb64..09315b9d9224076d91c16a6c0b949d4ab289bf70 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -142,6 +142,7 @@ __all__ = [
     'img_pool3d_layer',
     'scale_shift_layer',
     'img_conv3d_layer',
+    'resize_layer',
 ]
 
 
@@ -250,6 +251,8 @@ class LayerType(object):
     KMAX_SEQ_SCORE = 'kmax_seq_score'
     SCALE_SHIFT_LAYER = 'scale_shift'
 
+    RESIZE = 'resize'
+
     @staticmethod
     def is_layer_type(type_name):
         """
@@ -315,7 +318,7 @@ class LayerOutput(object):
     :param activation: Layer Activation.
     :type activation: BaseActivation.
     :param parents: Layer's parents.
-    :type parents: list|tuple|collections.Sequence
+    :type parents: list | tuple | collections.Sequence
     """
 
     def __init__(self,
@@ -432,7 +435,7 @@ def full_matrix_projection(input, size=0, param_attr=None):
                                      size=100,
                                      param_attr=ParamAttr(name='_proj'))
 
-    :param input: input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param size: The parameter size. Means the width of parameter.
     :type size: int
@@ -468,7 +471,7 @@ def trans_full_matrix_projection(input, size=0, param_attr=None):
                                                 initial_mean=0.0,
                                                 initial_std=0.01))
 
-    :param input: input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param size: The parameter size. Means the width of parameter.
     :type size: int
@@ -513,7 +516,7 @@ def table_projection(input, size=0, param_attr=None):
                                param_attr=ParamAttr(name='_proj'))
 
 
-    :param input: Input layer, which must contains id fields.
+    :param input: The input of this layer, which must contain id fields.
     :type input: LayerOutput
     :param size: The parameter size. Means the width of parameter.
     :type size: int
@@ -558,7 +561,7 @@ def identity_projection(input, offset=None, size=None):
 
     Note that both of two projections should not have any parameter.
 
-    :param input: Input Layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param offset: Offset, None if use default.
     :type offset: int
@@ -593,7 +596,7 @@ def slice_projection(input, slices):
 
     Note that slice_projection should not have any parameter.
 
-    :param input: Input Layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param slices: An array of slice parameters.
                    Each slice contains the start and end offsets based
@@ -631,7 +634,7 @@ def scaling_projection(input, param_attr=None):
 
        proj = scaling_projection(input=layer)
 
-    :param input: Input Layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
@@ -660,7 +663,7 @@ def dotmul_projection(input, param_attr=None):
 
        proj = dotmul_projection(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param param_attr: Parameter config, None if use default.
     :type param_attr: ParameterAttribute
@@ -731,7 +734,7 @@ def context_projection(input,
     after context projection and not set padding_attr, sequence will
     be [ 0AB ABC BCD CDE DEF EFG FG0 ].
 
-    :param input: Input Sequence.
+    :param input: The input of this layer, which should be a sequence.
     :type input: LayerOutput
     :param context_len: context length.
     :type context_len: int
@@ -741,7 +744,7 @@ def context_projection(input,
     :param padding_attr: Padding Parameter Attribute. If false, it means padding
                          always be zero. Otherwise Padding is learnable, and
                          parameter attribute is set by this parameter.
-    :type padding_attr: bool|ParameterAttribute
+    :type padding_attr: bool | ParameterAttribute
     :return: Projection
     :rtype: Projection
     """
@@ -779,13 +782,13 @@ class MixedLayerType(LayerOutput):
         :type name: basestring
         :param size: layer size.
         :type size: int
-        :param act: activation type.
+        :param act: Activation type.
         :type act: BaseActivation
         :param bias_attr: The Bias Attribute. If the parameter is set to
                           False or something not type of ParameterAttribute,
                           no bias is defined. If the parameter is set to
                           True, the bias is initialized to zero.
-        :type bias_attr: ParameterAttribute|None|Bool|Any
+        :type bias_attr: ParameterAttribute | None | bool | Any
         :param layer_attr: Extra Layer Attribute.
         :type layer_attr: ExtraLayerAttribute or None
         """
@@ -877,15 +880,15 @@ def mixed_layer(size=0,
     :type name: basestring
     :param size: layer size.
     :type size: int
-    :param input: inputs layer. It is an optional parameter. If set,
+    :param input: The input of this layer. It is an optional parameter. If set,
                   then this function will just return layer's name.
-    :param act: Activation Type.
+    :param act: Activation Type. LinearActivation is the default.
     :type act: BaseActivation
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: The extra layer config. Default is None.
     :type layer_attr: ExtraLayerAttribute
     :return: MixedLayerType object can add inputs or layer name.
@@ -926,9 +929,9 @@ def data_layer(name, size, depth=None, height=None, width=None,
     :param size: Size of this data layer.
     :type size: int
     :param height: Height of this data layer, used for image
-    :type height: int|None
+    :type height: int | None
     :param width: Width of this data layer, used for image
-    :type width: int|None
+    :type width: int | None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -963,15 +966,15 @@ def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer for this embedding. NOTE: must be Index Data.
+    :param input: The input of this layer, which must be Index Data.
     :type input: LayerOutput
     :param size: The embedding dimension.
     :type size: int
     :param param_attr: The embedding parameter attribute. See ParameterAttribute
                       for details.
-    :type param_attr: ParameterAttribute|None
+    :type param_attr: ParameterAttribute | None
     :param layer_attr: Extra layer Config. Default is None.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1018,11 +1021,11 @@ def fc_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer. Could be a list/tuple of input layer.
-    :type input: LayerOutput|list|tuple
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
     :param size: The layer dimension.
     :type size: int
-    :param act: Activation Type. Default is tanh.
+    :param act: Activation Type. TanhActivation is the default.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute|list.
     :type param_attr: ParameterAttribute
@@ -1030,9 +1033,9 @@ def fc_layer(input,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1069,8 +1072,8 @@ def printer_layer(input, format=None, name=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer. Could be a list/tuple of input layer.
-    :type input: LayerOutput|list|tuple
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
     :return: LayerOutput
     """
     if isinstance(input, LayerOutput):
@@ -1107,7 +1110,7 @@ def priorbox_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param image: The network input image.
     :type image: LayerOutput
@@ -1303,7 +1306,7 @@ def cross_channel_norm_layer(input, name=None, param_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param param_attr: The Parameter Attribute|list.
     :type param_attr: ParameterAttribute
@@ -1368,20 +1371,20 @@ def pooling_layer(input,
     :type agg_level: AggregateLevel
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param pooling_type: Type of pooling, MaxPooling(default), AvgPooling,
                          SumPooling, SquareRootNPooling.
-    :type pooling_type: BasePoolingType|None
+    :type pooling_type: BasePoolingType | None
     :param stride: The step size between successive pooling regions.
     :type stride: Int
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: The Extra Attributes for layer, such as dropout.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1466,11 +1469,11 @@ def lstmemory(input,
     :type name: basestring
     :param size: DEPRECATED. size of the lstm cell
     :type size: int
-    :param input: input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param reverse: is sequence process reversed or not.
     :type reverse: bool
-    :param act: activation type, TanhActivation by default. :math:`h_t`
+    :param act: Activation type. TanhActivation is the default. :math:`h_t`
     :type act: BaseActivation
     :param gate_act: gate activation type, SigmoidActivation by default.
     :type gate_act: BaseActivation
@@ -1480,11 +1483,11 @@ def lstmemory(input,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: Parameter Attribute.
-    :type param_attr: ParameterAttribute|None|False
+    :type param_attr: ParameterAttribute | None | False
     :param layer_attr: Extra Layer attribute
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1588,14 +1591,14 @@ def grumemory(input,
        gru = grumemory(input)
 
     :param name: The gru layer name.
-    :type name: None|basestring
-    :param input: input layer.
+    :type name: None | basestring
+    :param input: The input of this layer.
     :type input: LayerOutput.
     :param size: DEPRECATED. size of the gru cell
     :type size: int
     :param reverse: Whether sequence process is reversed or not.
     :type reverse: bool
-    :param act: activation type, TanhActivation by default. This activation
+    :param act: Activation type. TanhActivation is the default. This activation
                 affects the :math:`{\\tilde{h_t}}`.
     :type act: BaseActivation
     :param gate_act: gate activation type, SigmoidActivation by default.
@@ -1606,11 +1609,11 @@ def grumemory(input,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: Parameter Attribute.
-    :type param_attr: ParameterAttribute|None|False
+    :type param_attr: ParameterAttribute | None | False
     :param layer_attr: Extra Layer attribute
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1667,7 +1670,7 @@ def last_seq(input,
     :param agg_level: Aggregated level
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param stride: The step size between successive pooling regions.
     :type stride: Int
@@ -1723,7 +1726,7 @@ def first_seq(input,
     :param agg_level: aggregation level
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param stride: The step size between successive pooling regions.
     :type stride: Int
@@ -1796,7 +1799,7 @@ def expand_layer(input,
                              expand_as=layer2,
                              expand_level=ExpandLevel.FROM_NO_SEQUENCE)
 
-    :param input: Input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param expand_as: Expand as this layer's sequence info.
     :type expand_as: LayerOutput
@@ -1806,7 +1809,7 @@ def expand_layer(input,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param expand_level: whether input layer is timestep(default) or sequence.
     :type expand_level: ExpandLevel
     :param layer_attr: extra layer attributes.
@@ -1855,7 +1858,7 @@ def repeat_layer(input,
 
        expand = repeat_layer(input=layer, num_repeats=4)
 
-    :param input: Input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param num_repeats: Repeat the input so many times
     :type num_repeats: int
@@ -1866,7 +1869,7 @@ def repeat_layer(input,
                           False for treating input as column vector and repeating
                           in the row direction.
     :type as_row_vector: bool
-    :param act: Activation type.
+    :param act: Activation type. IdentityActivation is the default.
     :type act: BaseActivation
     :type name: basestring
     :param layer_attr: extra layer attributes.
@@ -1914,13 +1917,13 @@ def seq_reshape_layer(input,
 
        reshape = seq_reshape_layer(input=layer, reshape_size=4)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param reshape_size: the size of reshaped sequence.
     :type reshape_size: int
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param act: Activation type.
+    :param act: Activation type. IdentityActivation is the default.
     :type act: BaseActivation
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
@@ -1928,7 +1931,7 @@ def seq_reshape_layer(input,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -1967,8 +1970,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None):
 
        interpolation = interpolation_layer(input=[layer1, layer2], weight=layer3)
 
-    :param input: Input layer.
-    :type input: list|tuple
+    :param input: The input of this layer.
+    :type input: list | tuple
     :param weight: Weight layer.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
@@ -2020,11 +2023,11 @@ def bilinear_interp_layer(input,
     :param   input:        A input layer.
     :type    input:        LayerOutput.
     :param   out_size_x:   bilinear interpolation output width.
-    :type    out_size_x:   int|None
+    :type    out_size_x:   int | None
     :param   out_size_y:   bilinear interpolation output height.
-    :type    out_size_y:   int|None
+    :type    out_size_y:   int | None
     :param   name:         The layer's name, which cna not be specified.
-    :type    name:         None|basestring
+    :type    name:         None | basestring
     :param   layer_attr:   Extra Layer attribute.
     :type    layer_attr:   ExtraLayerAttribute
     :return: LayerOutput object.
@@ -2072,7 +2075,7 @@ def power_layer(input, weight, name=None, layer_attr=None):
 
        power = power_layer(input=layer1, weight=layer2)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param weight: Weight layer.
     :type weight: LayerOutput
@@ -2116,7 +2119,7 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 
        scale = scaling_layer(input=layer1, weight=layer2)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param weight: Weight layer.
     :type weight: LayerOutput
@@ -2156,7 +2159,7 @@ def trans_layer(input, name=None, layer_attr=None):
 
        trans = trans_layer(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -2194,7 +2197,7 @@ def rotate_layer(input, height, width, name=None, layer_attr=None):
                           height=100,
                           width=100)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param height: The height of the sample matrix
     :type height: int
@@ -2303,22 +2306,21 @@ def hsigmoid(input,
         cost = hsigmoid(input=[layer1, layer2],
                         label=data_layer)
 
-    :param input: Input layers. It could be a LayerOutput or list/tuple of
-                 LayerOutput.
-    :type input: LayerOutput|list|tuple
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
     :param label: Label layer.
     :type label: LayerOutput
     :param num_classes: number of classes.
-    :type num_classes: int|None
+    :type num_classes: int | None
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: Parameter Attribute. None means default parameter.
-    :type param_attr: ParameterAttribute|None
+    :type param_attr: ParameterAttribute | None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -2426,40 +2428,40 @@ def img_conv_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Layer Input.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param filter_size: The x dimension of a filter kernel. Or input a tuple for
                         two image dimension.
-    :type filter_size: int|tuple|list
+    :type filter_size: int | tuple | list
     :param filter_size_y: The y dimension of a filter kernel. Since PaddlePaddle
                         currently supports rectangular filters, the filter's
                         shape will be (filter_size, filter_size_y).
-    :type filter_size_y: int|None
+    :type filter_size_y: int | None
     :param num_filters: Each filter group's number of filter
-    :param act: Activation type. Default is tanh
+    :param act: Activation type. ReluActivation is the default.
     :type act: BaseActivation
     :param groups: Group size of filters.
     :type groups: int
     :param stride: The x dimension of the stride. Or input a tuple for two image
                    dimension.
-    :type stride: int|tuple|list
+    :type stride: int | tuple | list
     :param stride_y: The y dimension of the stride.
     :type stride_y: int
     :param padding: The x dimension of the padding. Or input a tuple for two
                     image dimension
-    :type padding: int|tuple|list
+    :type padding: int | tuple | list
     :param padding_y: The y dimension of the padding.
     :type padding_y: int
     :param dilation: The x dimension of the dilation. Or input a tuple for two
                     image dimension
-    :type dilation: int|tuple|list
+    :type dilation: int | tuple | list
     :param dilation_y: The y dimension of the dilation.
     :type dilation_y: int
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param num_channels: number of input channels. If None will be set
                         automatically from previous output.
     :type num_channels: int
@@ -2613,15 +2615,15 @@ def img_pool_layer(input,
     :param padding: pooling padding width.
     :type padding: int
     :param padding_y: pooling padding height. It's equal to padding by default.
-    :type padding_y: int|None
+    :type padding_y: int | None
     :param name: name of pooling layer
     :type name: basestring.
-    :param input: layer's input
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param pool_size: pooling window width
     :type pool_size: int
     :param pool_size_y: pooling window height. It's eaqual to pool_size by default.
-    :type pool_size_y: int|None
+    :type pool_size_y: int | None
     :param num_channels: number of input channel.
     :type num_channels: int
     :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
@@ -2630,7 +2632,7 @@ def img_pool_layer(input,
     :param stride: stride width of pooling.
     :type stride: int
     :param stride_y: stride height of pooling. It is equal to stride by default.
-    :type stride_y: int|None
+    :type stride_y: int | None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
     :param ceil_mode: Wether to use ceil mode to calculate output height and with.
@@ -2740,20 +2742,20 @@ def img_pool3d_layer(input,
                                  pool_type=MaxPooling())
 
     :param padding: pooling padding width.
-    :type padding: int|tuple|list
+    :type padding: int | tuple | list
     :param name: name of pooling layer
     :type name: basestring.
-    :param input: layer's input
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param pool_size: pooling window width
-    :type pool_size: int|tuple|list
+    :type pool_size: int | tuple | list
     :param num_channels: number of input channel.
     :type num_channels: int
     :param pool_type: pooling type. MaxPooling or AvgPooling. Default is
                       MaxPooling.
     :type pool_type: BasePoolingType
     :param stride: stride width of pooling.
-    :type stride: int|tuple|list
+    :type stride: int | tuple | list
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
     :param ceil_mode: Wether to use ceil mode to calculate output height and with.
@@ -2852,7 +2854,7 @@ def spp_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: layer's input.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param num_channels: number of input channel.
     :type num_channels: int
@@ -2945,8 +2947,8 @@ def img_cmrnorm_layer(input,
         norm = img_cmrnorm_layer(input=net, size=5)
 
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
-    :param input: layer's input.
+    :type name: None | basestring
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param size: Normalize in number of :math:`size` feature maps.
     :type size: int
@@ -3021,7 +3023,7 @@ def batch_norm_layer(input,
                             batch_norm for CPU. Otherwise, select batch norm
                             type based on the specified type. If you use cudnn_batch_norm,
                             we suggested you use latest version, such as v5.1.
-    :type batch_norm_type: None|string, None or "batch_norm" or "cudnn_batch_norm"
+    :type batch_norm_type: None | string, None or "batch_norm" or "cudnn_batch_norm"
     :param act: Activation Type. Better be relu. Because batch
                      normalization will normalize input near zero.
     :type act: BaseActivation
@@ -3031,7 +3033,7 @@ def batch_norm_layer(input,
     :type num_channels: int
     :param bias_attr: :math:`\\beta`, better be zero when initialize. So the
                       initial_std=0, initial_mean=1 is best practice.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: :math:`\\gamma`, better be one when initialize. So the
                        initial_std=0, initial_mean=1 is best practice.
     :type param_attr: ParameterAttribute
@@ -3043,7 +3045,7 @@ def batch_norm_layer(input,
                              testing. If False, it will use the mean
                              and variance of current batch of test data for
                              testing.
-    :type use_global_stats: bool|None.
+    :type use_global_stats: bool | None.
     :param moving_average_fraction: Factor used in the moving average
                                    computation, referred to as facotr,
                                    :math:`runningMean = newMean*(1-factor)
@@ -3104,7 +3106,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
 
        sum_to_one_norm = sum_to_one_norm_layer(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -3140,7 +3142,7 @@ def row_l2_norm_layer(input, name=None, layer_attr=None):
 
        row_l2_norm_layer = row_l2_norm_layer(input=layer)
 
-    :param input: Input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -3198,14 +3200,14 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
     :type name: basestring
     :param input: Input layers. It could be a LayerOutput or list/tuple of
                  LayerOutput.
-    :type input: LayerOutput|list|tuple
-    :param act: Activation Type, default is tanh.
+    :type input: LayerOutput | list | tuple
+    :param act: Activation Type. LinearActivation is the default.
     :type act: BaseActivation
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -3257,8 +3259,8 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: input layers or projections
-    :type input: list|tuple|collections.Sequence
-    :param act: Activation type.
+    :type input: list | tuple | collections.Sequence
+    :param act: Activation type. IdentityActivation is the default.
     :type act: BaseActivation
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3353,7 +3355,7 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
     :type a: LayerOutput
     :param b: input sequence layer
     :type b: LayerOutput
-    :param act: Activation type.
+    :param act: Activation type. IdentityActivation is the default.
     :type act: BaseActivation
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3361,7 +3363,7 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -3437,9 +3439,9 @@ def memory(name,
     :param is_seq: DEPRECATED. is sequence for boot_layer
     :type is_seq: bool
     :param boot_layer: boot layer of memory.
-    :type boot_layer: LayerOutput|None
+    :type boot_layer: LayerOutput | None
     :param boot_bias: boot layer's bias
-    :type boot_bias: ParameterAttribute|None
+    :type boot_bias: ParameterAttribute | None
     :param boot_bias_active_type: boot layer's active type.
     :type boot_bias_active_type: BaseActivation
     :param boot_with_const_id: boot layer's id.
@@ -3534,19 +3536,17 @@ def lstm_step_layer(input,
     :type input: LayerOutput
     :param state: State Layer. :math:`c_{t-1}`
     :type state: LayerOutput
-    :param act: Activation type. Default is tanh
+    :param act: Activation type. TanhActivation is the default.
     :type act: BaseActivation
-    :param gate_act: Gate Activation Type. Default is sigmoid, and should
-                          be sigmoid only.
+    :param gate_act: Gate Activation Type. SigmoidActivation is the default.
     :type gate_act: BaseActivation
-    :param state_act: State Activation Type. Default is sigmoid, and should
-                           be sigmoid only.
+    :param state_act: State Activation Type. TanhActivation is the default.
     :type state_act: BaseActivation
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -3597,13 +3597,15 @@ def gru_step_layer(input,
     :param output_mem:
     :param size:
     :param act:
+    :type act: BaseActivation
     :param name: The name of this layer. It is optional.
-    :param gate_act:
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is the default.
+    :type gate_act: BaseActivation
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: the parameter_attribute for transforming the output_mem
                        from previous step.
     :param layer_attr:
@@ -3659,12 +3661,14 @@ def gru_step_naive_layer(input,
     :param size:
     :param name: The name of this layer. It is optional.
     :param act:
-    :param gate_act:
+    :type act: BaseActivation
+    :param gate_act: Activation type of this layer's two gates. SigmoidActivation is the default.
+    :type gate_act: BaseActivation
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr:
     :param layer_attr:
     :return:
@@ -3675,6 +3679,12 @@ def gru_step_naive_layer(input,
     if size is None:
         size = input.size / 3
 
+    if bias_attr and bias_attr.attr.get("parameter_name", None) is not None:
+        raise ValueError("You should not specify the field `name` in bias_attr."
+                         " Otherwise, the three biases, which correponding to "
+                         " the two gates and the mixed layer for computing Wx+b"
+                         ", will share the same parameter matrix unexpectedly.")
+
     def __gate__(gate_name, offset):
         with mixed_layer(
                 name=name + "_" + gate_name,
@@ -3783,15 +3793,15 @@ def recurrent_layer(input,
         out_{i} = act(in_{i} + out_{i+1} * W) \\ \\ \\text{for} \\ start <= i < end
 
 
-    :param input: Input Layer
+    :param input: The input of this layer.
     :type input: LayerOutput
-    :param act: activation.
+    :param act: Activation type. TanhActivation is the default.
     :type act: BaseActivation
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param param_attr: parameter attribute.
     :type param_attr: ParameterAttribute
     :param name: The name of this layer. It is optional.
@@ -3898,7 +3908,7 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
                   StaticInput will be imported to each time step, and doesn't change
                   through time. It's a mechanism to access layer outside step function.
 
-    :type input: LayerOutput|StaticInput|SubsequenceInput|list|tuple
+    :type input: LayerOutput | StaticInput | SubsequenceInput | list | tuple
 
     :param reverse: If reverse is set true, the recurrent unit will process the
                     input sequence in a reverse order.
@@ -3913,7 +3923,7 @@ def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
                          of words in each sentence) with all layer group's outputs.
                          targetInlink should be one of the layer group's input.
 
-    :type targetInlink: LayerOutput|SubsequenceInput
+    :type targetInlink: LayerOutput | SubsequenceInput
 
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -4031,7 +4041,7 @@ def maxid_layer(input, name=None, layer_attr=None):
 
        maxid = maxid_layer(input=layer)
 
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -4109,7 +4119,7 @@ def eos_layer(input, eos_id, name=None, layer_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Input layer name.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param eos_id: end id of sequence
     :type eos_id: int
@@ -4501,7 +4511,7 @@ def conv_projection(input,
                               num_filters=64,
                               num_channels=64)
 
-    :param input: input layer
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param filter_size: The x dimension of a filter kernel.
     :type filter_size: int
@@ -4526,7 +4536,7 @@ def conv_projection(input,
     :param param_attr: Convolution param attribute. None means default attribute
     :type param_attr: ParameterAttribute
     :param trans: whether it is convTrans or conv
-    :type trans: boolean
+    :type trans: bool
     :return: A DotMulProjection Object.
     :rtype: DotMulProjection
     """
@@ -4634,14 +4644,14 @@ def pad_layer(input,
                        pad_h=[0,0],
                        pad_w=[2,2])
 
-    :param input: layer's input.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param pad_c: padding size in channel dimension.
-    :type pad_c: list|None
+    :type pad_c: list | None
     :param pad_h: padding size in height dimension.
-    :type pad_h: list|None
+    :type pad_h: list | None
     :param pad_w: padding size in width dimension.
-    :type pad_w: list|None
+    :type pad_w: list | None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :param name: The name of this layer. It is optional.
@@ -4776,7 +4786,7 @@ def tensor_layer(a,
     :type b: LayerOutput
     :param size: the layer dimension.
     :type size: int.
-    :param act: Activation Type. Default is tanh.
+    :param act: Activation type. LinearActivation is the default.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
     :type param_attr: ParameterAttribute
@@ -4784,9 +4794,9 @@ def tensor_layer(a,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4833,15 +4843,15 @@ def selective_fc_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
-    :type input: LayerOutput|list|tuple
+    :param input: The input of this layer.
+    :type input: LayerOutput | list | tuple
     :param select: The select layer. The output of select layer should be a
                    sparse binary matrix, and treat as the mask of selective fc.
                    If is None, acts exactly like fc_layer.
     :type select: LayerOutput
     :param size: The layer dimension.
     :type size: int
-    :param act: Activation Type. Default is tanh.
+    :param act: Activation type. TanhActivation is the default.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute.
     :type param_attr: ParameterAttribute
@@ -4849,9 +4859,9 @@ def selective_fc_layer(input,
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4903,12 +4913,12 @@ def sampling_id_layer(input, name=None, layer_attr=None):
 
        samping_id = sampling_id_layer(input=input)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -4941,7 +4951,7 @@ def slope_intercept_layer(input,
 
        scale = slope_intercept_layer(input=input, slope=-1.0, intercept=1.0)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -4950,7 +4960,7 @@ def slope_intercept_layer(input,
     :param intercept: the offset.
     :type intercept: float.
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5010,7 +5020,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None, layer_attr=None):
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5074,10 +5084,10 @@ def block_expand_layer(input,
                                          block_x=1,
                                          block_x=3)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param num_channels: The channel number of input layer.
-    :type num_channels: int|None
+    :type num_channels: int | None
     :param block_x: The width of sub block.
     :type block_x: int
     :param block_y: The width of sub block.
@@ -5091,9 +5101,9 @@ def block_expand_layer(input,
     :param padding_y: The padding size in vertical direction.
     :type padding_y: int
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5152,15 +5162,15 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
                              num_channels=128,
                              groups=4)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param num_channels: The channel number of input layer. If None will be set
                      automatically from previous output.
-    :type num_channels: int|None
+    :type num_channels: int | None
     :param groups: The group number of input layer.
     :type groups: int
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -5217,18 +5227,18 @@ def ctc_layer(input,
                       size=9055,
                       norm_by_times=True)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param label: The data layer of label with variable length.
     :type label: LayerOutput
     :param size: category numbers + 1.
     :type size: int
     :param name: The name of this layer. It is optional.
-    :type name: basestring|None
+    :type name: basestring | None
     :param norm_by_times: Whether to normalization by times. False by default.
     :type norm_by_times: bool
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5294,20 +5304,20 @@ def warp_ctc_layer(input,
                            blank=1000,
                            norm_by_times=False)
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param label: The data layer of label with variable length.
     :type label: LayerOutput
     :param size: category numbers + 1.
     :type size: int
     :param name: The name of this layer. It is optional.
-    :type name: basestring|None
+    :type name: basestring | None
     :param blank: the 'blank' label used in ctc
     :type blank: int
     :param norm_by_times: Whether to normalization by times. False by default.
     :type norm_by_times: bool
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5365,11 +5375,11 @@ def crf_layer(input,
     :param param_attr: Parameter attribute. None means default attribute
     :type param_attr: ParameterAttribute
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
+    :type name: None | basestring
     :param coeff: The coefficient affects the gradient in the backward.
     :type coeff: float
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5435,9 +5445,9 @@ def crf_decoding_layer(input,
     :param param_attr: Parameter attribute. None means default attribute
     :type param_attr: ParameterAttribute
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
+    :type name: None | basestring
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -5496,14 +5506,14 @@ def nce_layer(input,
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param input: The input layers. It could be a LayerOutput of list/tuple of LayerOutput.
-    :type input: LayerOutput|list|tuple|collections.Sequence
+    :type input: LayerOutput | list | tuple | collections.Sequence
     :param label: label layer
     :type label: LayerOutput
     :param weight: weight layer, can be None(default)
     :type weight: LayerOutput
     :param num_classes: number of classes.
     :type num_classes: int
-    :param act: Activation, default is Sigmoid.
+    :param act: Activation type. SigmoidActivation is the default.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute|list.
     :type param_attr: ParameterAttribute
@@ -5512,12 +5522,12 @@ def nce_layer(input,
     :param neg_distribution: The distribution for generating the random negative labels.
                              A uniform distribution will be used if not provided.
                              If not None, its length must be equal to num_classes.
-    :type neg_distribution: list|tuple|collections.Sequence|None
+    :type neg_distribution: list | tuple | collections.Sequence | None
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: layer name.
@@ -5633,7 +5643,7 @@ def rank_cost(left,
                    It is an optional argument.
     :type weight: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
+    :type name: None | basestring
     :param coeff: The coefficient affects the gradient in the backward.
     :type coeff: float
     :param layer_attr: Extra Layer Attribute.
@@ -5698,7 +5708,7 @@ def lambda_cost(input,
                           entire list of get gradient.
     :type max_sort_size: int
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
+    :type name: None | basestring
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -5742,7 +5752,7 @@ def cross_entropy(input,
     :param label: The input label.
     :type input: LayerOutput.
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param coeff: The cost is multiplied with coeff.
                   The coefficient affects the gradient in the backward.
     :type coeff: float.
@@ -5790,7 +5800,7 @@ def cross_entropy_with_selfnorm(input,
     :param label: The input label.
     :type input: LayerOutput.
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param coeff: The coefficient affects the gradient in the backward.
     :type coeff: float.
     :param softmax_selfnorm_alpha: The scale factor affects the cost.
@@ -5827,10 +5837,10 @@ def sum_cost(input, name=None, layer_attr=None):
 
        cost = sum_cost(input=input_layer)
 
-    :param input: The first input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput.
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -5875,7 +5885,7 @@ def huber_regression_cost(input,
     :param label: The input label.
     :type input: LayerOutput.
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param delta: The difference between the observed and predicted values.
     :type delta: float.
     :param coeff: The coefficient affects the gradient in the backward.
@@ -5925,7 +5935,7 @@ def huber_classification_cost(input,
     :param label: The input label.
     :type input: LayerOutput.
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring.
+    :type name: None | basestring.
     :param coeff: The coefficient affects the gradient in the backward.
     :type coeff: float.
     :param layer_attr: Extra Layer Attribute.
@@ -5968,7 +5978,7 @@ def multi_binary_label_cross_entropy(input,
     :param label: The input label.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
+    :type name: None | basestring
     :param coeff: The coefficient affects the gradient in the backward.
     :type coeff: float
     :param layer_attr: Extra Layer Attribute.
@@ -6136,7 +6146,7 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     :param label: The input label.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
-    :type name: None|basestring
+    :type name: None | basestring
     :param coeff: The coefficient affects the gradient in the backward.
     :type coeff: float
     :param layer_attr: Extra Layer Attribute.
@@ -6223,7 +6233,7 @@ def dropout_layer(input, dropout_rate, name=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param dropout_rate: The probability of dropout.
     :type dropout_rate: float
@@ -6282,18 +6292,18 @@ def row_conv_layer(input,
        row_conv = row_conv_layer(input=input_layer, context_len=3)
 
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param context_len: The context length equals the lookahead step number
                         plus one.
     :type context_len: int
-    :param act: Activation Type. Default is linear activation.
+    :param act: Activation Type. LinearActivation is the default.
     :type act: BaseActivation
     :param param_attr: The Parameter Attribute. If None, the parameter will be
                        initialized smartly. It's better to set it by yourself.
     :type param_attr: ParameterAttribute
     :param layer_attr: Extra Layer config.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
 
@@ -6339,7 +6349,7 @@ def prelu_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param partial_sum: this parameter makes a group of inputs share a same weight.
 
@@ -6349,9 +6359,9 @@ def prelu_layer(input,
 
     :type partial_sum: int
     :param param_attr: The parameter attribute. See ParameterAttribute for details.
-    :type param_attr: ParameterAttribute|None
+    :type param_attr: ParameterAttribute | None
     :param layer_attr: Extra layer configurations. Default is None.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6404,37 +6414,37 @@ def gated_unit_layer(input,
     .. code-block:: python
         gated_unit = gated_unit_layer(size=128, input=input_layer))
 
-    :param input: input for this layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param size: output size of the gated unit.
     :type size: int
-    :param act: activation type of the projected input.
+    :param act: Activation type of the projected input. LinearActivation is the default.
     :type act: BaseActivation
     :param name: The name of this layer. It is optional.
     :type name: basestring
     :param gate_attr: Attributes to tune the gate output, for example, error
         clipping threshold, dropout and so on. See ExtraLayerAttribute for
         more details.
-    :type gate_attr: ExtraLayerAttribute|None
+    :type gate_attr: ExtraLayerAttribute | None
     :param gate_param_attr: Attributes to tune the learnable projected matrix
         parameter of the gate.
-    :type gate_param_attr: ParameterAttribute|None
+    :type gate_param_attr: ParameterAttribute | None
     :param gate_bias_attr: Attributes to tune the learnable bias of the gate.
-    :type gate_bias_attr: ParameterAttribute|None
+    :type gate_bias_attr: ParameterAttribute | None
     :param inproj_attr: Attributes to the tune the projected input, for
         example, error clipping threshold, dropout and so on. See
         ExtraLayerAttribute for more details.
-    :type inproj_attr: ExtraLayerAttribute|None
+    :type inproj_attr: ExtraLayerAttribute | None
     :param inproj_param_attr: Attributes to tune the learnable parameter of
         the projection of input.
-    :type inproj_param_attr: ParameterAttribute|None
+    :type inproj_param_attr: ParameterAttribute | None
     :param inproj_bias_attr: Attributes to tune the learnable bias of
         projection of the input.
-    :type inproj_bias_attr: ParameterAttribute|None
+    :type inproj_bias_attr: ParameterAttribute | None
     :param layer_attr: Attributes to tune the final output of the gated unit,
         for example, error clipping threshold, dropout and so on. See
         ExtraLayerAttribute for more details.
-    :type layer_attr: ExtraLayerAttribute|None
+    :type layer_attr: ExtraLayerAttribute | None
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6473,7 +6483,7 @@ def switch_order_layer(input,
                        act=None,
                        layer_attr=None):
     """
-    This layer switch dimension order of image input. 
+    This layer switch dimension order of image input.
     From order "batchSize, channels, height, width"
     to order "batchSize, height, width, channels".
 
@@ -6484,7 +6494,7 @@ def switch_order_layer(input,
        switch = switch_order(input=layer, name='switch', reshape_axis=reshape_axis)
        reshape = {'height':[ 0, 1, 2], 'width':[3]}
 
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -6518,7 +6528,7 @@ def switch_order_layer(input,
 @layer_support()
 def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
     """
-    The crop layer crops images by offset and shape. User can set crop shape by
+    This layer crops images by offset and shape. User can set crop shape by
     args 'shape' explicitly or by reference input layer.
 
     The example usage is:
@@ -6526,10 +6536,10 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
     .. code-block:: python
     crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
 
-    :param input: The input layer.If two inputs were setted,
-                    the second input will be regarded as reference input
-    :type input: LayerOutput or Sequence
-    :param offset: The crop offset
+    :param input: The input of this layer. If two inputs are given, the second input
+                  will be regarded as reference input.
+    :type input: LayerOutput | Sequence
+    :param offset: The crop offset.
     :type offset: Sequence
     :param axis: start axis to be cropped. To image input layer:
         - 0: batch size
@@ -6578,12 +6588,12 @@ def sub_nested_seq_layer(input, selected_indices, name=None):
 
     .. code-block:: python
 
-        sub_nest_seq = sub_nested_seq_layer(input=[data, selected_indices])
+        sub_nest_seq = sub_nested_seq_layer(input=data, selected_indices=selected_ids)
 
 
-    :param input: A nested sequence.
+    :param input: The input of this layer. It is a nested sequence.
     :type input: LayerOutput
-    :param selected_indices: a set of sequence indices in the nested sequence.
+    :param selected_indices: A set of sequence indices in the nested sequence.
     :type input: LayerOutput
     :param name: The name of this layer. It is optional.
     :type name: basestring
@@ -6625,7 +6635,7 @@ def clip_layer(input, min, max, name=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
+    :param input: The input of this layer.
     :type input: LayerOutput.
     :param min: The lower threshold for clipping.
     :type min: double
@@ -6670,12 +6680,12 @@ def seq_slice_layer(input, starts, ends, name=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: input for this layer, it should be a sequence.
+    :param input: The input of this layer, which should be a sequence.
     :type input: LayerOutput
     :param starts: start indices to slice the input sequence.
-    :type starts: LayerOutput|None
+    :type starts: LayerOutput | None
     :param ends: end indices to slice the input sequence.
-    :type ends: LayerOutput|None
+    :type ends: LayerOutput | None
     :return: LayerOutput object.
     :rtype: LayerOutput
 
@@ -6724,9 +6734,9 @@ def kmax_seq_score_layer(input, name=None, beam_size=1):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer. It stores scores over a sequence or a nested
+    :param input: The input of this layer. It stores scores over a sequence or a nested
         sequence and its size must be 1.
-    :type input: LayerOutput.
+    :type input: LayerOutput
     :param beam_size: sequence indices with top beam_size scores are returned.
     :type beam_size: double
     :return: LayerOutput object.
@@ -6782,24 +6792,24 @@ def img_conv3d_layer(input,
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: Layer Input.
+    :param input: The input of this layer.
     :type input: LayerOutput
     :param filter_size: The x dimension of a filter kernel. Or input a list.
-    :type filter_size: int|tuple|list
+    :type filter_size: int | tuple | list
     :param num_filters: Each filter group's number of filter
-    :param act: Activation type. Default is tanh
+    :param act: Activation type. ReluActivation is the default.
     :type act: BaseActivation
     :param groups: Group size of filters.
     :type groups: int
     :param stride: The x dimension of the stride. Or input a tuple for two image
                    dimension.
-    :type stride: int|tuple|list
+    :type stride: int | tuple | list
     :param padding: The x dimension of the padding. Or input a tuple for two
                     image dimension
-    :type padding: int|tuple|list
+    :type padding: int | tuple | list
     :param bias_attr: Convolution bias attribute. None means default bias.
                       False means no bias.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :param num_channels: number of input channels. If None will be set
                         automatically from previous output.
     :type num_channels: int
@@ -6913,15 +6923,15 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
 
     :param name: The name of this layer. It is optional.
     :type name: basestring
-    :param input: The input layer.
-    :type input: LayerOutput.
+    :param input: The input of this layer.
+    :type input: LayerOutput
     :param param_attr: The parameter attribute of scaling.
     :type param_attr: ParameterAttribute
     :param bias_attr: The Bias Attribute. If the parameter is set to
                       False or something not type of ParameterAttribute,
                       no bias is defined. If the parameter is set to
                       True, the bias is initialized to zero.
-    :type bias_attr: ParameterAttribute|None|Bool|Any
+    :type bias_attr: ParameterAttribute | None | bool | Any
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -6932,3 +6942,23 @@ def scale_shift_layer(input, name=None, param_attr=None, bias_attr=None):
         bias=ParamAttr.to_bias(bias_attr))
     return LayerOutput(
         name, LayerType.SCALE_SHIFT_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default("resize")
+def resize_layer(input, size, name=None):
+    """
+    The resize layer resizes the input matrix with a shape of [Height, Width]
+    into the output matrix with a shape of [Height x Width / size, size],
+    where size is the parameter of this layer indicating the output dimension.
+
+    :param input: The input of this layer.
+    :type input: LayerOutput
+    :param name: The name of this layer. It is optional.
+    :type name: basestring
+    :param size: The resized output dimension of this layer.
+    :type size: int
+    :return: A LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(name=name, type=LayerType.RESIZE, inputs=Input(input.name), size=size)
+    return LayerOutput(name, LayerType.RESIZE, parents=[input], size=size)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index 93e8ac173e721d9623fce91f30ac4642d273caba..3821d075cba5d39b5808a39093b8570d9302b667 100644
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -26,8 +26,9 @@ __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
     "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
     'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
-    'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
-    'bidirectional_lstm', 'inputs', 'outputs'
+    'simple_attention', 'dot_product_attention', 'simple_gru2',
+    'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs',
+    'outputs'
 ]
 
 ######################################################
@@ -1361,6 +1362,7 @@ def simple_attention(encoded_sequence,
                                 compute attention weight.
     :type transform_param_attr: ParameterAttribute
     :return: a context vector
+    :rtype: LayerOutput
     """
     assert encoded_proj.size == decoder_state.size
     proj_size = encoded_proj.size
@@ -1396,6 +1398,90 @@ def simple_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
+@wrap_name_default()
+def dot_product_attention(encoded_sequence,
+                          attended_sequence,
+                          transformed_state,
+                          softmax_param_attr=None,
+                          name=None):
+    """
+    Calculate and return a context vector with dot-product attention mechanism.
+    The dimension of the context vector equals that of the attended_sequence.
+
+    ..  math::
+
+        a(s_{i-1},h_{j}) & = s_{i-1}^\\mathrm{T} h_{j}
+
+        e_{i,j} & = a(s_{i-1}, h_{j})
+
+        a_{i,j} & = \\frac{exp(e_{i,j})}{\\sum_{k=1}^{T_x}{exp(e_{i,k})}}
+
+        c_{i} & = \\sum_{j=1}^{T_{x}}a_{i,j}z_{j}
+
+    where :math:`h_{j}` is the jth element of encoded_sequence,
+    :math:`z_{j}` is the jth element of attended_sequence,
+    :math:`s_{i-1}` is transformed_state.
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        context = dot_product_attention(encoded_sequence=enc_seq,
+                                        attended_sequence=att_seq,
+                                        transformed_state=state)
+
+    :param name: A prefix attached to the name of each layer that is defined
+                 inside the dot_product_attention.
+    :type name: basestring
+    :param softmax_param_attr: The parameter attribute of sequence softmax
+                               that is used to produce attention weight.
+    :type softmax_param_attr: ParameterAttribute
+    :param encoded_sequence: The output hidden vectors of the encoder.
+    :type encoded_sequence: LayerOutput
+    :param attended_sequence: The sequence to be attended. The attention weight is
+                              computed from the dot product of transformed_state and
+                              encoded_sequence, and is then used to scale
+                              attended_sequence before sum pooling.
+    :type attended_sequence: LayerOutput
+    :param transformed_state: The transformed hidden state of decoder in previous time step.
+                              Since the dot-product operation will be performed on it and the
+                              encoded_sequence, their dimensions must be equal. For flexibility,
+                              we suppose transformations of the decoder's hidden state have been
+                              done outside dot_product_attention and no more will be performed
+                              inside. Then users can use either the original or transformed one.
+    :type transformed_state: LayerOutput
+    :return: The context vector.
+    :rtype: LayerOutput
+    """
+    assert transformed_state.size == encoded_sequence.size
+
+    expanded = expand_layer(
+        input=transformed_state,
+        expand_as=encoded_sequence,
+        name='%s_expand' % name)
+
+    m = linear_comb_layer(
+        weights=expanded,
+        vectors=encoded_sequence,
+        name='%s_dot-product' % name)
+
+    attention_weight = fc_layer(
+        input=m,
+        size=1,
+        act=SequenceSoftmaxActivation(),
+        param_attr=softmax_param_attr,
+        name="%s_softmax" % name,
+        bias_attr=False)
+
+    scaled = scaling_layer(
+        weight=attention_weight,
+        input=attended_sequence,
+        name='%s_scaling' % name)
+
+    return pooling_layer(
+        input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
+
+
 def inputs(layers, *args):
     """
     Declare the inputs of network. The order of input should be as same as
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 8a204a96f3ef57673cef65306d0bf8e8c3409751..6a4550c209762362d40f8a2afaf526a1fe53ca6b 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -10,6 +10,6 @@ test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_la
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
 test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer
 test_seq_slice_layer test_cross_entropy_over_beam test_pooling3D_layer
-test_conv3d_layer test_deconv3d_layer test_BatchNorm3D)
+test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..9399252b23d0ec0cce918196bf4077a51e757eaf
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_resize_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__resize_0__"
+  type: "resize"
+  size: 150
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__resize_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__resize_0__"
+  input_layer_names: "input"
+  output_layer_names: "__resize_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..09a6f507338c1da8e9ce60555f8ca2576704170c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_resize_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+resized = resize_layer(input=data, size=150)
+
+outputs(resized)
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index e66bf67d7949057486eb54c46f39128fad5dae55..a0ffd31c545eb10dd8c2f14746ee90df58700e61 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -10,7 +10,8 @@ There are:
 * EndPass
 """
 __all__ = [
-    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
+    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult',
+    'EndForwardBackward'
 ]
 
 
@@ -73,6 +74,17 @@ class BeginIteration(object):
         self.batch_id = batch_id
 
 
+class EndForwardBackward(object):
+    """
+    Event On One Batch ForwardBackward Complete.
+    """
+
+    def __init__(self, pass_id, batch_id, gm):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+        self.gm = gm
+
+
 class EndIteration(WithMetric):
     """
     Event On One Batch Training Complete.
diff --git a/python/paddle/v2/framework/backward.py b/python/paddle/v2/framework/backward.py
new file mode 100644
index 0000000000000000000000000000000000000000..6827792cb351243f926aeca5f37324dc987d6a79
--- /dev/null
+++ b/python/paddle/v2/framework/backward.py
@@ -0,0 +1,45 @@
+from paddle.v2.framework import framework as framework
+
+__all__ = ['append_backward_ops']
+
+
+def append_backward_ops(loss, parameter_list=None, no_grad_set=None):
+    """
+    Create and add gradient Operators in BlockDesc to compute
+    gradients of `loss` for parameters in parameter_list
+
+    :param loss: a variable generated by cost function.
+    :type loss: Variable
+    :param no_grad_set: variable that should not create gradient
+    :type no_grad_set: set
+    :param parameter_list: names of the parameters whose gradients are
+        needed; defaults to all parameters of the program's global block.
+    :type parameter_list: list
+    :return: list of (parameter, gradient) pairs; gradient may be None.
+    :rtype: list[tuple]
+    """
+    assert isinstance(loss, framework.Variable)
+    param_grad_map = loss.block.program.append_backward(loss, no_grad_set or
+                                                        set())
+    if parameter_list is not None:
+        parameters = parameter_list
+    else:
+        params = loss.block.program.global_block().all_parameters()
+        parameters = [param.name for param in params]
+    params_and_grads = []
+    for param in parameters:
+        if param not in param_grad_map:
+            raise ValueError("param %s is not in map" % param)
+        grad_info = param_grad_map[param]  # (grad var name, block index)
+        grad_block = loss.block.program.block(grad_info[1])
+        if not grad_block.has_var(grad_info[0]):
+            raise ValueError("grad block[{0}] did not have grad var {1}".format(
+                grad_info[1], grad_info[0]))
+        # Get the param var from the global block
+        param_var = loss.block.program.global_block().var(param)
+        grad_var = grad_block.var(grad_info[0])
+        if loss.block.has_var(grad_info[0]):
+            params_and_grads.append((param_var, grad_var))
+        else:
+            params_and_grads.append((param_var, None))
+    return params_and_grads
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py
index 1b5580c8b30f69016f187b1d8710a57b5f7cfa9f..c07f9a6ab96ac86fd6d20fbe0bc560845107f063 100644
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -5,7 +5,7 @@ Default scope function.
 thread-local stack of Scope. Top of that stack is current scope, the bottom 
 of that stack is all scopes' parent. 
 
-Invoking `new_var/find_var`  can `new/find` variable in current scope. 
+Invoking `var/find_var`  can `new/find` variable in current scope. 
 Invoking `enter_local_scope/leave_local_scope` can create or destroy local 
 scope. 
 
@@ -19,7 +19,7 @@ import threading
 __tl_scope__ = threading.local()
 
 __all__ = [
-    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var',
+    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'var',
     'find_var', 'scoped_function'
 ]
 
@@ -54,11 +54,11 @@ def leave_local_scope():
     get_cur_scope().drop_kids()
 
 
-def new_var(name):
+def var(name):
     """
     create variable in current scope.
     """
-    return get_cur_scope().new_var(name)
+    return get_cur_scope().var(name)
 
 
 def find_var(name):
diff --git a/python/paddle/v2/framework/executor.py b/python/paddle/v2/framework/executor.py
new file mode 100644
index 0000000000000000000000000000000000000000..82b83d4bb6ac9d4c6a67d925db290c7c5e2d933f
--- /dev/null
+++ b/python/paddle/v2/framework/executor.py
@@ -0,0 +1,64 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.framework import Block, Program
+
+g_scope = core.Scope()
+
+
+class Executor(object):
+    def __init__(self, places):
+        if not isinstance(places, (list, tuple)):
+            places = [places]
+
+        act_places = []
+        for each in places:
+            p = core.Place()
+            p.set_place(each)
+            act_places.append(p)
+
+        self.executor = core.Executor(act_places)
+
+    def run(self,
+            program,
+            feed,
+            fetch_list,
+            feed_var_name='feed',
+            fetch_var_name='fetch',
+            scope=None):
+        """Run a clone of `program` with `feed`; return the fetched values."""
+        if not isinstance(program, Program):
+            raise TypeError("program must be an instance of Program")
+
+        scope = g_scope if scope is None else scope
+
+        program = program.clone()
+        global_block = program.global_block()
+        feed_var = global_block.create_var(
+            name=feed_var_name,
+            type=core.VarDesc.VarType.FEED_MINIBATCH,
+            persistable=True)
+
+        for i, name in enumerate(feed):
+            out = global_block.var(name)
+            global_block.prepend_op(
+                'feed',
+                inputs={'X': [feed_var]},
+                outputs={'Out': [out]},
+                attrs={'col': i})
+            core.set_feed_variable(scope, feed[name], feed_var.name, i)
+
+        fetch_var = global_block.create_var(
+            name=fetch_var_name,
+            type=core.VarDesc.VarType.FETCH_LIST,
+            persistable=True)
+        for i, var in enumerate(fetch_list):
+            global_block.append_op(
+                type='fetch',
+                inputs={'X': [var]},
+                outputs={'Out': [fetch_var]},
+                attrs={'col': i})
+
+        self.executor.run(program.desc, scope, 0)
+        return [
+            core.get_fetch_variable(scope, fetch_var_name, i)
+            for i in xrange(len(fetch_list))
+        ]
diff --git a/python/paddle/v2/framework/framework.py b/python/paddle/v2/framework/framework.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3f8be8be9ac5c0c6c15646d39d4796df0fd87e2
--- /dev/null
+++ b/python/paddle/v2/framework/framework.py
@@ -0,0 +1,504 @@
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.framework_pb2 as framework_pb2
+import collections
+import numpy as np
+import copy
+
+__all__ = ['Block', 'Variable', 'Program', 'Operator']
+
+
+class Variable(object):
+    def __init__(self,
+                 block,
+                 type=core.VarDesc.VarType.LOD_TENSOR,
+                 name=None,
+                 shape=None,
+                 dtype=None,
+                 lod_level=None,
+                 persistable=None,
+                 **kwargs):
+        self.block = block
+
+        if name is None:
+            name = Variable._unique_var_name_()
+        is_new_var = False
+        self.desc = self.block.desc.find_var(name)
+
+        if self.desc is None:
+            self.desc = self.block.desc.var(name)
+            is_new_var = True
+
+        if is_new_var:
+            self.desc.set_type(type)
+        elif self.desc.type() != type:
+            raise ValueError("Variable {0} has been created before. The "
+                             "previous type is {1}; the new type is {2}. They"
+                             " are not matched".format(self.name,
+                                                       self.desc.type(), type))
+
+        if shape is not None:
+            if is_new_var:
+                self.desc.set_shape(shape)
+            else:
+                old_shape = self.shape
+                shape = tuple(shape)
+                if shape != old_shape:
+                    raise ValueError(
+                        "Variable {0} has been created before. the previous "
+                        "shape is {1}; the new shape is {2}. They are not "
+                        "matched.".format(self.name, old_shape, shape))
+        if dtype is not None:
+            if not isinstance(dtype, core.DataType):
+                dtype = Variable._convert_np_dtype_to_dtype_(dtype)
+            if is_new_var:
+                self.desc.set_data_type(dtype)
+            else:
+                old_dtype = self.data_type
+                if dtype != old_dtype:
+                    raise ValueError("Variable {0} has been created before. "
+                                     "The previous data type is {1}; the new "
+                                     "data type is {2}. They are not "
+                                     "matched.".format(self.name, old_dtype,
+                                                       dtype))
+
+        if lod_level is not None:
+            if is_new_var:
+                self.desc.set_lod_level(lod_level)
+            else:
+                if lod_level != self.lod_level:
+                    raise ValueError("Variable {0} has been created before. "
+                                     "The previous lod_level is {1}; the new "
+                                     "lod_level is {2}. They are not "
+                                     "matched".format(self.name, self.lod_level,
+                                                      lod_level))
+        if persistable is not None:
+            if is_new_var:
+                self.desc.set_persistable(persistable)
+            else:
+                if persistable != self.persistable:
+                    raise ValueError(
+                        "Variable {0} has been created before. "
+                        "The previous persistable is {1}; the new "
+                        "persistable is {2}. They are not matched".format(
+                            self.name, self.persistable, persistable))
+
+        self.block.vars[name] = self
+        self.op = None  # set by Operator when this variable is an output
+
+    def __str__(self):
+        protostr = self.desc.serialize_to_string()
+        proto = framework_pb2.VarDesc.FromString(str(protostr))
+        return proto.__str__()
+
+    __repr__ = __str__
+
+    @property
+    def persistable(self):
+        return self.desc.persistable()
+
+    @property
+    def name(self):
+        return self.desc.name()
+
+    @property
+    def shape(self):
+        # convert to tuple, make it as same as numpy API.
+        return tuple(self.desc.shape())
+
+    @property
+    def data_type(self):
+        return self.desc.data_type()
+
+    @property
+    def lod_level(self):
+        return self.desc.lod_level()
+
+    @property
+    def type(self):
+        return self.desc.type()
+
+    @staticmethod
+    def _unique_var_name_():
+        uid = core.unique_integer()  # unique during whole process.
+        return "_generated_var_%d" % uid
+
+    @staticmethod
+    def _convert_np_dtype_to_dtype_(np_dtype):
+        dtype = np.dtype(np_dtype)
+        if dtype == np.float32:
+            return core.DataType.FP32
+        elif dtype == np.float64:
+            return core.DataType.FP64
+        elif dtype == np.float16:
+            return core.DataType.FP16
+        elif dtype == np.int32:
+            return core.DataType.INT32
+        elif dtype == np.int16:
+            return core.DataType.INT16
+        elif dtype == np.int64:
+            return core.DataType.INT64
+        elif dtype == np.bool_:
+            return core.DataType.BOOL
+        else:
+            raise ValueError("Not supported numpy dtype " + str(dtype))
+
+
+def get_all_op_protos():
+    """
+    Fetch every op proto registered on the PaddlePaddle C++ side.
+
+    :return: A list of parsed OpProto messages, one per registered op.
+    """
+    serialized = core.get_all_op_protos()
+    return [
+        framework_pb2.OpProto.FromString(str(blob))
+        for blob in serialized
+    ]
+
+
+class OpProtoHolder(object):
+    """Lazily-created singleton mapping operator type names to OpProto."""
+
+    @classmethod
+    def instance(cls):
+        if not hasattr(cls, '_instance'):
+            cls._instance = cls()
+        return cls._instance
+
+    def __init__(self):
+        assert not hasattr(
+            self.__class__,
+            '_instance'), 'Please use `instance()` to get OpProtoHolder object!'
+        self.op_proto_map = {p.type: p for p in get_all_op_protos()}
+
+    def get_op_proto(self, type):
+        """Return the OpProto for `type`; raise ValueError if unregistered."""
+        if type not in self.op_proto_map:
+            raise ValueError("Operator \"%s\" has not been registered." % type)
+        return self.op_proto_map[type]
+
+
+class Operator(object):
+    def __init__(self,
+                 block,
+                 desc,
+                 type=None,
+                 inputs=None,
+                 outputs=None,
+                 attrs=None):
+        self.block = block
+        self.desc = desc
+        if len(self.desc.type()) != 0:
+            return
+        if type is None:
+            raise ValueError(
+                "`type` to initilized an Operator can not be None.")
+        self.desc.set_type(type)
+        proto = OpProtoHolder.instance().get_op_proto(type)
+
+        def find_name(var_list, name):
+            for var_name in var_list:
+                if var_name == name:
+                    return True
+            return False
+
+        if inputs is not None:
+            for in_proto in proto.inputs:
+                found = find_name(inputs, in_proto.name)
+                assert found or in_proto.dispensable, "Input {} not found".format(
+                    in_proto.name)
+
+                if found:
+                    in_argus = inputs[in_proto.name]
+                    if not isinstance(in_argus, list):
+                        in_argus = [in_argus]
+                    if not in_proto.duplicable and len(in_argus) > 1:
+                        raise ValueError(
+                            "Input %s expects only one input, but %d are given."
+                            % (in_proto.name, len(in_argus)))
+                    in_argu_names = []
+                    for argu in in_argus:
+                        in_argu_names.append(argu.name)
+                    self.desc.set_input(in_proto.name, in_argu_names)
+                else:
+                    self.desc.set_input(in_proto.name, [])
+
+        if outputs is not None:
+            given = set()
+            need = set()
+            for n in outputs:
+                given.add(n)
+            for m in proto.outputs:
+                need.add(m.name)
+            if not given == need:
+                raise ValueError(
+                    "Incorrect setting for output(s) of operator \"%s\". Need: [%s] Given: [%s]"
+                    % (type, ", ".join(str(e) for e in need), ", ".join(
+                        str(e) for e in given)))
+
+            for out_proto in proto.outputs:
+                out_argus = outputs[out_proto.name]
+                if not isinstance(out_argus, list):
+                    out_argus = [out_argus]
+                if not out_proto.duplicable and len(out_argus) > 1:
+                    raise ValueError(
+                        "Output %s expects only one output, but %d are given." %
+                        (out_proto.name, len(out_argus)))
+                out_argu_names = []
+                for argu in out_argus:
+                    out_argu_names.append(argu.name)
+                    argu.op = self
+                self.desc.set_output(out_proto.name, out_argu_names)
+
+        if attrs is not None:
+            for attr in proto.attrs:
+                attr_name = attr.name
+                if (not attr_name in attrs) or (attrs[attr_name] is None):
+                    continue
+                if isinstance(attrs[attr_name], Block):
+                    self.desc.set_block_attr(attr_name, attrs[attr_name].desc)
+                else:
+                    self.desc.set_attr(attr_name, attrs[attr_name])
+
+        self.desc.check_attrs()
+        no_kernel_op_set = {'feed', 'fetch', 'save', 'restore'}
+        if type not in no_kernel_op_set:
+            self.desc.infer_var_type(self.block.desc)
+            self.desc.infer_shape(self.block.desc)
+
+    def __str__(self):
+        protostr = self.desc.serialize_to_string()
+        proto = framework_pb2.OpDesc.FromString(str(protostr))
+        return proto.__str__()
+
+    __repr__ = __str__
+
+    @property
+    def type(self):
+        return self.desc.type()
+
+    def input(self, name):
+        return self.desc.input(name)
+
+    @property
+    def input_names(self):  # forwards to desc.input_names()
+        return self.desc.input_names()
+
+    def output(self, name):  # forwards to desc.output(name) for the output slot called `name`
+        return self.desc.output(name)
+
+    @property
+    def output_names(self):  # forwards to desc.output_names()
+        return self.desc.output_names()
+
+    def has_attr(self, name):  # forwards to desc.has_attr(name)
+        return self.desc.has_attr(name)
+
+    def attr_type(self, name):  # forwards to desc.attr_type(name)
+        return self.desc.attr_type(name)
+
+    @property
+    def attr_names(self):  # forwards to desc.attr_names()
+        return self.desc.attr_names()
+
+    def attr(self, name):  # forwards to desc.attr(name)
+        return self.desc.attr(name)
+
+    def block_attr(self, name):  # forwards to desc.block_attr(name)
+        return self.desc.block_attr(name)
+
+
+class Block(object):  # Python-side mirror of one block desc: tracks its variables and operators
+    def __init__(self, program, idx):  # idx: index of this block inside program.desc
+        self.desc = program.desc.block(idx)
+        self.vars = dict()  # var_name --> var
+        self.ops = collections.deque()  # operator list
+        self.program = program
+
+    def __str__(self):  # string form of the BlockDesc message parsed back from the serialized desc
+        protostr = self.desc.serialize_to_string()
+        proto = framework_pb2.BlockDesc.FromString(str(protostr))
+        return proto.__str__()
+
+    __repr__ = __str__
+
+    @property
+    def parent_idx(self):  # value of desc.parent (used by Program.rollback as a block index)
+        return self.desc.parent
+
+    @property
+    def idx(self):  # value of desc.id
+        return self.desc.id
+
+    def var(self, name):  # look up by name; TypeError for a non-string name, ValueError if absent
+        if not isinstance(name, basestring):
+            raise TypeError()
+        v = self.vars.get(name, None)
+        if v is None:
+            raise ValueError("var %s not in this block" % name)
+        return v
+
+    def all_parameters(self):  # set of every Parameter instance held in self.vars
+        return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+
+    def create_var(self, *args, **kwargs):  # build a Variable for this block; arguments forwarded as-is
+        return Variable(self, *args, **kwargs)
+
+    def has_var(self, name):  # True when `name` is a key of self.vars
+        return name in self.vars
+
+    def create_parameter(self, *args, **kwargs):  # the Parameter is always built on the global block
+        global_block = self.program.global_block()
+        param = Parameter(global_block, *args, **kwargs)
+        if 'init_attr' in kwargs:  # initializer requested: its op is prepended to *this* block
+            self._prepend_initialize_ops_(param, kwargs['init_attr'])
+        return param
+
+    def append_op(self, *args, **kwargs):  # add an op at the tail of both the desc and self.ops
+        op_desc = self.desc.append_op()
+        op = Operator(self, op_desc, *args, **kwargs)
+        self.ops.append(op)
+        return op
+
+    def prepend_op(self, *args, **kwargs):  # add an op at the head of both the desc and self.ops
+        op_desc = self.desc.prepend_op()
+        op = Operator(self, op_desc, *args, **kwargs)
+        self.ops.appendleft(op)
+        return op
+
+    def sync_with_cpp(self):  # wrap vars/ops that exist in the desc but not yet on the Python side
+        # sync variables from cpp
+        for var in self.desc.all_vars():
+            if not self.has_var(var.name()):
+                self.create_var(name=var.name(), desc=var, type=var.type())
+
+        # sync operators from cpp
+        ops_in_cpp = []
+        for op_idx in range(0, self.desc.op_size()):
+            ops_in_cpp.append(self.desc.op(op_idx))
+
+        if len(self.ops) != 0:  # locate the already-wrapped ops inside the desc's op list
+            first_op_in_python = self.ops[0].desc
+            last_op_in_python = self.ops[len(self.ops) - 1].desc
+            start_index = None
+            end_index = None
+            for index in range(len(ops_in_cpp)):
+                if first_op_in_python == ops_in_cpp[index]:
+                    start_index = index
+                if last_op_in_python == ops_in_cpp[index]:
+                    end_index = index
+            assert start_index is not None
+            assert end_index is not None
+            assert start_index <= end_index
+        else:  # nothing wrapped yet: the head range below is empty and the tail range covers every op
+            start_index = 0
+            end_index = -1
+
+        # ops prepended in cpp sit at 0..start_index-1; walk from start_index-1 down to 0 (was off by one)
+        for index in range((start_index - 1), -1, -1):
+            op_desc = ops_in_cpp[index]
+            op = Operator(self, op_desc)
+            self.ops.appendleft(op)
+
+        # sync ops append to the end of cpp_ops
+        for index in range((end_index + 1), len(ops_in_cpp)):
+            op_desc = ops_in_cpp[index]
+            op = Operator(self, op_desc)
+            self.ops.append(op)
+
+        assert len(self.ops) == len(ops_in_cpp)  # both sides must now line up one-to-one
+        for index in range(len(self.ops)):
+            assert self.ops[index].desc == ops_in_cpp[index]
+
+    def _prepend_initialize_ops_(self, param, init_attr):  # prepend the op named by init_attr['type']
+        op_type = init_attr['type']
+        init_attr['shape'] = param.shape  # NOTE: writes into the caller's init_attr dict
+        init_attr['data_type'] = int(param.data_type)
+        op = self.prepend_op(
+            type=op_type,
+            inputs=None,
+            outputs={'Out': [param]},
+            attrs=init_attr)
+        param.op = op  # record the initializing op on the parameter
+
+
+class Program(object):  # Python-side wrapper of a core.ProgramDesc plus its list of Block wrappers
+    def __init__(self):
+        self.desc = core.ProgramDesc()
+        self.blocks = [Block(self, 0)]  # blocks[0] is what global_block() returns
+        self.current_block_idx = 0  # index into self.blocks used by current_block()
+
+    def __str__(self):  # string form of the ProgramDesc message parsed back from the serialized desc
+        protostr = self.desc.serialize_to_string()
+        proto = framework_pb2.ProgramDesc.FromString(str(protostr))
+        return proto.__str__()
+
+    def clone(self):  # new Program whose desc is core.ProgramDesc(self.desc) (presumably a copy; confirm in core)
+        p = Program()
+        p.desc = core.ProgramDesc(self.desc)
+        p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]  # fresh wrappers, one per block
+        p.sync_with_cpp()  # fill the new wrappers' vars/ops from the desc
+        return p
+
+    def __repr__(self):  # same text as str()
+        return str(self)
+
+    def global_block(self):  # the first block
+        return self.blocks[0]
+
+    def block(self, index):  # block wrapper at `index`
+        return self.blocks[index]
+
+    def current_block(self):  # block wrapper selected by current_block_idx
+        return self.blocks[self.current_block_idx]
+
+    def append_backward(self, target, no_grad_set=None):  # target must be a Variable
+        """
+        return map(param_name -> (grad_name, block_index, op_index))
+        """
+        assert isinstance(target, Variable)
+        if no_grad_set is None:  # default to an empty set
+            no_grad_set = set()
+        param_to_grad_info = self.desc.append_backward(target.desc, no_grad_set)
+        self.sync_with_cpp()  # wrap whatever the desc-level call added
+        return param_to_grad_info
+
+    def create_block(self):  # append a new block to the desc, wrap it, and make it current
+        new_block_idx = len(self.blocks)
+        self.desc.append_block(self.current_block().desc)  # presumably registers current block as parent; confirm
+        self.current_block_idx = new_block_idx
+        self.blocks.append(Block(self, self.current_block_idx))
+        return self.current_block()
+
+    def rollback(self):  # make the current block's parent the current block
+        self.current_block_idx = self.current_block().parent_idx
+
+    def sync_with_cpp(self):  # wrap blocks present only in the desc, then sync every block
+        for block_idx in range(len(self.blocks), self.desc.num_blocks()):
+            self.blocks.append(Block(self, block_idx))
+        for block in self.blocks:
+            block.sync_with_cpp()
+
+
+class Parameter(Variable):  # Variable created with persistable=True, plus trainable/optimize_attr settings
+    def __init__(self, block, shape, dtype, **kwargs):  # raises ValueError on missing/empty/negative shape
+        if shape is None or dtype is None:
+            raise ValueError("Parameter must set shape and dtype")
+        if len(shape) == 0:
+            raise ValueError("Parameter shape cannot be empty")
+
+        for each in shape:  # a negative dimension is rejected
+            if each < 0:
+                raise ValueError("Parameter shape should not be related with "
+                                 "batch-size")
+
+        Variable.__init__(
+            self, block, persistable=True, shape=shape, dtype=dtype, **kwargs)
+        self.trainable = kwargs.get('trainable', True)  # True unless the caller passed trainable=...
+
+        self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0})  # default learning_rate 1.0
+
+
+# g_program and g_init_program are module-level Program instances created at import time.
+g_program = Program()
+g_init_program = Program()
diff --git a/python/paddle/v2/framework/layer_helper.py b/python/paddle/v2/framework/layer_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3da32f0e07a22204b3feaed5d1d8d01556e4655
--- /dev/null
+++ b/python/paddle/v2/framework/layer_helper.py
@@ -0,0 +1,172 @@
+import copy
+import itertools
+
+import paddle.v2.framework.core as core
+
+from paddle.v2.framework.framework import Variable, g_program, \
+    g_init_program
+
+
+def unique_name(prefix):  # returns "<prefix>_<uid>" with uid taken from core.unique_integer()
+    uid = core.unique_integer()  # unique during whole process.
+    return "_".join([prefix, str(uid)])
+
+
+class LayerHelper(object):  # per-layer helper: reads the layer's kwargs and emits vars/params/ops
+    def __init__(self, layer_type, **kwargs):  # kwargs are the layer function's arguments
+        self.kwargs = kwargs
+        self.layer_type = layer_type
+        name = self.kwargs.get('name', None)
+        if name is None:  # no explicit name: generate one from the layer type
+            self.kwargs['name'] = unique_name(self.layer_type)
+
+    @property
+    def name(self):  # layer name (explicit, or generated in __init__)
+        return self.kwargs['name']
+
+    @property
+    def program(self):  # kwargs['program'], falling back to the module-level g_program
+        prog = self.kwargs.get('program', None)
+        if prog is None:
+            return g_program
+        else:
+            return prog
+
+    @property
+    def init_program(self):  # kwargs['init_program'], falling back to g_init_program
+        prog = self.kwargs.get('init_program', None)
+        if prog is None:
+            return g_init_program
+        else:
+            return prog
+
+    def append_op(self, *args, **kwargs):  # append an op to the program's current block
+        return self.program.current_block().append_op(*args, **kwargs)
+
+    def multiple_input(self, input_param_name='input'):  # inputs as a sequence; TypeError if not Variables
+        inputs = self.kwargs.get(input_param_name, [])
+        type_error = TypeError(
+            "Input of {0} layer should be Variable or sequence of Variable".
+            format(self.layer_type))
+        if isinstance(inputs, Variable):  # a single Variable is wrapped into a one-element list
+            inputs = [inputs]
+        elif not isinstance(inputs, list) and not isinstance(inputs, tuple):
+            raise type_error
+        else:
+            for each in inputs:
+                if not isinstance(each, Variable):
+                    raise type_error
+        return inputs
+
+    def input(self, input_param_name='input'):  # the single input; ValueError unless exactly one is given
+        inputs = self.multiple_input(input_param_name)
+        if len(inputs) != 1:
+            raise ValueError("{0} layer only takes one input".format(self.layer_type))  # was a bare str
+        return inputs[0]
+
+    @property
+    def param_attr(self):  # kwargs['param_attr'], or a fresh default dict (uniform_random in [-1, 1])
+        default = {
+            'name': None,
+            'init_attr': {
+                'type': 'uniform_random',
+                'min': -1.0,
+                'max': 1.0
+            }
+        }
+        actual = self.kwargs.get('param_attr', None)
+        return actual if actual is not None else default
+
+    def bias_attr(self):  # NOTE: a plain method, unlike the param_attr property
+        bias_attr = self.kwargs.get('bias_attr', None)
+        if bias_attr is True:  # literal True selects the default: fill_constant with value 0.0
+            bias_attr = {
+                'name': None,
+                'init_attr': {
+                    'type': 'fill_constant',
+                    'value': 0.0
+                }
+            }
+        return bias_attr
+
+    def multiple_param_attr(self, length):  # list of `length` attrs; ValueError on a count mismatch
+        param_attr = self.param_attr
+        if isinstance(param_attr, dict):
+            param_attr = [param_attr]
+
+        if len(param_attr) != 1 and len(param_attr) != length:
+            raise ValueError("parameter number mismatch")
+        elif len(param_attr) == 1 and length != 1:  # one attr for several inputs: deep-copy it per input
+            tmp = [None] * length
+            for i in xrange(length):
+                tmp[i] = copy.deepcopy(param_attr[0])
+            param_attr = tmp
+        return param_attr
+
+    def iter_inputs_and_params(self, input_param_name='input'):  # yields (input, param_attr) pairs
+        inputs = self.multiple_input(input_param_name)
+        param_attrs = self.multiple_param_attr(len(inputs))
+        for ipt, param_attr in itertools.izip(inputs, param_attrs):
+            yield ipt, param_attr
+
+    def input_dtype(self, input_param_name='input'):  # shared data_type of all inputs; None when no inputs
+        inputs = self.multiple_input(input_param_name)
+        dtype = None
+        for each in inputs:
+            if dtype is None:
+                dtype = each.data_type
+            elif dtype != each.data_type:
+                raise ValueError("Data Type mismatch")
+        return dtype
+
+    def create_parameter(self, attr, shape, dtype, suffix='w'):  # returns the main program's parameter
+        if attr['name'] is None:  # NOTE: the generated name is written back into `attr`
+            attr['name'] = unique_name(".".join([self.name, suffix]))
+        self.init_program.global_block().create_parameter(  # init program gets the full attr (incl. init_attr)
+            dtype=dtype, shape=shape, **attr)
+        return self.program.global_block().create_parameter(  # main program gets name/dtype/shape only
+            name=attr['name'], dtype=dtype, shape=shape)
+
+    def create_tmp_variable(self, dtype):  # non-persistable var named "<layer name>.tmp_<uid>" in current block
+        return self.program.current_block().create_var(
+            name=unique_name(".".join([self.name, 'tmp'])),
+            dtype=dtype,
+            persistable=False)
+
+    def create_variable(self, *args, **kwargs):  # create a var in the current block; arguments forwarded
+        return self.program.current_block().create_var(*args, **kwargs)
+
+    def create_global_variable(self, *args, **kwargs):  # non-persistable var in the global block
+        return self.program.global_block().create_var(
+            *args, persistable=False, **kwargs)
+
+    def append_bias_op(self, input_var):  # returns input_var unchanged when bias_attr is falsy
+        size = list(input_var.shape[1:])  # bias shape: the input shape without its first dimension
+        bias_attr = self.bias_attr()
+        if not bias_attr:
+            return input_var
+
+        b = self.create_parameter(
+            attr=bias_attr, shape=size, dtype=input_var.data_type, suffix='b')
+        tmp = self.create_tmp_variable(dtype=input_var.data_type)
+        self.append_op(
+            type='elementwise_add',
+            inputs={'X': [input_var],
+                    'Y': [b]},
+            outputs={'Out': [tmp]})
+        return tmp
+
+    def append_activation(self, input_var):  # returns input_var unchanged when no 'act' was given
+        act = self.kwargs.get('act', None)
+        if act is None:
+            return input_var
+        if isinstance(act, basestring):  # a bare string is shorthand for {'type': <string>}
+            act = {'type': act}
+        act = dict(act)  # copy first: pop() below must not strip 'type' from the caller's dict
+        tmp = self.create_tmp_variable(dtype=input_var.data_type)
+        act_type = act.pop('type')  # the remaining keys are passed on as the op's attrs
+        self.append_op(
+            type=act_type,
+            inputs={"X": [input_var]}, outputs={"Y": [tmp]},
+            attrs=act)
+        return tmp
diff --git a/python/paddle/v2/framework/layers.py b/python/paddle/v2/framework/layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..6894c40c3a6514f448133f029c4de8cc30405515
--- /dev/null
+++ b/python/paddle/v2/framework/layers.py
@@ -0,0 +1,443 @@
+from paddle.v2.framework.layer_helper import LayerHelper, unique_name
+import paddle.v2.framework.core as core
+from paddle.v2.framework.framework import OpProtoHolder, Variable, Program
+import re
+
+__all__ = [  # public names; _create_op_func_ appends each generated op name to this list
+    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
+    'StaticRNN'
+]
+
+
+def fc(input,  # one "mul" op per input, a "sum" op when there are several, then bias and activation
+       size,
+       param_attr=None,
+       bias_attr=True,
+       name=None,
+       act=None,
+       num_flatten_dims=1,
+       program=None,
+       init_program=None):
+    # create helper
+    helper = LayerHelper('fc', **locals())  # every argument above becomes a helper kwarg
+
+    dtype = helper.input_dtype()
+
+    # mul
+    mul_results = []
+    for input_var, param_attr in helper.iter_inputs_and_params():  # NOTE: rebinds the param_attr argument
+        input_shape = input_var.shape
+        param_shape = [  # [product of input dims from num_flatten_dims onward, size]
+            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
+        ] + [size]
+
+        w = helper.create_parameter(
+            attr=param_attr, shape=param_shape, dtype=dtype)
+        tmp = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="mul",
+            inputs={
+                "X": input_var,
+                "Y": w,
+            },
+            outputs={"Out": tmp},
+            attrs={'x_num_col_dims': num_flatten_dims,
+                   'y_num_col_dims': 1})
+        mul_results.append(tmp)
+
+    # sum
+    if len(mul_results) == 1:  # a single product needs no sum op
+        pre_bias = mul_results[0]
+    else:
+        pre_bias = helper.create_tmp_variable(dtype)
+        helper.append_op(
+            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
+    # add bias
+    pre_activation = helper.append_bias_op(pre_bias)
+    # add activation
+    return helper.append_activation(pre_activation)
+
+
+def embedding(input,  # appends a "lookup_table" op and returns its output variable
+              size,
+              data_type='float32',
+              param_attr=None,
+              program=None,
+              init_program=None):
+    helper = LayerHelper('embedding', **locals())
+    w = helper.create_parameter(  # `size` is used directly as the parameter's shape
+        attr=helper.param_attr, shape=size, dtype=data_type)
+    tmp = helper.create_tmp_variable(data_type)
+    helper.append_op(
+        type='lookup_table',
+        inputs={'Ids': input,
+                'W': w},
+        outputs={'Out': tmp})
+    return tmp
+
+
+def data(name,  # create a non-persistable variable called `name` in the program's global block
+         shape,
+         data_type='float32',
+         type=core.VarDesc.VarType.LOD_TENSOR,
+         append_batch_size=True,
+         program=None,
+         init_program=None):
+    helper = LayerHelper('data', **locals())
+    if append_batch_size:
+        shape = [-1] + list(shape)  # prepend batch size as -1; list() lets tuple shapes work too
+    return helper.create_global_variable(
+        name=name, shape=shape, dtype=data_type, type=type)
+
+
+def _convert_(name):  # insert "_" at lower/upper-case boundaries, then lowercase the result
+    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)  # "_" before a capital followed by lowercase letters
+    return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).lower()  # "_" between lowercase/digit and a capital
+
+
+def _create_op_func_(op_type):  # generate a layer function for op_type and publish it in this module
+    op_proto = OpProtoHolder.instance().get_op_proto(op_type)
+    if len(op_proto.outputs) != 1:  # only ops with exactly one output are supported
+        raise ValueError(
+            "Only one output operator can be automatically generated")
+
+    if op_proto.outputs[0].duplicable:  # and that output must not be duplicable
+        raise ValueError(
+            "Only not duplicable op can be automatically generated")
+
+    o_name = op_proto.outputs[0].name
+
+    def func(**kwargs):  # keyword-only; inputs are looked up under _convert_(<proto input name>)
+        helper = LayerHelper(op_type, **kwargs)
+        inputs = dict()
+        dtype = None
+        for ipt in op_proto.inputs:
+            name = _convert_(ipt.name)
+            val = kwargs.pop(name, [])  # a missing input becomes an empty list
+            if not isinstance(val, list) and not isinstance(val, tuple):
+                val = [val]
+            for each in val:
+                if not isinstance(each, Variable):
+                    raise ValueError("input of {0} must be variable".format(
+                        op_type))
+
+                if dtype is None:  # the first input seen fixes the dtype every other input must match
+                    dtype = each.data_type
+                elif dtype != each.data_type:
+                    raise ValueError(
+                        "operator {0} must input same dtype".format(op_type))
+            inputs[ipt.name] = val
+
+        out = helper.create_tmp_variable(dtype=dtype)
+        helper.append_op(  # kwargs left after popping the inputs are passed as attrs
+            type=op_type, inputs=inputs, outputs={o_name: [out]}, attrs=kwargs)
+        return out
+
+    func.__name__ = op_type
+    globals()[op_type] = func  # expose the function as a module attribute named op_type
+    global __all__
+    __all__.append(op_type)
+
+
+_create_op_func_('mean')  # defines module-level mean(**kwargs) and adds 'mean' to __all__
+_create_op_func_('mul')  # defines module-level mul(**kwargs) and adds 'mul' to __all__
+
+
+def concat(input, axis, program=None, init_program=None):  # appends a "concat" op; returns its output
+    helper = LayerHelper('concat', **locals())
+    if not isinstance(input, list) and not isinstance(input, tuple):  # wrap a single input in a list
+        input = [input]
+    out = helper.create_tmp_variable(dtype=input[0].data_type)  # output dtype follows the first input
+    helper.append_op(
+        type='concat',
+        inputs={'X': input},
+        outputs={'Out': [out]},
+        attrs={'axis': axis})
+    return out
+
+
+def cross_entropy(input, label, **kwargs):  # appends a "cross_entropy" op; returns its output variable
+    helper = LayerHelper('cross_entropy', **kwargs)
+    out = helper.create_tmp_variable(dtype=input.data_type)
+    helper.append_op(
+        type='cross_entropy',
+        inputs={'X': [input],
+                'Label': [label]},
+        outputs={'Y': [out]},
+        attrs=kwargs)  # the same kwargs given to the helper are also passed as op attrs
+    return out
+
+
+def square_error_cost(input, label, **kwargs):  # "elementwise_sub" then "pow" with factor 2.0
+    helper = LayerHelper('square_error_cost', **kwargs)
+    minus_out = helper.create_tmp_variable(dtype=input.data_type)
+    helper.append_op(  # minus_out = input - label
+        type='elementwise_sub',
+        inputs={'X': [input],
+                'Y': [label]},
+        outputs={'Out': [minus_out]})
+
+    square_out = helper.create_tmp_variable(dtype=input.data_type)
+    helper.append_op(  # square_out = pow(minus_out, factor=2.0)
+        type='pow',
+        inputs={'X': [minus_out]},
+        outputs={'Y': [square_out]},
+        attrs={'factor': 2.0})
+    return square_out
+
+
+def conv2d(input,  # appends a "conv2d" op, then the helper's bias and activation steps
+           num_filters,
+           name=None,
+           filter_size=[1, 1],
+           act=None,
+           groups=None,
+           stride=[1, 1],
+           padding=None,
+           bias_attr=None,
+           param_attr=None,
+           program=None,
+           init_program=None):
+    helper = LayerHelper('conv2d', **locals())  # every argument above becomes a helper kwarg
+    dtype = helper.input_dtype()
+
+    num_channels = input.shape[1]  # channel count is read from dimension 1 of the input shape
+    if groups is None:
+        num_filter_channels = num_channels
+    else:
+        if num_channels % groups != 0:  # value comparison; the former `is not 0` tested identity
+            raise ValueError("num_channels must be divisible by groups.")
+        num_filter_channels = num_channels / groups
+
+    if isinstance(filter_size, int):  # a scalar int is expanded to a two-element list
+        filter_size = [filter_size, filter_size]
+    if isinstance(stride, int):
+        stride = [stride, stride]
+    if isinstance(padding, int):
+        padding = [padding, padding]
+
+    input_shape = input.shape
+    filter_shape = [num_filters, num_filter_channels] + filter_size
+    filter = helper.create_parameter(
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
+    pre_bias = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type='conv2d',
+        inputs={
+            'Input': input,
+            'Filter': filter,
+        },
+        outputs={"Output": pre_bias},
+        attrs={'strides': stride,
+               'paddings': padding,
+               'groups': groups})
+
+    pre_act = helper.append_bias_op(pre_bias)
+
+    return helper.append_activation(pre_act)
+
+
+def pool2d(input,  # appends a "pool2d" op and returns its output; ValueError for an unknown pool_type
+           pool_size,
+           pool_type,
+           pool_stride=[1, 1],
+           pool_padding=[0, 0],
+           global_pooling=False,
+           program=None,
+           init_program=None):
+    if pool_type not in ["max", "avg"]:
+        raise ValueError(
+            "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." %
+            str(pool_type))  # "%" (not ",") so the offending value is formatted into the message
+    if isinstance(pool_size, int):  # a scalar int is expanded to a two-element list
+        pool_size = [pool_size, pool_size]
+    if isinstance(pool_stride, int):
+        pool_stride = [pool_stride, pool_stride]
+    if isinstance(pool_padding, int):
+        pool_padding = [pool_padding, pool_padding]
+
+    helper = LayerHelper('pool2d', **locals())  # was 'conv2d', which mis-prefixed generated names
+    dtype = helper.input_dtype()
+    pool_out = helper.create_tmp_variable(dtype)
+
+    helper.append_op(
+        type="pool2d",
+        inputs={"X": input},
+        outputs={"Out": pool_out},
+        attrs={
+            "pooling_type": pool_type,
+            "ksize": pool_size,
+            "global_pooling": global_pooling,
+            "strides": pool_stride,
+            "paddings": pool_padding
+        })
+
+    return pool_out
+
+
+class BlockGuard(object):
+    """
+    Context manager for building a sub-block with Python's `with` keyword:
+    entering calls program.create_block(), exiting calls program.rollback().
+    """
+
+    def __init__(self, program):  # TypeError unless `program` is a Program
+        if not isinstance(program, Program):
+            raise TypeError("BlockGuard takes a program")
+        self.program = program
+
+    def __enter__(self):  # returns None, so `with ... as x` binds None
+        self.program.create_block()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):  # rollback runs whether or not the body raised
+        self.program.rollback()
+        if exc_type is not None:
+            return False  # re-raise exception
+        return True
+
+
+class StaticRNNGuard(BlockGuard):  # BlockGuard that also drives a StaticRNN's status flag
+    def __init__(self, rnn):  # TypeError unless `rnn` is a StaticRNN; guards rnn.helper.program
+        if not isinstance(rnn, StaticRNN):
+            raise TypeError("StaticRNNGuard takes an StaticRNN")
+        super(StaticRNNGuard, self).__init__(rnn.helper.program)
+        self.rnn = rnn
+
+    def __enter__(self):  # mark the rnn as inside its block, then create the sub-block
+        self.rnn.status = StaticRNN.IN_RNN_BLOCK
+        return super(StaticRNNGuard, self).__enter__()
+
+    def __exit__(self, exc_type, exc_val, exc_tb):  # these steps run even if the with-body raised
+        self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
+        self.rnn.complete_rnn_op()
+        return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb)
+
+
+class StaticRNNMemoryLink(object):  # plain record tying together the three variables of one memory
+    """
+    :param init: the initial variable for Memory
+    :type init: Variable
+    :param pre_mem: the memory variable in previous time step
+    :type pre_mem: Variable
+    :param mem: the memory variable in current time step
+    :type mem: Variable
+    """
+
+    def __init__(self, init, pre_mem, mem=None):  # mem may be set later (see StaticRNN.update_memory)
+        self.init = init
+        self.pre_mem = pre_mem
+        self.mem = mem
+
+
+class StaticRNN(object):  # builder for a fixed-length RNN; the step body is written inside `with rnn.step():`
+    BEFORE_RNN_BLOCK = 0  # status values: before / inside / after the step block
+    IN_RNN_BLOCK = 1
+    AFTER_RNN_BLOCK = 2
+
+    def __init__(self, name=None, program=None):
+        self.helper = LayerHelper("static_rnn", name=name, program=program)
+        self.memories = {}  # memory map, from pre_mem.name --> MemoryLink
+        self.inputs = []  # input variable list in current block
+        self.outputs = []  # output variable list in parent block
+        self.status = StaticRNN.BEFORE_RNN_BLOCK  # status flag.
+        # sequence length, since it is a static RNN, sequence length are fixed.
+        self.seq_len = None
+
+    def step(self):  # context manager that opens the RNN step block
+        return StaticRNNGuard(self)
+
+    def _assert_in_rnn_block_(self, method):  # ValueError unless called inside the step block
+        if self.status != StaticRNN.IN_RNN_BLOCK:
+            raise ValueError("You must invoke {0} in rnn block".format(method))
+
+    def memory(self, init=None, shape=None, dtype=None, init_value=0):  # returns the pre_mem variable
+        self._assert_in_rnn_block_('memory')
+        if init is None:  # no init var given: create one in the parent block, filled with init_value
+            if shape is None or dtype is None:
+                raise ValueError(
+                    "if init is None, memory at least need shape and dtype")
+            parent_block = self.parent_block()
+            var_name = unique_name("@".join([self.helper.name, "memory_boot"]))
+            boot_var = parent_block.create_var(
+                name=var_name, shape=shape, dtype=dtype, persistable=False)
+
+            parent_block.append_op(
+                type="fill_constant",
+                inputs={},
+                outputs={'Out': [boot_var]},
+                attrs={
+                    'value': init_value,
+                    'shape': boot_var.shape,
+                    'data_type': boot_var.data_type
+                })
+
+            return self.memory(init=boot_var)  # recurse once, now with an explicit init variable
+        else:
+            pre_mem = self.helper.create_variable(  # same dtype and shape as init, in the current block
+                name=unique_name("@".join([self.helper.name, "mem"])),
+                dtype=init.data_type,
+                shape=init.shape)
+            self.memories[pre_mem.name] = StaticRNNMemoryLink(
+                init=init, pre_mem=pre_mem)
+            return pre_mem
+
+    def step_input(self, x):  # register x as a sequence input; returns the per-step variable
+        self._assert_in_rnn_block_('step_input')
+        if not isinstance(x, Variable):
+            raise TypeError("step input takes a Variable")
+        if self.seq_len is None:  # first input fixes seq_len, read from dimension 1 of x.shape
+            self.seq_len = x.shape[1]
+        elif self.seq_len != x.shape[1]:
+            raise ValueError("Static RNN only take fix seq_len input")
+
+        ipt = self.helper.create_variable(  # same name as x; shape drops the first two dims, prefixed by -1
+            name=x.name,
+            dtype=x.data_type,
+            shape=[-1] + list(x.shape[2:]),
+            type=x.type)
+        self.inputs.append(ipt)
+        return ipt
+
+    def step_output(self, o):  # register o as a step output; returns None
+        self._assert_in_rnn_block_('step_output')
+        if not isinstance(o, Variable):
+            raise TypeError("step output takes a Variable")
+
+        out_var = self.parent_block().create_var(  # same name as o, with seq_len inserted after the first dim
+            name=o.name,
+            shape=[-1, self.seq_len] + list(o.shape[1:]),
+            dtype=o.data_type)
+
+        self.outputs.append(out_var)
+
+    def output(self, *outputs):  # step_output() for each argument
+        for each in outputs:
+            self.step_output(each)
+
+    def update_memory(self, mem, var):  # record var as the current-step value of memory `mem`
+        if not isinstance(mem, Variable) or not isinstance(var, Variable):
+            raise TypeError("update memory should take variables")
+        self.memories[mem.name].mem = var  # KeyError if mem was not returned by memory()
+
+    def parent_block(self):  # parent of the program's current block
+        prog = self.helper.program
+        parent_idx = prog.current_block().parent_idx
+        assert parent_idx >= 0
+        parent_block = prog.block(parent_idx)
+        return parent_block
+
+    def __call__(self, *args, **kwargs):  # a single output is returned bare, several as a list
+        if self.status != StaticRNN.AFTER_RNN_BLOCK:
+            raise ValueError("RNN output can only be retrieved after rnn block")
+        if len(self.outputs) == 0:
+            raise ValueError("RNN has no output")
+        elif len(self.outputs) == 1:
+            return self.outputs[0]
+        else:
+            return self.outputs
+
+    def complete_rnn_op(self):  # currently a no-op placeholder, called from StaticRNNGuard.__exit__
+        # TODO(yuyang18): Create RNN Op here.
+        # Implement this method after RNN op complete.
+        pass
diff --git a/python/paddle/v2/framework/nets.py b/python/paddle/v2/framework/nets.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a83ebfb9639f6fae6344b68509a80580881dab0
--- /dev/null
+++ b/python/paddle/v2/framework/nets.py
@@ -0,0 +1,27 @@
+import paddle.v2.framework.layers as layers
+
+
+def simple_img_conv_pool(input,  # layers.conv2d followed by a 'max' layers.pool2d
+                         filter_size,
+                         num_filters,
+                         pool_size,
+                         pool_stride,
+                         act,
+                         program=None,
+                         init_program=None):
+    conv_out = layers.conv2d(  # arguments not listed here keep conv2d's defaults
+        input=input,
+        num_filters=num_filters,
+        filter_size=filter_size,
+        act=act,
+        program=program,
+        init_program=init_program)
+
+    pool_out = layers.pool2d(  # the pooling type is fixed to 'max'
+        input=conv_out,
+        pool_size=pool_size,
+        pool_type='max',
+        pool_stride=pool_stride,
+        program=program,
+        init_program=init_program)
+    return pool_out
diff --git a/python/paddle/v2/framework/op.py b/python/paddle/v2/framework/op.py
index 9086a5cc3452b178ec37fe6a3e358eaa4c5d606b..bc771a964adf9f97cbeae87c06ce954c76051150 100644
--- a/python/paddle/v2/framework/op.py
+++ b/python/paddle/v2/framework/op.py
@@ -219,6 +219,27 @@ class __RecurrentOp__(object):
         return core.RecurrentOp.create(proto.SerializeToString())
 
 
+class __DynamicRecurrentOp__(object):  # callable factory for core.DynamicRecurrentOp instances
+    __proto__ = None
+    type = "dynamic_recurrent"
+
+    def __init__(self):
+        # look up the op proto whose type matches self.type and keep it on this instance
+        if self.__proto__ is None:
+            for op_proto in get_all_op_protos():
+                if op_proto.type == self.type:
+                    self.__proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):  # builds the op from an OpDesc created out of the arguments
+        if self.type not in args and "type" not in kwargs:  # supply the op type when the caller did not
+            kwargs["type"] = self.type
+        # create proto
+        create_method = OpDescCreationMethod(self.__proto__)
+        proto = create_method(*args, **kwargs)
+        # create the dynamic recurrent op from the serialized proto
+        return core.DynamicRecurrentOp.create(proto.SerializeToString())
+
+
 class __CondOp__(object):
     __proto__ = None
     type = "cond"
@@ -242,4 +263,5 @@ class __CondOp__(object):
 
 Operator = OperatorFactory()  # The default global factory
 RecurrentOp = __RecurrentOp__()
+DynamicRecurrentOp = __DynamicRecurrentOp__()  # module-level factory instance
 CondOp = __CondOp__()
diff --git a/python/paddle/v2/framework/optimizer.py b/python/paddle/v2/framework/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a86908c64897eb4e01f3c99a66b4da27a5f3394b
--- /dev/null
+++ b/python/paddle/v2/framework/optimizer.py
@@ -0,0 +1,442 @@
+from collections import defaultdict
+
+import paddle.v2.framework.framework as framework
+from paddle.v2.framework.backward import append_backward_ops
+
+__all__ = [
+    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer'
+]
+
+
+class Optimizer(object):
+    """Optimizer Base class.
+
+    Define the common interface of an optimizer.
+    User should not use this class directly,
+    but should use one of its implementations.
+    """
+
+    def __init__(self):
+        # Dictionary of accumulators. Some optimizer subclasses need to
+        # allocate and manage extra variables associated with the parameters
+        # to train. These variables are called accumulators.
+        # {accum_name : { parameter_name : accumulator_for_parameter, ...}, ...}
+        self._accumulators = defaultdict(lambda: dict())
+
+    def _append_optimize_op(self, block, param_and_grad):
+        """Append the optimize operator to block and return the added op.
+        """
+        raise NotImplementedError()
+
+    def _initialize_tensors(self, block):
+        """Create all necessary tensors, that will be shared for all parameter updates.
+
+        Tensors like learning rate should be initialized here.
+
+        Args:
+            block: the block in which the loss variable is present
+        """
+        pass
+
+    def _create_accumulators(self, block, parameters):
+        """Create all accumulators needed by the parameters
+
+        Args:
+            block: the block in which the loss variable is present
+            parameters: list of parameter variables for the optimizer
+        """
+        pass
+
+    def _finish_update(self, block):
+        """Finish any custom updates needed
+           before completing an optimization step
+
+        Args:
+            block: the block in which the loss variable is present
+
+        Returns:
+            list of finish ops, or None when there are none
+            (the base implementation returns None)
+        """
+        pass
+
+    def _add_accumulator(self, block, name, param, dtype=None, fill_value=0.0):
+        """Utility function to add an accumulator for a parameter
+
+        Args:
+            block: the block in which the loss variable is present
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be added
+            dtype: data type of the accumulator variable
+            fill_value: value to initialize the accumulator variable
+        """
+        if (name in self._accumulators and
+                param.name in self._accumulators[name]):
+            raise Exception("Accumulator {} already exists for parmeter {}".
+                            format(name, param.name))
+        global_block = block.program.global_block()
+        param_shape = list(param.shape)  # accumulator has the parameter's shape
+        param_acc = global_block.create_var(
+            dtype=dtype, shape=param_shape, lod_level=0)
+
+        # Initialize the accumulator with fill_value
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        global_block.append_op(
+            type="fill_constant",
+            outputs={"Out": param_acc},
+            attrs={"shape": param_shape,
+                   "value": fill_value})
+
+        # Add to accumulators dict
+        self._accumulators[name][param.name] = param_acc
+
+    def _get_accumulator(self, name, param):
+        """Utility function to fetch an accumulator for a parameter
+
+        Args:
+            name: name of the accumulator
+            param: parameter variable for which accumulator is to be fetched
+
+        Returns:
+            accumulator variable for the parameter
+        """
+        if (name not in self._accumulators or
+                param.name not in self._accumulators[name]):
+            raise Exception("Accumulator {} does not exist for parameter {}".
+                            format(name, param.name))
+        return self._accumulators[name][param.name]
+
+    def create_optimization_pass(self, parameters_and_grads, loss):
+        """Add optimization operators that update variables from their gradients.
+
+        Args:
+          loss: the target that this optimization is for.
+          parameters_and_grads: a list of (variable, gradient) pair to update.
+
+        Returns:
+          return_op_list: a list of operators that will complete one step of
+          optimization. This includes the parameter update ops followed by
+          any custom finish ops the subclass returns from _finish_update.
+          Pairs whose gradient is None get no update op.
+        """
+        # This is a default implementation of create_optimization_pass that
+        # can be shared by most optimizers. This implementation assumes that
+        # the subclass will implement the _append_optimize_op method and the
+        # _initialize_tensors method. The subclass can extend the
+        # _create_accumulators method if it needs to create accumulators
+        # for parameters and extend _finish_update method to add custom ops.
+
+        # Create any accumulators
+        self._create_accumulators(loss.block,
+                                  [p[0] for p in parameters_and_grads])
+        # Create any necessary tensors
+        self._initialize_tensors(loss.block)
+
+        optimize_ops = []
+        for param_and_grad in parameters_and_grads:
+            if param_and_grad[1] is not None:
+                optimize_op = self._append_optimize_op(loss.block,
+                                                       param_and_grad)
+                optimize_ops.append(optimize_op)
+
+        # Returned list of ops can include more ops in addition
+        # to optimization ops
+        return_ops = optimize_ops
+
+        # Get custom finish ops for subclasses
+        # FIXME: Need to fix this once we figure out how to handle dependencies
+        finish_ops = self._finish_update(loss.block)
+        if finish_ops is not None:
+            return_ops += finish_ops
+
+        return return_ops
+
+    def minimize(self, loss, parameter_list=None, no_grad_set=None):
+        """Add operations to minimize `loss` by updating `parameter_list`.
+
+        This method combines interface `append_backward_ops()` and
+        `create_optimization_pass()` into one.
+        """
+        params_grads = append_backward_ops(loss, parameter_list, no_grad_set or
+                                           set())
+        optimize_ops = self.create_optimization_pass(params_grads, loss)
+        return optimize_ops
+
+
+class SGDOptimizer(Optimizer):
+    """Simple SGD optimizer without any state (it creates no accumulators).
+    """
+
+    def __init__(self, learning_rate):
+        assert learning_rate is not None
+        super(SGDOptimizer, self).__init__()
+        self.type = "sgd"  # op type appended by _append_optimize_op
+        self._learning_rate = learning_rate
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a float32 variable of shape [1] that holds the learning rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # append a fill_constant op that initializes it with self._learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        # create the sgd op; the parameter is both its input and its output
+        sgd_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._lr
+            },
+            outputs={"ParamOut": param_and_grad[0]})
+
+        return sgd_op
+
+
+class MomentumOptimizer(Optimizer):
+    """Simple Momentum optimizer with one velocity accumulator per parameter
+    """
+    _velocity_acc_str = "velocity"  # accumulator name used for the velocity
+
+    def __init__(self, learning_rate, momentum):
+        assert learning_rate is not None
+        assert momentum is not None
+        super(MomentumOptimizer, self).__init__()
+        self.type = "momentum"
+        self._learning_rate = learning_rate
+        self._momentum = momentum  # passed to the op as attribute "mu"
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a float32 variable of shape [1] that holds the learning rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # append a fill_constant op that initializes it with self._learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(block, self._velocity_acc_str, p, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the momentum op; parameter and velocity are inputs and outputs
+        momentum_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._lr
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={"mu": self._momentum})
+
+        return momentum_op
+
+
+class AdagradOptimizer(Optimizer):
+    """Simple Adagrad optimizer with one moment accumulator per parameter
+    """
+    _moment_acc_str = "moment"  # accumulator name used for the moment
+
+    def __init__(self, learning_rate, epsilon=1.0e-6):
+        assert learning_rate is not None
+        assert epsilon is not None
+        super(AdagradOptimizer, self).__init__()
+        self.type = "adagrad"
+        self._learning_rate = learning_rate
+        self._epsilon = epsilon  # passed to the op as attribute "epsilon"
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a float32 variable of shape [1] that holds the learning rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # append a fill_constant op that initializes it with self._learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        for p in parameters:
+            self._add_accumulator(block, self._moment_acc_str, p, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment_acc = self._get_accumulator(self._moment_acc_str,
+                                           param_and_grad[0])
+
+        # create the adagrad op; parameter and moment are inputs and outputs
+        adagrad_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Moment": moment_acc,
+                "LearningRate": self._lr
+            },
+            outputs={"ParamOut": param_and_grad[0],
+                     "MomentOut": moment_acc},
+            attrs={"epsilon": self._epsilon})
+
+        return adagrad_op
+
+
+class AdamOptimizer(Optimizer):
+    """Implements the Adam Optimizer
+    """
+    _moment1_acc_str = "moment1"  # accumulator name for the first moment
+    _moment2_acc_str = "moment2"  # accumulator name for the second moment
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 beta1=0.9,
+                 beta2=0.999,
+                 epsilon=1e-8):
+        assert learning_rate is not None
+        assert beta1 is not None
+        assert beta2 is not None
+        assert epsilon is not None
+        super(AdamOptimizer, self).__init__()
+        self.type = "adam"
+        self._learning_rate = learning_rate
+        self._beta1 = beta1
+        self._beta2 = beta2
+        self._epsilon = epsilon
+
+    def _initialize_tensors(self, block):
+        assert isinstance(block, framework.Block)
+        lr_shape = [1]
+        # create a float32 variable of shape [1] that holds the learning rate
+        self._lr = block.create_var(
+            dtype="float32", shape=lr_shape, lod_level=0)
+
+        # append a fill_constant op that initializes it with self._learning_rate
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._lr},
+            attrs={"shape": lr_shape,
+                   "value": self._learning_rate})
+
+    def _create_accumulators(self, block, parameters):
+        assert isinstance(block, framework.Block)
+
+        global_block = block.program.global_block()
+        # The beta1/beta2 power tensors are shared by every parameter's adam op
+        beta_shape = [1]
+        # Create variables for beta1 and beta2 powers
+        self._beta1_pow_acc = global_block.create_var(
+            dtype="float32", shape=beta_shape, lod_level=0)
+        self._beta2_pow_acc = global_block.create_var(
+            dtype="float32", shape=beta_shape, lod_level=0)
+
+        # Initialize the power accumulators with beta1 and beta2 themselves
+        # FIXME: Fix when Initialization design has been implemented
+        # https://github.com/PaddlePaddle/Paddle/pull/4852
+        global_block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"shape": beta_shape,
+                   "value": self._beta1})
+        global_block.append_op(
+            type="fill_constant",
+            outputs={"Out": self._beta2_pow_acc},
+            attrs={"shape": beta_shape,
+                   "value": self._beta2})
+
+        # Create accumulator tensors for first and second moments
+        for p in parameters:
+            self._add_accumulator(block, self._moment1_acc_str, p, 'float32')
+            self._add_accumulator(block, self._moment2_acc_str, p, 'float32')
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        moment1 = self._get_accumulator(self._moment1_acc_str,
+                                        param_and_grad[0])
+        moment2 = self._get_accumulator(self._moment2_acc_str,
+                                        param_and_grad[0])
+        # create the adam optimize op
+        adam_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._lr,
+                "Moment1": moment1,
+                "Moment2": moment2,
+                "Beta1Pow": self._beta1_pow_acc,
+                "Beta2Pow": self._beta2_pow_acc
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "Moment1Out": moment1,
+                "Moment2Out": moment2
+            },
+            attrs={
+                "beta1": self._beta1,
+                "beta2": self._beta2,
+                "epsilon": self._epsilon
+            })
+
+        return adam_op
+
+    def _finish_update(self, block):
+        """Append ops scaling the Beta1/Beta2 power accumulators; return both
+        """
+        assert isinstance(block, framework.Block)
+        global_block = block.program.global_block()
+        scale_beta1 = global_block.append_op(
+            type="scale",
+            inputs={"X": self._beta1_pow_acc},
+            outputs={"Out": self._beta1_pow_acc},
+            attrs={"scale": self._beta1})
+
+        scale_beta2 = global_block.append_op(
+            type="scale",
+            inputs={"X": self._beta2_pow_acc},
+            outputs={"Out": self._beta2_pow_acc},
+            attrs={"scale": self._beta2})
+
+        return [scale_beta1, scale_beta2]
diff --git a/python/paddle/v2/framework/tests/.gitignore b/python/paddle/v2/framework/tests/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..28433306d49112cc860f4ace9efca2b2d70deb3f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/.gitignore
@@ -0,0 +1 @@
+image/
diff --git a/python/paddle/v2/framework/tests/op_test.py b/python/paddle/v2/framework/tests/op_test.py
index 89979044f29a301daa7435ff903ae902c981ea1b..a7de01dcddd65b6f0f064e6ce6fcb3e5cad73931 100644
--- a/python/paddle/v2/framework/tests/op_test.py
+++ b/python/paddle/v2/framework/tests/op_test.py
@@ -1,8 +1,11 @@
 import unittest
 import numpy as np
+import random
 import itertools
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
+from paddle.v2.framework.executor import Executor
+from paddle.v2.framework.framework import Program, OpProtoHolder
 
 
 def grad_var_name(var_name):
@@ -12,17 +15,19 @@ def grad_var_name(var_name):
 def create_op(scope, op_type, inputs, outputs, attrs):
     kwargs = dict()
 
+    def __create_var__(name, var_name):
+        scope.var(var_name)
+        kwargs[name].append(var_name)
+
     for in_name, in_dup in Operator.get_op_inputs(op_type):
         if in_name in inputs:
             kwargs[in_name] = []
             if in_dup:
                 sub_in = inputs[in_name]
                 for sub_in_name, _ in sub_in:
-                    var = scope.new_var(sub_in_name)
-                    kwargs[in_name].append(sub_in_name)
+                    __create_var__(in_name, sub_in_name)
             else:
-                var = scope.new_var(in_name)
-                kwargs[in_name].append(in_name)
+                __create_var__(in_name, in_name)
 
     for out_name, out_dup in Operator.get_op_outputs(op_type):
         if out_name in outputs:
@@ -30,11 +35,9 @@ def create_op(scope, op_type, inputs, outputs, attrs):
             if out_dup:
                 sub_out = outputs[out_name]
                 for sub_out_name, _ in sub_out:
-                    var = scope.new_var(sub_out_name)
-                    kwargs[out_name].append(sub_out_name)
+                    __create_var__(out_name, sub_out_name)
             else:
-                var = scope.new_var(out_name)
-                kwargs[out_name].append(out_name)
+                __create_var__(out_name, out_name)
 
     for attr_name in Operator.get_op_attr_names(op_type):
         if attr_name in attrs:
@@ -44,49 +47,51 @@ def create_op(scope, op_type, inputs, outputs, attrs):
 
 
 def set_input(scope, op, inputs, place):
+    def __set_input__(var_name, var):
+        if isinstance(var, tuple) or isinstance(var, np.ndarray):
+            tensor = scope.find_var(var_name).get_tensor()
+            if isinstance(var, tuple):
+                tensor.set_lod(var[1])
+                var = var[0]
+            tensor.set_dims(var.shape)
+            tensor.set(var, place)
+        elif isinstance(var, float):
+            scope.find_var(var_name).set_float(var)
+        elif isinstance(var, int):
+            scope.find_var(var_name).set_int(var)
+
     for in_name, in_dup in Operator.get_op_inputs(op.type()):
         if in_name in inputs:
             if in_dup:
                 sub_in = inputs[in_name]
                 for sub_in_name, sub_in_val in sub_in:
-                    var = scope.find_var(sub_in_name)
-                    tensor = var.get_tensor()
-                    sub_in_array = sub_in_val[0] \
-                        if isinstance(sub_in_val, tuple) else sub_in_val
-                    tensor.set_dims(sub_in_array.shape)
-                    tensor.set(sub_in_array, place)
-                    if isinstance(sub_in_val, tuple):
-                        tensor.set_lod(sub_in_val[1])
+                    __set_input__(sub_in_name, sub_in_val)
             else:
-                var = scope.find_var(in_name)
-                tensor = var.get_tensor()
-                in_val = inputs[in_name]
-                in_array = in_val[0] if isinstance(in_val, tuple) else in_val
-                tensor.set_dims(in_array.shape)
-                tensor.set(in_array, place)
-                if isinstance(in_val, tuple):
-                    tensor.set_lod(in_val[1])
+                __set_input__(in_name, inputs[in_name])
 
 
 def set_output_grad(scope, op, outputs, place):
+    def __set_tensor__(name):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if out_dtype == core.DataType.FP64:
+            data = np.ones(out_tensor.shape(), dtype=np.float64)
+        elif out_dtype == core.DataType.FP32:
+            data = np.ones(out_tensor.shape(), dtype=np.float32)
+        else:
+            raise ValueError("Not supported data type " + str(out_dtype))
+
+        grad_tensor.set(data, place)
+
     for out_name, out_dup in Operator.get_op_outputs(op.type()):
         if out_name in outputs:
             if out_dup:
                 sub_out = outputs[out_name]
                 for sub_out_name, _ in sub_out:
-                    out_tensor = scope.find_var(sub_out_name).get_tensor()
-                    grad_tensor = scope.new_var(grad_var_name(
-                        sub_out_name)).get_tensor()
-                    grad_tensor.set_dims(out_tensor.shape())
-                    data = np.ones(out_tensor.shape(), dtype=np.float32)
-                    grad_tensor.set(data, place)
+                    __set_tensor__(sub_out_name)
             else:
-                out_tensor = scope.find_var(out_name).get_tensor()
-                grad_tensor = scope.new_var(grad_var_name(out_name)).get_tensor(
-                )
-                grad_tensor.set_dims(out_tensor.shape())
-                data = np.ones(out_tensor.shape(), dtype=np.float32)
-                grad_tensor.set(data, place)
+                __set_tensor__(out_name)
 
 
 def get_numeric_gradient(scope,
@@ -96,7 +101,6 @@ def get_numeric_gradient(scope,
                          output_names,
                          delta=0.005,
                          in_place=False):
-
     set_input(scope, op, inputs, core.CPUPlace())
 
     tensor_to_check = scope.find_var(input_to_check).get_tensor()
@@ -115,7 +119,29 @@ def get_numeric_gradient(scope,
 
     tensor_to_check = scope.find_var(input_to_check).get_tensor()
     tensor_size = product(tensor_to_check.get_dims())
-    gradient_flat = np.zeros(shape=(tensor_size, ), dtype='float32')
+    tensor_to_check_dtype = tensor_to_check.dtype()
+    if tensor_to_check_dtype == core.DataType.FP32:
+        tensor_to_check_dtype = np.float32
+    elif tensor_to_check_dtype == core.DataType.FP64:
+        tensor_to_check_dtype = np.float64
+    else:
+        raise ValueError("Not supported data type " + str(
+            tensor_to_check_dtype))
+
+    gradient_flat = np.zeros(shape=(tensor_size, ), dtype=tensor_to_check_dtype)
+
+    def __get_elem__(tensor, i):
+        if tensor_to_check_dtype == np.float32:
+            return tensor.get_float_element(i)
+        else:
+            return tensor.get_double_element(i)
+
+    def __set_elem__(tensor, i, e):
+        if tensor_to_check_dtype == np.float32:
+            tensor.set_float_element(i, e)
+        else:
+            tensor.set_double_element(i, e)
+
     # we only compute gradient of one element each time.
     # we use a for loop to compute the gradient of every element.
     for i in xrange(tensor_size):
@@ -123,20 +149,20 @@ def get_numeric_gradient(scope,
             set_input(scope, op, inputs, core.CPUPlace())
 
         # get one input element throw it's index i.
-        origin = tensor_to_check.get_float_element(i)
+        origin = __get_elem__(tensor_to_check, i)
         # add delta to it, run op and then get the sum of the result tensor.
         x_pos = origin + delta
-        tensor_to_check.set_float_element(i, x_pos)
+        __set_elem__(tensor_to_check, i, x_pos)
         y_pos = get_output()
 
         if in_place:
             set_input(scope, op, inputs, core.CPUPlace())
 
         x_neg = origin - delta
-        tensor_to_check.set_float_element(i, x_neg)
+        __set_elem__(tensor_to_check, i, x_neg)
         y_neg = get_output()
 
-        tensor_to_check.set_float_element(i, origin)
+        __set_elem__(tensor_to_check, i, origin)
         gradient_flat[i] = (y_pos - y_neg) / delta / 2
 
     return gradient_flat.reshape(tensor_to_check.get_dims())
@@ -145,15 +171,20 @@ def get_numeric_gradient(scope,
 def get_backward_op(scope, op, no_grad_set):
     backward_op = core.Operator.backward(op, no_grad_set)
     for input in backward_op.input_vars():
-        var = scope.new_var(input)
+        var = scope.var(input)
         var.get_tensor()
     for output in backward_op.output_vars():
-        var = scope.new_var(output)
+        var = scope.var(output)
         var.get_tensor()
     return backward_op
 
 
-def get_gradient(scope, op, inputs, outputs, grad_name, place,
+def get_gradient(scope,
+                 op,
+                 inputs,
+                 outputs,
+                 grad_names,
+                 place,
                  no_grad_set=None):
     ctx = core.DeviceContext.create(place)
 
@@ -169,53 +200,168 @@ def get_gradient(scope, op, inputs, outputs, grad_name, place,
 
     backward_op.run(scope, ctx)
 
-    out = np.array(scope.find_var(grad_name).get_tensor())
-    return out
+    return [
+        np.array(scope.find_var(grad_name).get_tensor())
+        for grad_name in grad_names
+    ]
+
+
+def append_input_output(block, op_proto, np_list, is_input):
+    '''Create a block variable per input/output slot; return a dict by slot name'''
+    proto_list = op_proto.inputs if is_input else op_proto.outputs
+
+    def create_var(block, name, np_list, var_proto):
+        if name not in np_list:
+            assert var_proto.intermediate, "{} not found".format(name)
+            shape = None  # no value given for this intermediate slot
+            lod_level = None
+        else:
+            np_value = np_list[name]
+            if isinstance(np_value, tuple):  # treated as (array, lod)
+                shape = list(np_value[0].shape)
+                lod_level = len(np_value[1])
+            else:
+                shape = list(np_value.shape)
+                lod_level = 0
+        return block.create_var(
+            dtype="float32", shape=shape, lod_level=lod_level, name=name)
+
+    var_dict = {}
+    for var_proto in proto_list:
+        var_name = str(var_proto.name)
+        if is_input:
+            if (var_name not in np_list) and var_proto.dispensable:
+                continue  # optional input that the test did not provide
+            assert (var_name in np_list) or (var_proto.dispensable), \
+                            "Missing {} as input".format(var_name)
+        if var_proto.duplicable:
+            assert isinstance(np_list[var_name], list), \
+                "Duplicable {} should be set as list".format(var_name)
+            var_list = []  # one variable per (name, value) pair in the slot
+            for (name, np_value) in np_list[var_name]:
+                var_list.append(
+                    create_var(block, name, {name: np_value}, var_proto))
+            var_dict[var_name] = var_list
+        else:
+            var_dict[var_name] = create_var(block, var_name, np_list, var_proto)
+
+    return var_dict
 
 
 class OpTest(unittest.TestCase):
+    @classmethod
+    def setUpClass(cls):
+        '''Save the numpy/python RNG states, then seed both with fixed values'''
+        cls._np_rand_state = np.random.get_state()
+        cls._py_rand_state = random.getstate()
+
+        np.random.seed(123)
+        random.seed(124)
+
+    @classmethod
+    def tearDownClass(cls):
+        '''Restore the numpy/python RNG states saved on the class'''
+        np.random.set_state(cls._np_rand_state)
+        random.setstate(cls._py_rand_state)
+
+    def feed_var(self, input_vars, place):
+        '''Build the {name: LoDTensor} feed map from self.inputs on `place`'''
+        def to_tensor(np_value):
+            # a tuple is (array, lod); duplicable entries may carry one too
+            has_lod = isinstance(np_value, tuple)
+            tensor = core.LoDTensor()
+            tensor.set(np_value[0] if has_lod else np_value, place)
+            if has_lod:
+                tensor.set_lod(np_value[1])
+            return tensor
+
+        feed_map = {}
+        for var_name in input_vars:
+            if isinstance(input_vars[var_name], list):  # duplicable input
+                for name, np_value in self.inputs[var_name]:
+                    feed_map[name] = to_tensor(np_value)
+            else:
+                feed_map[var_name] = to_tensor(self.inputs[var_name])
+        return feed_map
+
     def check_output_with_place(self, place, atol):
-        self.scope = core.Scope()
-        op_inputs = self.inputs if hasattr(self, "inputs") else dict()
-        op_outputs = self.outputs if hasattr(self, "outputs") else dict()
-        op_attrs = self.attrs if hasattr(self, "attrs") else dict()
-        self.op = create_op(self.scope, self.op_type, op_inputs, op_outputs,
-                            op_attrs)
-        if isinstance(place, core.GPUPlace) and not self.op.support_gpu():
-            return
-        set_input(self.scope, self.op, self.inputs, place)
-        ctx = core.DeviceContext.create(place)
-        self.op.run(self.scope, ctx)
+        op_proto = OpProtoHolder.instance().get_op_proto(self.op_type)
+
+        program = Program()
+        block = program.global_block()
+
+        inputs = append_input_output(block, op_proto, self.inputs, True)
+        outputs = append_input_output(block, op_proto, self.outputs, False)
+
+        op = block.append_op(
+            type=self.op_type,
+            inputs=inputs,
+            outputs=outputs,
+            attrs=self.attrs if hasattr(self, "attrs") else dict())
+
+        fetch_list = []
+        for var_name, var in outputs.iteritems():
+            if var_name in self.outputs:
+                if isinstance(var, list):
+                    for v in var:
+                        fetch_list.append(v)
+                else:
+                    fetch_list.append(var)
+
+        feed_map = self.feed_var(inputs, place)
 
-        for out_name, out_dup in Operator.get_op_outputs(self.op.type()):
+        exe = Executor(place)
+        outs = exe.run(program, feed=feed_map, fetch_list=fetch_list)
+
+        for out_name, out_dup in Operator.get_op_outputs(self.op_type):
             if out_name not in self.outputs:
                 continue
 
+            def find_actual(target_name, fetch_list):
+                found = [
+                    i for i, var in enumerate(fetch_list)
+                    if var.name == target_name
+                ]
+                self.assertTrue(
+                    len(found) == 1, "Found {} {}".format(
+                        len(found), target_name))
+                return found[0]
+
             if out_dup:
                 sub_out = self.outputs[out_name]
                 if not isinstance(sub_out, list):
                     raise AssertionError("sub_out type %s is not list",
                                          type(sub_out))
-
                 for sub_out_name, expect in sub_out:
-                    actual = np.array(
-                        self.scope.find_var(sub_out_name).get_tensor())
+                    idx = find_actual(sub_out_name, fetch_list)
+                    actual_t = np.array(outs[idx])
+                    expect_t = expect[0] \
+                        if isinstance(expect, tuple) else expect
                     self.assertTrue(
                         np.allclose(
-                            actual, expect, atol=atol),
-                        "output name: " + out_name + " has diff.")
+                            actual_t, expect_t, atol=atol),
+                        "Output (" + sub_out_name + ") has diff at " +
+                        str(place))
+                    if isinstance(expect, tuple):  # LoD is on the fetched tensor
+                        self.assertListEqual(
+                            outs[idx].lod(), expect[1], "Output (" + sub_out_name
+                            + ") has different lod at " + str(place))
             else:
-                actual = np.array(self.scope.find_var(out_name).get_tensor())
+                idx = find_actual(out_name, fetch_list)
+                actual_t = outs[idx]
                 expect = self.outputs[out_name]
-
+                expect_t = expect[0] if isinstance(expect, tuple) else expect
                 self.assertTrue(
                     np.allclose(
-                        actual, expect, atol=atol),
-                    "output name: " + out_name + " has diff.")
+                        actual_t, expect_t, atol=atol),
+                    "Output (" + out_name + ") has diff at " + str(place))
+                if isinstance(expect, tuple):
+                    self.assertListEqual(actual_t.lod(), expect[1],
+                                         "Output (" + out_name +
+                                         ") has different lod at " + str(place))
 
     def check_output(self, atol=1e-5):
         places = [core.CPUPlace()]
-        if core.is_compile_gpu():
+        if core.is_compile_gpu() and core.op_support_gpu(self.op_type):
             places.append(core.GPUPlace(0))
         for place in places:
             self.check_output_with_place(place, atol)
@@ -244,7 +390,8 @@ class OpTest(unittest.TestCase):
                    output_names,
                    no_grad_set=None,
                    in_place=False,
-                   max_relative_error=0.005):
+                   max_relative_error=0.005,
+                   user_defined_grads=None):
         self.scope = core.Scope()
         op_inputs = self.inputs if hasattr(self, "inputs") else dict()
         op_outputs = self.outputs if hasattr(self, "outputs") else dict()
@@ -257,7 +404,7 @@ class OpTest(unittest.TestCase):
         if not type(output_names) is list:
             output_names = [output_names]
 
-        numeric_grads = [
+        numeric_grads = user_defined_grads or [
             get_numeric_gradient(
                 self.scope,
                 self.op,
@@ -271,11 +418,9 @@ class OpTest(unittest.TestCase):
         ]
 
         cpu_place = core.CPUPlace()
-        cpu_analytic_grads = [
-            get_gradient(self.scope, self.op, self.inputs, self.outputs,
-                         grad_name, cpu_place, no_grad_set)
-            for grad_name in grad_names
-        ]
+        cpu_analytic_grads = get_gradient(self.scope, self.op, self.inputs,
+                                          self.outputs, grad_names, cpu_place,
+                                          no_grad_set)
 
         self.__assert_is_close(numeric_grads, cpu_analytic_grads, grad_names,
                                max_relative_error,
@@ -283,11 +428,9 @@ class OpTest(unittest.TestCase):
 
         if core.is_compile_gpu() and self.op.support_gpu():
             gpu_place = core.GPUPlace(0)
-            gpu_analytic_grads = [
-                get_gradient(self.scope, self.op, self.inputs, self.outputs,
-                             grad_name, gpu_place, no_grad_set)
-                for grad_name in grad_names
-            ]
+            gpu_analytic_grads = get_gradient(self.scope, self.op, self.inputs,
+                                              self.outputs, grad_names,
+                                              gpu_place, no_grad_set)
 
             self.__assert_is_close(numeric_grads, gpu_analytic_grads,
                                    grad_names, max_relative_error,
diff --git a/python/paddle/v2/framework/tests/test_accuracy_op.py b/python/paddle/v2/framework/tests/test_accuracy_op.py
index b6f3a35d6f58ba90b39e3f6296ae635220a2e965..02be9a02910bee3eae63e12cceaa51cf53591539 100644
--- a/python/paddle/v2/framework/tests/test_accuracy_op.py
+++ b/python/paddle/v2/framework/tests/test_accuracy_op.py
@@ -16,7 +16,9 @@ class TestAccuracyOp(OpTest):
                 if ele == label[rowid]:
                     num_correct += 1
                     break
-        self.outputs = {'Accuracy': [num_correct / float(n)]}
+        self.outputs = {
+            'Accuracy': np.array([num_correct / float(n)]).astype("float32")
+        }
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_activation_op.py b/python/paddle/v2/framework/tests/test_activation_op.py
index 8f6d2be17758b7f6604d2db74fe466fb30695bd5..c1668cd00ff6c3782dd17a789e4ad93b92e5209d 100644
--- a/python/paddle/v2/framework/tests/test_activation_op.py
+++ b/python/paddle/v2/framework/tests/test_activation_op.py
@@ -33,6 +33,21 @@ class TestSigmoid(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.008)
 
 
+class TestLogSigmoid(OpTest):
+    def setUp(self):
+        self.op_type = "logsigmoid"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Y': np.log(1 / (1 + np.exp(-self.inputs['X'])))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+
+
 class TestTanh(OpTest):
     def setUp(self):
         self.op_type = "tanh"
@@ -48,6 +63,61 @@ class TestTanh(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.007)
 
 
+class TestTanhShrink(OpTest):
+    def setUp(self):
+        self.op_type = "tanh_shrink"
+        self.inputs = {
+            'X': np.random.uniform(0.1, 1, [10, 17]).astype("float32")
+        }
+        self.outputs = {'Y': self.inputs['X'] - np.tanh(self.inputs['X'])}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.008)
+
+
+class TestHardShrink(OpTest):
+    def setUp(self):
+        self.op_type = "hard_shrink"
+        x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
+        threshold = 0.5
+
+        self.inputs = {'X': x}
+        self.attrs = {'lambda': threshold}
+
+        t = np.copy(x)
+        t[(t >= -threshold) & (t <= threshold)] = 0
+        self.outputs = {'Y': t}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.005)
+
+
+class TestSoftShrink(OpTest):
+    def setUp(self):
+        self.op_type = "softshrink"
+        lambda_val = 0.1
+        self.attrs = {'lambda': lambda_val}
+        self.inputs = {
+            'X': np.random.uniform(0.25, 10, [4, 4]).astype("float32")
+        }
+        y = np.copy(self.inputs['X'])
+        y = (y < -lambda_val) * (y + lambda_val) + (y > lambda_val) * (
+            y - lambda_val)
+        self.outputs = {'Y': y}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
 class TestSqrt(OpTest):
     def setUp(self):
         self.op_type = "sqrt"
@@ -102,8 +172,8 @@ class TestBRelu(OpTest):
     def setUp(self):
         self.op_type = "brelu"
         x = np.random.uniform(-1, 1, [4, 4]).astype("float32")
-        t_min = 1
-        t_max = 4
+        t_min = 1.0
+        t_max = 4.0
         # The same with TestAbs
         x[np.abs(x - t_min) < 0.005] = t_min + 0.02
         x[np.abs(x - t_max) < 0.005] = t_max + 0.02
@@ -122,11 +192,33 @@ class TestBRelu(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.02)
 
 
+class TestRelu6(OpTest):
+    def setUp(self):
+        self.op_type = "relu6"
+        x = np.random.uniform(-1, 1, [4, 10]).astype("float32")
+        threshold = 6.0
+        # The same with TestAbs
+        x[np.abs(x) < 0.005] = 0.02
+        x[np.abs(x - threshold) < 0.005] = threshold + 0.02
+
+        self.inputs = {'X': x}
+        self.attrs = {'threshold': threshold}
+        self.outputs = {
+            'Y': np.minimum(np.maximum(self.inputs['X'], 0), threshold)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+
+
 class TestSoftRelu(OpTest):
     def setUp(self):
         self.op_type = "soft_relu"
         x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
-        threshold = 2
+        threshold = 2.0
         # The same reason with TestAbs
         x[np.abs(x - threshold) < 0.005] = threshold + 0.02
         x[np.abs(x + threshold) < 0.005] = -threshold + 0.02
@@ -144,6 +236,26 @@ class TestSoftRelu(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.02)
 
 
+class TestELU(OpTest):
+    def setUp(self):
+        self.op_type = "elu"
+        x = np.random.uniform(-3, 3, [4, 4]).astype("float32")
+        alpha = 1.
+        # Note: unlike other Relu extensions, point 0 on standard ELU function (i.e. alpha = 1)
+        # is differentiable, so we can skip modifications like x[np.abs(x) < 0.005] = 0.02 here
+        self.inputs = {'X': x}
+        self.attrs = {'alpha': alpha}
+        self.outputs = {
+            'Y': np.maximum(0, x) + np.minimum(0, alpha * (np.exp(x) - 1))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.02)
+
+
 class TestReciprocal(OpTest):
     def setUp(self):
         self.op_type = "reciprocal"
@@ -191,7 +303,7 @@ class TestPow(OpTest):
     def setUp(self):
         self.op_type = "pow"
         self.inputs = {'X': np.random.uniform(1, 2, [11, 17]).astype("float32")}
-        self.attrs = {'factor': 3}
+        self.attrs = {'factor': 3.0}
         self.outputs = {'Y': np.power(self.inputs['X'], 3)}
 
     def test_check_output(self):
@@ -219,5 +331,86 @@ class TestSTanh(OpTest):
         self.check_grad(['X'], 'Y', max_relative_error=0.007)
 
 
+class TestSoftplus(OpTest):
+    def setUp(self):
+        self.op_type = "softplus"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {'Y': np.log(1 + np.exp(self.inputs['X']))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
+class TestSoftsign(OpTest):
+    def setUp(self):
+        self.op_type = "softsign"
+        self.inputs = {
+            'X': np.random.uniform(-1, 1, [11, 17]).astype("float32")
+        }
+        self.outputs = {
+            'Y': np.divide(self.inputs['X'], 1 + np.abs(self.inputs['X']))
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.007)
+
+
+class TestThresholdedRelu(OpTest):
+    def setUp(self):
+        self.op_type = "thresholded_relu"
+        threshold = 0.25
+        self.relative_error = 0.005
+        X = np.random.uniform(-1, 1, [11, 17]).astype("float32")
+
+        # Same reason as TestAbs
+        X[np.abs(X - threshold) < self.relative_error] = threshold + 0.2
+
+        self.inputs = {'X': X}
+        self.attrs = {'threshold': threshold}
+        self.outputs = {'Y': (X > threshold) * X}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=self.relative_error)
+
+
+class TestHardSigmoid(OpTest):
+    def setUp(self):
+        self.op_type = "hard_sigmoid"
+        self.relative_error = 0.002
+
+        X = np.random.uniform(-5, 5, [2, 2]).astype("float32")
+        slope = 0.2
+        offset = 0.5
+        lower_threshold = -offset / slope
+        upper_threshold = (1 - offset) / slope
+
+        self.inputs = {'X': X}
+        # Same reason as TestAbs
+        X[np.abs(X - lower_threshold) < self.relative_error] = \
+            lower_threshold + 0.2
+        X[np.abs(X - upper_threshold) < self.relative_error] = \
+            upper_threshold - 0.2
+
+        temp = X * slope + offset
+        self.outputs = {'Y': np.maximum(0.0, np.minimum(1.0, temp))}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Y', max_relative_error=0.002)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adadelta_op.py b/python/paddle/v2/framework/tests/test_adadelta_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..7105593a98aee9885ba16e3ee0649a6024033ee7
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_adadelta_op.py
@@ -0,0 +1,96 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdadeltaOp1(OpTest):
+    def setUp(self):
+        self.op_type = "adadelta"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The squared gradient is positive
+        avg_squared_grad = np.random.random((102, 105)).astype("float32")
+        # The squared update is positive
+        avg_squared_update = np.random.random((102, 105)).astype("float32")
+
+        rho = 0.95
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'AvgSquaredGrad': avg_squared_grad,
+            'AvgSquaredUpdate': avg_squared_update
+        }
+
+        self.attrs = {'rho': rho, 'epsilon': epsilon}
+
+        avg_squared_grad_out = rho * avg_squared_grad + \
+            (1 - rho) * np.square(grad)
+        update = -np.multiply(
+            np.sqrt(
+                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
+                          epsilon)), grad)
+
+        avg_squared_update_out = rho * avg_squared_update + \
+            (1 - rho) * np.square(update)
+
+        param_out = param + update
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'AvgSquaredGradOut': avg_squared_grad_out,
+            'AvgSquaredUpdateOut': avg_squared_update_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdadeltaOp2(OpTest):
+    '''Test Adadelta op with default attribute values
+    '''
+
+    def setUp(self):
+        self.op_type = "adadelta"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The squared gradient is positive
+        avg_squared_grad = np.random.random((102, 105)).astype("float32")
+        # The squared update is positive
+        avg_squared_update = np.random.random((102, 105)).astype("float32")
+
+        rho = 0.95
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'AvgSquaredGrad': avg_squared_grad,
+            'AvgSquaredUpdate': avg_squared_update
+        }
+
+        avg_squared_grad_out = rho * avg_squared_grad + \
+            (1 - rho) * np.square(grad)
+        update = -np.multiply(
+            np.sqrt(
+                np.divide(avg_squared_update + epsilon, avg_squared_grad_out +
+                          epsilon)), grad)
+
+        avg_squared_update_out = rho * avg_squared_update + \
+            (1 - rho) * np.square(update)
+
+        param_out = param + update
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'AvgSquaredGradOut': avg_squared_grad_out,
+            'AvgSquaredUpdateOut': avg_squared_update_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adagrad_op.py b/python/paddle/v2/framework/tests/test_adagrad_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..66bad349e59b608cb3cc965401c81ef4c716b318
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_adagrad_op.py
@@ -0,0 +1,69 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdagradOp1(OpTest):
+    ''' Test Adagrad operator with explicit attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        epsilon = 1e-8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        moment_out = moment + grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdagradOp2(OpTest):
+    ''' Test Adagrad operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon}
+
+        moment_out = moment + grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adam_op.py b/python/paddle/v2/framework/tests/test_adam_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0d6655d4cbcff8ed3d55df0f4e68fc6591fbb11
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_adam_op.py
@@ -0,0 +1,180 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdamOp1(OpTest):
+    def setUp(self):
+        '''Test Adam Op with supplied attributes
+        '''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.004
+        beta1 = 0.78
+        beta2 = 0.836
+        epsilon = 1e-4
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, self.attrs)
+
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamOp2(OpTest):
+    def setUp(self):
+        '''Test Adam Op without setting self.attrs (op defaults assumed to
+        match the beta1/beta2/epsilon used for the reference; confirm)'''
+        self.op_type = "adam"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.001
+        beta1 = 0.9
+        beta2 = 0.999
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32")
+        }
+
+        attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+        param_out, moment1_out, \
+            moment2_out = adam_step(self.inputs, attributes)
+
+        self.outputs = {
+            'Moment1Out': moment1_out,
+            'Moment2Out': moment2_out,
+            'ParamOut': param_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamOpMultipleSteps(OpTest):
+    def setUp(self):
+        '''Test Adam Operator with supplied attributes
+        '''
+        self.op_type = "adam"
+        self.num_steps = 10
+
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment1 = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The second moment is positive
+        moment2 = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.001
+        beta1 = 0.9
+        beta2 = 0.999
+        epsilon = 1e-8
+        beta1_pow = beta1**10
+        beta2_pow = beta2**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment1': moment1,
+            'Moment2': moment2,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32"),
+            'Beta2Pow': np.array([beta2_pow]).astype("float32")
+        }
+
+        self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
+
+    def test_check_output(self):
+        '''Run num_steps chained Adam updates, checking the op at each step'''
+        for _ in range(self.num_steps):
+            param_out, moment1_out, \
+                moment2_out = adam_step(self.inputs, self.attrs)
+            self.outputs = {
+                'Moment1Out': moment1_out,
+                'Moment2Out': moment2_out,
+                'ParamOut': param_out
+            }
+
+            # Verify output for this step
+            self.check_output()
+
+            # Output of this step becomes input for next step
+            self.inputs['Param'] = param_out
+            self.inputs['Moment1'] = moment1_out
+            self.inputs['Moment2'] = moment2_out
+
+            # Advance each power accumulator by its own beta for the next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
+            self.inputs['Beta2Pow'] *= self.attrs['beta2']
+
+            # Randomize gradient for next step
+            self.inputs['Grad'] = np.random.uniform(
+                -1, 1, (102, 105)).astype("float32")
+
+
+def adam_step(inputs, attributes):
+    '''
+    Simulate one step of the adam optimizer
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return tuple: tuple of output param, moment1 and moment2 (the beta1
+    and beta2 power accumulators are read from inputs but not returned)
+    '''
+    param = inputs['Param']
+    grad = inputs['Grad']
+    moment1 = inputs['Moment1']
+    moment2 = inputs['Moment2']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+    beta2_pow = inputs['Beta2Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment1_out = beta1 * moment1 + (1 - beta1) * grad
+    moment2_out = beta2 * moment2 + (1 - beta2) * np.square(grad)
+    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
+    param_out = param - lr_t * (moment1_out / (np.sqrt(moment2_out) + epsilon))
+    return param_out, moment1_out, moment2_out
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_adamax_op.py b/python/paddle/v2/framework/tests/test_adamax_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e5a15aa3d12bbaae99cae6fcb627a336e48f684
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_adamax_op.py
@@ -0,0 +1,172 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestAdamaxOp1(OpTest):
+    def setUp(self):
+        '''Test Adamax Operator with supplied attributes
+        '''
+        self.op_type = "adamax"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.78
+        beta2 = 0.899
+        epsilon = 1e-5
+        beta1_pow = beta1**10
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                          self.attrs)
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'InfNormOut': inf_norm_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamaxOp2(OpTest):
+    '''Test Adamax Operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "adamax"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.9
+        beta2 = 0.999
+        epsilon = 1e-8
+        beta1_pow = beta1**8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+        param_out, moment_out, inf_norm_out = adamax_step(self.inputs, attrs)
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'InfNormOut': inf_norm_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAdamaxOpMultipleSteps(OpTest):
+    def setUp(self):
+        '''Inputs and attributes for a multi-step Adamax run
+        '''
+        self.op_type = "adamax"
+        self.num_steps = 10
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        moment = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        # The infinity norm is positive
+        inf_norm = np.random.random((102, 105)).astype("float32")
+
+        learning_rate = 0.002
+        beta1 = 0.8
+        beta2 = 0.99
+        epsilon = 1e-5
+        # Start at beta1**1: a value of 1 makes lr / (1 - Beta1Pow) infinite
+        beta1_pow = beta1
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'InfNorm': inf_norm,
+            'LearningRate': np.array([learning_rate]).astype("float32"),
+            'Beta1Pow': np.array([beta1_pow]).astype("float32")
+        }
+
+        self.attrs = {'beta1': beta1, 'beta2': beta2, 'epsilon': epsilon}
+
+    def test_check_output(self):
+        for _ in range(self.num_steps):
+            param_out, moment_out, inf_norm_out = adamax_step(self.inputs,
+                                                              self.attrs)
+
+            self.outputs = {
+                'ParamOut': param_out,
+                'MomentOut': moment_out,
+                'InfNormOut': inf_norm_out
+            }
+
+            # Verify output for this step
+            self.check_output()
+
+            # Output of this step becomes input for next step
+            self.inputs['Param'] = param_out
+            self.inputs['Moment'] = moment_out
+            self.inputs['InfNorm'] = inf_norm_out
+
+            # Update Beta1 Power accumulator for next step
+            self.inputs['Beta1Pow'] *= self.attrs['beta1']
+
+            # Randomize gradient for next step
+            self.inputs['Grad'] = np.random.uniform(
+                -1, 1, (102, 105)).astype("float32")
+
+
+def adamax_step(inputs, attributes):
+    '''
+    Simulate one step of the adamax optimizer
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return tuple: tuple of output param, moment and inf_norm (the beta1
+    power accumulator is read from inputs but not returned)
+    '''
+    param = inputs['Param']
+    grad = inputs['Grad']
+    moment = inputs['Moment']
+    inf_norm = inputs['InfNorm']
+    lr = inputs['LearningRate']
+    beta1_pow = inputs['Beta1Pow']
+
+    beta1 = attributes['beta1']
+    beta2 = attributes['beta2']
+    epsilon = attributes['epsilon']
+
+    moment_out = beta1 * moment + (1 - beta1) * grad
+    inf_norm_out = np.maximum(beta2 * inf_norm + epsilon, np.abs(grad))
+    lr_t = (lr / (1 - beta1_pow))
+    param_out = param - lr_t * np.divide(moment_out, inf_norm_out)
+
+    return param_out, moment_out, inf_norm_out
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_add_op.py b/python/paddle/v2/framework/tests/test_add_op.py
deleted file mode 100644
index 3ca34d9b9fc2b7b54cc25ca0e0d1a08a71e37c52..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_add_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestAddOp(OpTest):
-    def setUp(self):
-        self.op_type = "add"
-        self.inputs = {
-            'X': np.random.random((102, 105)).astype("float32"),
-            'Y': np.random.random((102, 105)).astype("float32")
-        }
-        self.outputs = {'Out': self.inputs['X'] + self.inputs['Y']}
-
-    def test_check_output(self):
-        self.check_output()
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_batch_norm_op.py b/python/paddle/v2/framework/tests/test_batch_norm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7b071c24da59c048f221a8130d9c2b8ad674911
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_batch_norm_op.py
@@ -0,0 +1,197 @@
+import unittest
+import numpy as np
+from op_test import OpTest, get_backward_op, grad_var_name
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+
+
+def _reference_training(x, scale, offset, epsilon, data_format):
+    # Batch-norm forward reference: statistics are reduced over axes
+    # (0, 1, 2), leaving one mean/variance per entry of the last axis.
+    if data_format != "NHWC":
+        raise ValueError("data_format must be NHWC, got %s." % data_format)
+    reduce_axes = (0, 1, 2)
+    per_channel_count = np.size(x) / int(np.shape(x)[-1])
+    mean = np.sum(x, axis=reduce_axes) / per_channel_count
+    var = np.sum(x * x, reduce_axes) / per_channel_count - mean * mean
+    x_hat = (x - mean) / np.sqrt(var + epsilon)
+    return (x_hat * scale + offset), mean, var
+
+
+def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
+    # Use the following formulas to calculate gradients:
+    # grad_scale =
+    #   sum(grad_y * (x - mean)) * rsqrt(var + epsilon)
+    #
+    # grad_offset = sum(grad_y)
+    #
+    # grad_x =
+    #   1/N * scale * rsqrt(var + epsilon) * (N * grad_y - sum(grad_y) -
+    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
+    if data_format != "NHWC":
+        raise ValueError("data_format must be NHWC, got %s." % data_format)
+    grad_x = scale * (grad_y - np.mean(
+        grad_y, axis=(0, 1, 2)) - (x - mean) * np.mean(
+            grad_y * (x - mean), axis=(0, 1, 2)) /
+                      (var + epsilon)) / np.sqrt(var + epsilon)
+    grad_scale = np.sum(grad_y * (x - mean) / np.sqrt(var + epsilon),
+                        axis=(0, 1, 2))
+    grad_offset = np.sum(grad_y, axis=(0, 1, 2))
+    return grad_x, grad_scale, grad_offset
+
+
+def create_or_get_tensor(scope, var_name, var, place):
+    tensor = scope.var(var_name).get_tensor()
+    if var is not None:
+        assert isinstance(var, np.ndarray)
+        tensor.set_lod([[]])
+        tensor.set_dims(var.shape)
+        tensor.set(var, place)
+    return tensor
+
+
+def set_output_grad(scope, outputs, place):
+    def __set_tensor__(name):
+        out_tensor = scope.find_var(name).get_tensor()
+        grad_tensor = scope.var(grad_var_name(name)).get_tensor()
+        out_dtype = out_tensor.dtype()
+        if out_dtype == core.DataType.FP64:
+            data = np.ones(out_tensor.shape(), dtype=np.float64)
+        elif out_dtype == core.DataType.FP32:
+            data = np.ones(out_tensor.shape(), dtype=np.float32)
+        else:
+            raise ValueError("Not supported data type " + str(out_dtype))
+
+        grad_tensor.set(data, place)
+
+    for output in outputs:
+        __set_tensor__(output)
+
+
+class TestBatchNormOp(OpTest):
+    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
+        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
+
+    def test_forward_backward(self):
+        # attr
+        data_format = "NHWC"
+        epsilon = 0.00001
+        momentum = 0.9
+
+        channel_num = 2
+        x_shape = [2, 3, 4, channel_num]
+        scale_shape = [channel_num]
+
+        # input
+        x_val = np.random.random_sample(x_shape).astype(np.float32)
+        scale_val = np.random.random_sample(scale_shape).astype(np.float32)
+        bias_val = np.random.random_sample(scale_shape).astype(np.float32)
+
+        mean = np.zeros(scale_shape).astype(np.float32)
+        variance = np.zeros(scale_shape).astype(np.float32)
+
+        # run forward
+        y_out, saved_mean, var_ref = _reference_training(
+            x_val, scale_val, bias_val, epsilon, data_format)
+
+        # reference values for the running statistics and saved inverse std
+        mean_out = saved_mean * (1 - momentum)
+        variance_out = var_ref * (1 - momentum)
+        saved_variance = 1 / np.sqrt(var_ref + epsilon)
+
+        #  for gradient test
+        y_grad = np.ones(x_shape).astype(np.float32)
+        x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
+            x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, data_format)
+
+        def test_with_place(place):
+            scope = core.Scope()
+
+            # create input
+            x_tensor = create_or_get_tensor(scope, "x_val", x_val, place)
+            scale_tensor = create_or_get_tensor(scope, "scale_val", scale_val,
+                                                place)
+            bias_tensor = create_or_get_tensor(scope, "bias_val", bias_val,
+                                               place)
+            mean_tensor = create_or_get_tensor(scope, "mean", mean, place)
+            variance_tensor = create_or_get_tensor(scope, "variance", variance,
+                                                   place)
+
+            # create output
+            y_tensor = create_or_get_tensor(scope, "y_out", None, place)
+            saved_mean_tensor = create_or_get_tensor(scope, "saved_mean", None,
+                                                     place)
+            saved_variance_tensor = create_or_get_tensor(
+                scope, "saved_variance", None, place)
+            mean_out_tensor = mean_tensor
+            variance_out_tensor = variance_tensor
+
+            batch_norm_op = Operator(
+                "batch_norm",
+                # inputs
+                X="x_val",
+                Scale="scale_val",
+                Bias="bias_val",
+                Mean="mean",
+                Variance="variance",
+                # outputs
+                Y="y_out",
+                MeanOut="mean",
+                VarianceOut="variance",
+                SavedMean="saved_mean",
+                SavedVariance="saved_variance",
+                # attrs
+                is_test=False,
+                tensor_format=data_format,
+                momentum=momentum,
+                epsilon=epsilon)
+
+            ctx = core.DeviceContext.create(place)
+            batch_norm_op.run(scope, ctx)
+
+            # check forward result
+            self.__assert_close(y_tensor, y_out, "y_out")
+            self.__assert_close(saved_mean_tensor, saved_mean, "saved_mean")
+            self.__assert_close(saved_variance_tensor, saved_variance,
+                                "saved_variance")
+            self.__assert_close(mean_out_tensor, mean_out, "mean_out")
+            # FIXME(qiao) figure out why with cuDNN variance_out have a higher error rate
+            if isinstance(place, core.GPUPlace):
+                atol = 5e-2
+            else:
+                atol = 1e-4
+            self.__assert_close(variance_out_tensor, variance_out,
+                                "variance_out", atol)
+
+            # run backward
+            batch_norm_op_grad = get_backward_op(scope, batch_norm_op, set())
+            set_output_grad(
+                scope,
+                ["y_out", "mean", "variance", "saved_mean", "saved_variance"],
+                place)
+            batch_norm_op_grad.run(scope, ctx)
+
+            x_grad_tensor = create_or_get_tensor(scope,
+                                                 grad_var_name("x_val"), None,
+                                                 place)
+            scale_grad_tensor = create_or_get_tensor(scope,
+                                                     grad_var_name("scale_val"),
+                                                     None, place)
+            bias_grad_tensor = create_or_get_tensor(scope,
+                                                    grad_var_name("bias_val"),
+                                                    None, place)
+
+            # check gradient output
+            self.__assert_close(x_grad_tensor, x_grad_ref, "x_grad")
+            self.__assert_close(scale_grad_tensor, scale_grad_ref, "scale_grad")
+            self.__assert_close(bias_grad_tensor, bias_grad_ref, "bias_grad")
+
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu() and core.op_support_gpu("batch_norm"):
+            places.append(core.GPUPlace(0))
+        for place in places:
+            test_with_place(place)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_clip_op.py b/python/paddle/v2/framework/tests/test_clip_op.py
index 5df6a494989017bab0416e0af962b2a85db046ba..a7e1bf174408e4139db0435d9f4bb0c885f76705 100644
--- a/python/paddle/v2/framework/tests/test_clip_op.py
+++ b/python/paddle/v2/framework/tests/test_clip_op.py
@@ -37,14 +37,14 @@ class TestCase1(TestClipOp):
     def initTestCase(self):
         self.shape = (8, 16, 8)
         self.max = 0.7
-        self.min = 0
+        self.min = 0.0
 
 
 class TestCase2(TestClipOp):
     def initTestCase(self):
         self.shape = (8, 16)
-        self.max = 1
-        self.min = 0
+        self.max = 1.0
+        self.min = 0.0
 
 
 class TestCase3(TestClipOp):
diff --git a/python/paddle/v2/framework/tests/test_cond_op.py b/python/paddle/v2/framework/tests/test_cond_op.py
index e7a506f2775a3f1edbacceb91e84ad49a9db67c0..2c7bcc4be46683ed9871b888c9dbabf27887be29 100644
--- a/python/paddle/v2/framework/tests/test_cond_op.py
+++ b/python/paddle/v2/framework/tests/test_cond_op.py
@@ -15,7 +15,7 @@ class PySimpleCond(object):
         for i in range(1, 10, 2):
             array[i] = 0
         self.cond = np.array(array)
-        self.x = np.ones(shape=(10, 1))
+        self.x = np.ones(shape=(10, 1)).astype("float32")
 
     def forward(self):
         self.index_t = np.where(self.cond == 1)
@@ -39,7 +39,7 @@ class PySimpleCondTest(unittest.TestCase):
 
 
 def create_tensor(scope, name, shape, np_data):
-    tensor = scope.new_var(name).get_tensor()
+    tensor = scope.var(name).get_tensor()
     tensor.set_dims(shape)
     tensor.set(np_data, core.CPUPlace())
     return tensor
@@ -74,9 +74,9 @@ class TestCondOp(unittest.TestCase):
         create_tensor(self.scope, "X", [10, 1], x_np_data)
         cond_np_data = self.py_cond.cond.astype("int32")
         create_tensor(self.scope, "cond", [10, 1], cond_np_data)
-        self.scope.new_var("SubScopes")
-        self.scope.new_var("IndexTensors")
-        self.scope.new_var("Out")
+        self.scope.var("SubScopes")
+        self.scope.var("IndexTensors")
+        self.scope.var("Out")
 
     def create_cond_op(self):
         self.condop = CondOp(
@@ -112,7 +112,4 @@ class TestCondOp(unittest.TestCase):
 
 
 if __name__ == "__main__":
-    exit(
-        0
-    )  # FIXME(yuyang18): Since infer_shape has been removed, cond op may error
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv2d_op.py b/python/paddle/v2/framework/tests/test_conv2d_op.py
index 118a5fc1cde5f4a908b065d581956e0855d50a52..2fb808944ac97f2bdcb05336a2205346ded65a4d 100644
--- a/python/paddle/v2/framework/tests/test_conv2d_op.py
+++ b/python/paddle/v2/framework/tests/test_conv2d_op.py
@@ -3,70 +3,56 @@ import numpy as np
 from op_test import OpTest
 
 
+def conv2d_forward_naive(input, filter, group, conv_param):
+    """Reference grouped 2-D convolution (NCHW input, zero padding).
+
+    conv_param holds 'stride' and 'pad', each a [height, width] pair.
+    """
+    in_n, in_c, in_h, in_w = input.shape
+    out_c, f_c, f_h, f_w = filter.shape
+    assert f_c * group == in_c
+    assert np.mod(out_c, group) == 0
+    # Floor division keeps sizes integral under true-division semantics too.
+    sub_out_c = out_c // group
+
+    stride, pad = conv_param['stride'], conv_param['pad']
+    out_h = 1 + (in_h + 2 * pad[0] - f_h) // stride[0]
+    out_w = 1 + (in_w + 2 * pad[1] - f_w) // stride[1]
+    out = np.zeros((in_n, out_c, out_h, out_w))
+
+    input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )),
+                       mode='constant', constant_values=0)
+    for i in range(out_h):
+        rows = slice(i * stride[0], i * stride[0] + f_h)
+        for j in range(out_w):
+            cols = slice(j * stride[1], j * stride[1] + f_w)
+            for g in range(group):
+                window = input_pad[:, g * f_c:(g + 1) * f_c, rows, cols]
+                for k in range(g * sub_out_c, (g + 1) * sub_out_c):
+                    out[:, k, i, j] = np.sum(window * filter[k],
+                                             axis=(1, 2, 3))
+
+    return out
+
+
 class TestConv2dOp(OpTest):
     def setUp(self):
-        self.init_groups()
-        self.op_type = "conv2d"
-        batch_size = 2
-        input_channels = 3
-        input_height = 5
-        input_width = 5
-        output_channels = 6
-        filter_height = 3
-        filter_width = 3
-        stride = 1
-        padding = 0
-        output_height = (input_height - filter_height + 2 * padding
-                         ) / stride + 1
-        output_width = (input_width - filter_width + 2 * padding) / stride + 1
-        input = np.random.random((batch_size, input_channels, input_height,
-                                  input_width)).astype("float32")
-
-        filter = np.random.random(
-            (output_channels, input_channels / self.groups, filter_height,
-             filter_width)).astype("float32")
-        output = np.ndarray(
-            (batch_size, output_channels, output_height, output_width))
+        self.init_op_type()
+        self.init_group()
+        self.init_test_case()
+
+        conv2d_param = {'stride': self.stride, 'pad': self.pad}
+        input = np.random.random(self.input_size).astype("float32")
+        filter = np.random.random(self.filter_size).astype("float32")
+        output = conv2d_forward_naive(input, filter, self.groups, conv2d_param)
 
         self.inputs = {'Input': input, 'Filter': filter}
         self.attrs = {
-            'strides': [1, 1],
-            'paddings': [0, 0],
-            'groups': self.groups
+            'strides': self.stride,
+            'paddings': self.pad,
+            'groups': self.groups,
+            'dilations': self.dilations
         }
-
-        output_group_channels = output_channels / self.groups
-        input_group_channels = input_channels / self.groups
-        for batchid in xrange(batch_size):
-            for group in xrange(self.groups):
-                for outchannelid in range(group * output_group_channels,
-                                          (group + 1) * output_group_channels):
-                    for rowid in xrange(output_height):
-                        for colid in xrange(output_width):
-                            start_h = (rowid * stride) - padding
-                            start_w = (colid * stride) - padding
-                            output_value = 0.0
-                            for inchannelid in range(
-                                    group * input_group_channels,
-                                (group + 1) * input_group_channels):
-                                for frowid in xrange(filter_height):
-                                    for fcolid in xrange(filter_width):
-                                        input_value = 0.0
-                                        inrowid = start_h + frowid
-                                        incolid = start_w + fcolid
-                                        if ((inrowid >= 0 and
-                                             inrowid < input_height) and
-                                            (incolid >= 0 and
-                                             incolid < input_width)):
-                                            input_value = input[batchid][
-                                                inchannelid][inrowid][incolid]
-                                        filter_value = filter[outchannelid][
-                                            inchannelid % input_group_channels][
-                                                frowid][fcolid]
-                                        output_value += input_value * filter_value
-                            output[batchid][outchannelid][rowid][
-                                colid] = output_value
-
         self.outputs = {'Output': output}
 
     def test_check_output(self):
@@ -90,14 +76,47 @@ class TestConv2dOp(OpTest):
             max_relative_error=0.05,
             no_grad_set=set(['Input']))
 
-    def init_groups(self):
+    def init_test_case(self):
+        # self.groups = 1
+        # self.op_type = "conv2d"
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        assert np.mod(self.input_size[1], self.groups) == 0
+        f_c = self.input_size[1] / self.groups
+        self.filter_size = [6, f_c, 3, 3]
+
+    def init_group(self):
         self.groups = 1
 
+    def init_op_type(self):
+        self.op_type = "conv2d"
+
 
 class TestWithGroup(TestConv2dOp):
-    def init_groups(self):
+    def init_group(self):
         self.groups = 3
 
+    def init_op_type(self):
+        self.op_type = "conv2d"
+
+
+class TestCudnn(TestConv2dOp):
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv_cudnn"
+
+
+class TestCudnnWithGroup(TestConv2dOp):
+    def init_group(self):
+        self.groups = 3
+
+    def init_op_type(self):
+        self.op_type = "conv_cudnn"
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py b/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..71ca262f00378381d2d65e87d198d6b1755e9a2b
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_conv2dtranspose_op.py
@@ -0,0 +1,102 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv2dtranspose_forward_naive(input_, filter_, conv2dtranspose_param):
+    """Reference 2-D transposed convolution; input is NCHW, e.g. [2, 3, 5, 5].
+    filter_ is laid out (in_c, out_c, f_h, f_w), e.g. [3, 6, 3, 3].
+    NOTE: the 'pad' entry is not applied, so only zero padding is modelled.
+    """
+    in_n, in_c, in_h, in_w = input_.shape
+    f_c, out_c, f_h, f_w = filter_.shape
+    assert in_c == f_c
+
+    stride = conv2dtranspose_param['stride']
+    out_h = (in_h - 1) * stride[0] + f_h
+    out_w = (in_w - 1) * stride[1] + f_w
+
+    out = np.zeros((in_n, out_c, out_h, out_w))
+
+    for n in range(in_n):
+        for i in range(in_h):
+            i1, i2 = i * stride[0], i * stride[0] + f_h
+            for j in range(in_w):
+                # Columns advance by the width stride, i.e. stride[1].
+                j1, j2 = j * stride[1], j * stride[1] + f_w
+                pixel = np.reshape(input_[n, :, i, j], (in_c, 1, 1))
+                for k in range(out_c):
+                    contrib = np.sum(pixel * filter_[:, k], axis=0)
+                    out[n, k, i1:i2, j1:j2] += contrib
+
+    return out
+
+
+class TestConv2dTransposeOp(OpTest):
+    def setUp(self):
+        # init as conv transpose
+        self.init_op_type()
+
+        # [2, 3, 5, 5] -> kernel [3, 6, 3, 3] -> output [2, 6, 7, 7]
+        self.init_test_case()
+
+        conv2dtranspose_param = {'stride': self.stride, 'pad': self.pad}
+        input_ = np.random.random(self.input_size).astype("float32")
+        filter_ = np.random.random(self.filter_size).astype("float32")
+        output = conv2dtranspose_forward_naive(input_, filter_,
+                                               conv2dtranspose_param)
+        # print 'deconv output py', output, output.shape
+
+        self.inputs = {'Input': input_, 'Filter': filter_}
+        self.attrs = {
+            'strides': self.stride,
+            'paddings': self.pad,
+            # 'dilations': self.dilations
+        }
+        self.outputs = {'Output': output}
+
+    def test_check_output(self):
+        print 'check output here'
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(
+            set(['Input', 'Filter']), 'Output', max_relative_error=0.05)
+
+    def test_check_grad_no_filter(self):
+        self.check_grad(
+            ['Input'],
+            'Output',
+            max_relative_error=0.05,
+            no_grad_set=set(['Filter']))
+
+    def test_check_grad_no_input(self):
+        self.check_grad(
+            ['Filter'],
+            'Output',
+            max_relative_error=0.05,
+            no_grad_set=set(['Input']))
+
+    def init_test_case(self):
+        self.pad = [0, 0]
+        self.stride = [1, 1]
+        self.dilations = [1, 1]
+        self.input_size = [2, 3, 5, 5]  # NCHW
+        f_c = self.input_size[1]
+        self.filter_size = [f_c, 6, 3, 3]
+
+    def init_op_type(self):
+        self.op_type = "conv2dtranspose"
+
+
+"""
+class TestCudnn(TestConv2dOp):
+    def init_group(self):
+        self.groups = 1
+
+    def init_op_type(self):
+        self.op_type = "conv_cudnn"
+"""
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_conv_shift_op.py b/python/paddle/v2/framework/tests/test_conv_shift_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9ab21a06a1c6e8e2d1e936a0b4b8a07a59f57b9
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_conv_shift_op.py
@@ -0,0 +1,47 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def conv_shift_forward(x, y):
+    # out[:, i] = sum_j x[:, (i + j - half) mod M] * y[:, j]  (x wraps around)
+    out = np.zeros_like(x)
+    M, N = x.shape[1], y.shape[1]
+    y_half_width = (N - 1) // 2  # floor division: must stay an integer index
+    for i in range(M):
+        for j in range(N):
+            out[:, i] += x[:, (i + j + M - y_half_width) % M] * y[:, j]
+    return out
+
+
+class TestConvShiftOp(OpTest):
+    def setUp(self):
+        self.op_type = "conv_shift"
+
+        batch_size = 4
+        x_dim = 17
+        y_dim = 3  # must be odd and <= x_dim
+        x = np.random.random((batch_size, x_dim)).astype("float32")
+        y = np.random.random((batch_size, y_dim)).astype("float32")
+        self.inputs = {'X': x, 'Y': y}
+
+        out = conv_shift_forward(x, y)
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.05)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.05, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.05, no_grad_set=set('Y'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
index 1de514dff487158e0823fd628d9b3b50f36fdd9b..e1c45c2674ee9cc7c7240bdd67de05cb218ac287 100644
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -21,7 +21,7 @@ class TestCrossEntropyOp1(OpTest):
 
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"softLabel": False}
+        self.attrs = {"soft_label": False}
 
     def test_check_output(self):
         self.check_output()
@@ -49,7 +49,7 @@ class TestCrossEntropyOp2(OpTest):
 
         self.inputs = {"X": X, "Label": label}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"softLabel": True}
+        self.attrs = {"soft_label": True}
 
     def test_check_output(self):
         self.check_output()
@@ -80,9 +80,9 @@ class TestCrossEntropyOp3(OpTest):
         cross_entropy2 = (-label * np.log(X)).sum(
             axis=1, keepdims=True).astype("float32")
 
-        self.inputs = {"X": X, "Label": label}
+        self.inputs = {"X": X, "Label": label.astype(np.float32)}
         self.outputs = {"Y": cross_entropy}
-        self.attrs = {"softLabel": True}
+        self.attrs = {"soft_label": True}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..674c3fda5c82309bbfbbad936a8b0b26929d42d9
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_decayed_adagrad_op.py
@@ -0,0 +1,71 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestDecayedAdagradOp1(OpTest):
+    ''' Test DecayedAdagrad operator with explicit attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "decayed_adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        decay = 0.80
+        epsilon = 1e-8
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'decay': decay, 'epsilon': epsilon}
+
+        moment_out = decay * moment + (1 - decay) * grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestDecayedAdagradOp2(OpTest):
+    ''' Test DecayedAdagrad operator with default attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "decayed_adagrad"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+        lr = 0.01
+        decay = 0.95
+        epsilon = 1e-6
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Moment': moment,
+            'LearningRate': np.array([lr]).astype("float32")
+        }
+
+        self.attrs = {'decay': decay, 'epsilon': epsilon}
+
+        moment_out = decay * moment + (1 - decay) * grad * grad
+        param_out = param - lr * grad / (np.sqrt(moment_out) + epsilon)
+
+        self.outputs = {'ParamOut': param_out, 'MomentOut': moment_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
index 495863c4562b5a2d6755fb02e21a6b0c845fd7b6..09a9850d054e3d7e6bf6db363fc577bdff8e9f43 100644
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
@@ -10,7 +10,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
         self.assertIsNone(find_var("test"))
 
     def test_create_var_get_var(self):
-        var_a = new_var("var_a")
+        var_a = var("var_a")
         self.assertIsNotNone(var_a)
         self.assertIsNotNone(get_cur_scope().find_var('var_a'))
         enter_local_scope()
@@ -19,7 +19,7 @@ class TestDefaultScopeFuncs(unittest.TestCase):
 
     def test_var_get_int(self):
         def __new_scope__():
-            i = new_var("var_i")
+            i = var("var_i")
             self.assertFalse(i.is_int())
             i.set_int(10)
             self.assertTrue(i.is_int())
diff --git a/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..fa2ccd0c3b74a2ee8b8fd9eb8986cb79ff07c98e
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_dynamic_recurrent_op.py
@@ -0,0 +1,168 @@
+import logging
+import paddle.v2.framework.core as core
+import unittest
+from paddle.v2.framework.op import Operator, DynamicRecurrentOp
+import numpy as np
+
+# for simplicity, just one level of LoD
+lod_py = [[0, 4, 7, 9, 10]]
+input_dim = 30
+num_sents = len(lod_py[0]) - 1
+weight_dim = 15
+
+
+def create_tensor(scope, name, shape, np_data):
+    tensor = scope.var(name).get_tensor()
+    tensor.set_dims(shape)
+    tensor.set(np_data, core.CPUPlace())
+    return tensor
+
+
+class PyRNNStep(object):
+    def __init__(self):
+
+        self.x = np.random.normal(size=(lod_py[0][-1],
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(num_sents,
+                                             input_dim)).astype("float32")
+
+
+class DynamicRecurrentOpTest(unittest.TestCase):
+    '''
+    Test RNNOp
+
+    equation:
+        h_t = \sigma (W x_t + U h_{t-1})
+    weights:
+        - W
+        - U
+    vars:
+        - x
+    states:
+        - h
+    outputs:
+       - h
+    '''
+
+    py = PyRNNStep()
+
+    def forward(self):
+        self.scope = core.Scope()
+        self.create_global_variables()
+        self.create_rnn_op()
+        self.create_step_net()
+        ctx = core.DeviceContext.create(core.CPUPlace())
+        self.rnnop.run(self.scope, ctx)
+        state = self.rnnop.get_state("h@state")
+        print 'state size: ', state.size()
+
+        step_inputs = self.rnnop.get_step_input("x")
+        print "x size ", step_inputs.size()
+        for i in range(step_inputs.size()):
+            print "x %d" % i, np.array(step_inputs.read(i).get_dims())
+        step_outputs = self.rnnop.get_step_output('h@state')
+        print 'step_outputs.size ', step_outputs.size()
+        output = self.scope.find_var("h@state").get_tensor()
+        print 'output', np.array(output).shape
+
+    def create_global_variables(self):
+        # create inlink
+        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
+                                 self.py.x)
+        x_tensor.set_lod(lod_py)
+        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
+        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
+        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
+                      self.py.h_boot)
+        self.scope.var("step_scopes")
+        self.scope.var("h@state")
+
+    def create_rnn_op(self):
+        # create RNNOp
+        self.rnnop = DynamicRecurrentOp(
+            # inputs
+            inputs=["x"],
+            initial_states=["h_boot"],
+            step_net="step_unit",
+            # outputs
+            outputs=["h@state"],
+            step_scopes="step_scopes",
+            # attributes
+            ex_states=["h@pre"],
+            states=["h@state"])
+
+    def create_step_net(self):
+        step_unit = core.Net.create()
+        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
+        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
+        sig_op = Operator("sigmoid", X="sum", Y="h@state")
+
+        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+            step_unit.append_op(op)
+        step_unit.complete_add_op(True)
+        self.rnnop.set_step_unit(step_unit)
+
+    def test_forward(self):
+        print 'test recurrent op forward'
+        pd_output = self.forward()
+        print 'pd_output', pd_output
+
+
+class RecurrentGradientOpTest(unittest.TestCase):
+    py = PyRNNStep()
+
+    def create_forward_op(self):
+        # create RNNOp
+        self.forward_op = DynamicRecurrentOp(
+            # inputs
+            inputs=["x"],
+            initial_states=["h_boot"],
+            step_net="step_unit",
+            # outputs
+            outputs=["h@state"],
+            step_scopes="step_scopes",
+            # attributes
+            ex_states=["h@pre"],
+            states=["h@state"])
+
+    def create_gradient_op(self):
+        a = set()
+        backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a)
+
+    def create_step_net(self):
+        step_unit = core.Net.create()
+        x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
+        h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
+        sig_op = Operator("sigmoid", X="sum", Y="h@state")
+
+        for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
+            step_unit.append_op(op)
+        step_unit.complete_add_op(True)
+        self.forward_op.set_step_unit(step_unit)
+
+    def create_global_variables(self):
+        # create inlink
+        x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim],
+                                 self.py.x)
+        x_tensor.set_lod(lod_py)
+        create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W)
+        create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U)
+        create_tensor(self.scope, "h_boot", [num_sents, input_dim],
+                      self.py.h_boot)
+        self.scope.var("step_scopes")
+        self.scope.var("h@state")
+
+    def test_grad(self):
+        self.scope = core.Scope()
+        self.create_forward_op()
+        self.create_global_variables()
+        self.create_step_net()
+        self.create_gradient_op()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_add_op.py b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
index f3101a709b8bcf58e8682ab3d0ca5217a7f3572d..57daddd5698f77527bc5b78c436065a851867ae0 100644
--- a/python/paddle/v2/framework/tests/test_elementwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_add_op.py
@@ -92,5 +92,33 @@ class TestElementwiseAddOp_broadcast_3(TestElementwiseOp):
         }
 
 
+class TestElementwiseAddOp_rowwise_add_0(TestElementwiseOp):
+    def setUp(self):
+        # X is (2, 3, 4) and Y is (3, 4); the expected sum views Y as
+        # (1, 3, 4) so that it is added to each of X's two leading slices.
+        x = np.random.rand(2, 3, 4).astype(np.float32)
+        y = np.random.rand(3, 4).astype(np.float32)
+
+        self.op_type = "elementwise_add"
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {'Out': x + y.reshape(1, 3, 4)}
+
+
+class TestElementwiseAddOp_rowwise_add_1(TestElementwiseOp):
+    def setUp(self):
+        # Smallest row-wise case: X is (2, 1) and Y is (1,); the expected sum
+        # views Y as (1, 1).
+        x = np.random.rand(2, 1).astype(np.float32)
+        y = np.random.rand(1).astype(np.float32)
+
+        self.op_type = "elementwise_add"
+        self.inputs = {'X': x, 'Y': y}
+
+        self.attrs = {'axis': 1}
+        self.outputs = {'Out': x + y.reshape(1, 1)}
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
index cee4385a8176f7a441a280e3cd40c39ca51493c5..261ca9cb3da90dee91b016fee98f67b4c19356a1 100644
--- a/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_elementwise_mul_op.py
@@ -7,8 +7,8 @@ class ElementwiseMulOp(OpTest):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float32"),
-            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float32")
+            'X': np.random.uniform(0.1, 1, [13, 17]).astype("float64"),
+            'Y': np.random.uniform(0.1, 1, [13, 17]).astype("float64")
         }
         self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
 
@@ -16,23 +16,21 @@ class ElementwiseMulOp(OpTest):
         self.check_output()
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.1)
+        self.check_grad(['X', 'Y'], 'Out')
 
     def test_check_grad_ingore_x(self):
-        self.check_grad(
-            ['Y'], 'Out', max_relative_error=0.1, no_grad_set=set("X"))
+        self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
 
     def test_check_grad_ingore_y(self):
-        self.check_grad(
-            ['X'], 'Out', max_relative_error=0.1, no_grad_set=set('Y'))
+        self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
 
 
 class TestElementwiseMulOp_Vector(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.random((32, )).astype("float32"),
-            'Y': np.random.random((32, )).astype("float32")
+            'X': np.random.random((32, )).astype("float64"),
+            'Y': np.random.random((32, )).astype("float64")
         }
         self.outputs = {'Out': np.multiply(self.inputs['X'], self.inputs['Y'])}
 
@@ -41,8 +39,8 @@ class TestElementwiseMulOp_broadcast_0(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(2).astype(np.float32)
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(2).astype(np.float64)
         }
 
         self.attrs = {'axis': 0}
@@ -55,8 +53,8 @@ class TestElementwiseMulOp_broadcast_1(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(3).astype(np.float32)
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(3).astype(np.float64)
         }
 
         self.attrs = {'axis': 1}
@@ -69,8 +67,8 @@ class TestElementwiseMulOp_broadcast_2(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4).astype(np.float32),
-            'Y': np.random.rand(4).astype(np.float32)
+            'X': np.random.rand(2, 3, 4).astype(np.float64),
+            'Y': np.random.rand(4).astype(np.float64)
         }
 
         self.outputs = {
@@ -82,8 +80,8 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
     def setUp(self):
         self.op_type = "elementwise_mul"
         self.inputs = {
-            'X': np.random.rand(2, 3, 4, 5).astype(np.float32),
-            'Y': np.random.rand(3, 4).astype(np.float32)
+            'X': np.random.rand(2, 3, 4, 5).astype(np.float64),
+            'Y': np.random.rand(3, 4).astype(np.float64)
         }
 
         self.attrs = {'axis': 1}
diff --git a/python/paddle/v2/framework/tests/test_executor_and_mul.py b/python/paddle/v2/framework/tests/test_executor_and_mul.py
new file mode 100644
index 0000000000000000000000000000000000000000..35f775711167ce0d210044ab4cb382db802f39a5
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_executor_and_mul.py
@@ -0,0 +1,36 @@
+import unittest
+from paddle.v2.framework.layers import mul, data
+import paddle.v2.framework.core as core
+from paddle.v2.framework.executor import Executor
+from paddle.v2.framework.framework import g_program
+import numpy
+
+
+class TestExecutor(unittest.TestCase):
+    def test_mul(self):
+        # Build `mul(a, b)` with the layer helpers, then run g_program on CPU.
+        a = data(name='a', shape=[784], data_type='float32')
+        b = data(
+            name='b',
+            shape=[784, 100],
+            data_type='float32',
+            append_batch_size=False)
+        product = mul(x=a, y=b)
+
+        place = core.CPUPlace()
+        a_np = numpy.random.random((100, 784)).astype('float32')
+        b_np = numpy.random.random((784, 100)).astype('float32')
+        feed = {}
+        for var_name, values in (('a', a_np), ('b', b_np)):
+            tensor = core.LoDTensor()
+            tensor.set(values, place)
+            feed[var_name] = tensor
+        results = Executor(place).run(
+            g_program, feed=feed, fetch_list=[product])
+        result = numpy.array(results[0])
+        self.assertEqual((100, 100), result.shape)
+        self.assertTrue(numpy.allclose(result, numpy.dot(a_np, b_np)))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py
deleted file mode 100644
index 9f56fe5049c66aa5fce40ce815105e7871ebc3b2..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestFCOp1(OpTest):
-    def setUp(self):
-        x0 = np.random.random((16, 32)).astype("float32")
-        w0 = np.random.random((32, 10)).astype("float32")
-
-        mul_out0 = np.dot(x0, w0)
-        identity_out = mul_out0
-
-        self.op_type = "fc"
-        self.inputs = {"X": [("X0", x0)], "W": [("W0", w0)]}
-        self.outputs = {"MulOut": [("MulOut0", mul_out0)], "Out": identity_out}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(["X0", "W0"], "Out", max_relative_error=0.01)
-
-
-class TestFCOp2(OpTest):
-    def setUp(self):
-        x0 = np.random.random((16, 4, 8)).astype("float32")
-        x1 = np.random.random((4, 4, 32)).astype("float32")
-        w0 = np.random.random((32, 10)).astype("float32")
-        w1 = np.random.random((32, 10)).astype("float32")
-        b = np.random.random(10).astype("float32")
-
-        mul_out0 = np.dot(x0.reshape(16, 4 * 8), w0)
-        mul_out1 = np.dot(x1.reshape(4 * 4, 32), w1)
-        sum_out = mul_out0 + mul_out1
-        add_out = np.add(sum_out, b)
-        sigmoid_out = 1 / (1 + np.exp(-add_out))
-
-        self.op_type = "fc"
-        self.inputs = {
-            "X": [("X0", x0), ("X1", x1)],
-            "W": [("W0", w0), ("W1", w1)],
-            "B": b
-        }
-        self.attrs = {"xNumColDims": [1, 2], "activation": "sigmoid"}
-        self.outputs = {
-            "MulOut": [("MulOut0", mul_out0), ("MulOut1", mul_out1)],
-            "SumOut": sum_out,
-            "AddOut": add_out,
-            "Out": sigmoid_out
-        }
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(
-            ["X0", "X1", "W0", "W1", "B"], "Out", max_relative_error=0.01)
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_feed_fetch_method.py b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbd659ece0188140e197982ea818d7c3897daf4e
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_feed_fetch_method.py
@@ -0,0 +1,31 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+
+
+class TestFeedFetch(unittest.TestCase):
+    def test_feed_fetch(self):
+        # Round-trip a LoDTensor through slot 0 of the "feed" variable.
+        scope = core.Scope()
+        place = core.CPUPlace()
+        source = np.ones((4, 4, 6)).astype("float32")
+        source[0, 0, 0] = 3
+        source[3, 3, 5] = 10
+        fed = core.LoDTensor([[0, 2, 4]])
+        fed.set(source, place)
+
+        core.set_feed_variable(scope, fed, "feed", 0)
+        fetched = core.get_fetch_variable(scope, "feed", 0)
+
+        # The level-0 LoD offsets must come back unchanged.
+        lod = fetched.lod()
+        for index, offset in enumerate([0, 2, 4]):
+            self.assertEqual(offset, lod[0][index])
+
+        values = np.array(fetched)
+        self.assertEqual(3, values[0, 0, 0])
+        self.assertEqual(10, values[3, 3, 5])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_fill_constant_op.py b/python/paddle/v2/framework/tests/test_fill_constant_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..dff7b615aa378b0ef932df47241db07eace61a86
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_fill_constant_op.py
@@ -0,0 +1,35 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestFillConstantOp1(OpTest):
+    def setUp(self):
+        """fill_constant with an explicit `value` attribute of 3.8."""
+        dims, fill_value = [123, 92], 3.8
+        self.op_type = "fill_constant"
+        self.inputs = {}
+        self.attrs = {'shape': dims, 'value': fill_value}
+        self.outputs = {'Out': np.full(tuple(dims), fill_value)}
+
+    def test_check_output(self):
+        # check_output() is not defined here; it comes from the base class.
+        self.check_output()
+
+
+class TestFillConstantOp2(OpTest):
+    def setUp(self):
+        """fill_constant without `value`; a fill of 0.0 is expected."""
+        dims = [123, 92]
+        self.op_type = "fill_constant"
+        self.inputs = {}
+        self.attrs = {'shape': dims}
+        self.outputs = {'Out': np.full(tuple(dims), 0.0)}
+
+    def test_check_output(self):
+        # check_output() is not defined here; it comes from the base class.
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_fit_a_line.py b/python/paddle/v2/framework/tests/test_fit_a_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..b20e3357894c2bacad83f0a99632710c586602de
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_fit_a_line.py
@@ -0,0 +1,73 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()  # executed once, at the end of this setup
+program = Program()  # the network; layers below are added with program=program
+x = layers.data(
+    name='x',
+    shape=[13],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+# Prediction: a single fc layer of size 1 with act=None, fed by `x`.
+y_predict = layers.fc(input=x,
+                      size=1,
+                      act=None,
+                      program=program,
+                      init_program=init_program)
+# Target value; the training loop feeds it under the name 'y'.
+y = layers.data(
+    name='y',
+    shape=[1],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+# Loss: mean over the square error between prediction and target.
+cost = layers.square_error_cost(
+    input=y_predict, label=y, program=program, init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)  # `opts` is never read afterwards
+
+BATCH_SIZE = 20
+# UCI housing training set, shuffled with buf_size=500, in BATCH_SIZE batches.
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = core.CPUPlace()
+exe = Executor(place)
+# Run init_program once, with nothing fed and nothing fetched.
+exe.run(init_program, feed={}, fetch_list=[])
+
+PASS_NUM = 100
+# Train until the fetched average cost drops below 10.0 (exit status 0);
+# running out of passes first counts as a failure (exit status 1).
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        # List comprehensions (not map()) so np.array gets a real sequence on
+        # Python 3 too; `sample` avoids rebinding the module-level layer `x`.
+        x_data = np.array([sample[0] for sample in data]).astype("float32")
+        y_data = np.array([sample[1] for sample in data]).astype("float32")
+
+        tensor_x = core.LoDTensor()
+        tensor_x.set(x_data, place)
+        tensor_y = core.LoDTensor()
+        tensor_y.set(y_data, place)
+        outs = exe.run(program,
+                       feed={'x': tensor_x,
+                             'y': tensor_y},
+                       fetch_list=[avg_cost])
+        out = np.array(outs[0])
+        if out[0] < 10.0:
+            raise SystemExit(0)  # cost is low enough: treat the run as good
+raise SystemExit(1)  # SystemExit instead of the site-provided exit() helper
diff --git a/python/paddle/v2/framework/tests/test_gaussian_random_op.py b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
index cff5080048bbd34782e52d8b2b7690176f996c99..8b7779667d5e806c06b333527f774c7987ce7e73 100644
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
@@ -14,7 +14,7 @@ class TestGaussianRandomOp(unittest.TestCase):
 
     def gaussian_random_test(self, place):
         scope = core.Scope()
-        scope.new_var('Out').get_tensor()
+        scope.var('Out').get_tensor()
 
         op = Operator(
             "gaussian_random",
diff --git a/python/paddle/v2/framework/tests/test_gradient_checker.py b/python/paddle/v2/framework/tests/test_gradient_checker.py
deleted file mode 100644
index 85117bf9600975ea5d61dfb5b34335792bf6d8b2..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_gradient_checker.py
+++ /dev/null
@@ -1,46 +0,0 @@
-import unittest
-import numpy as np
-import paddle.v2.framework.core as core
-from op_test import get_numeric_gradient
-from op_test import create_op
-
-
-class GetNumericGradientTest(unittest.TestCase):
-    def test_add_op(self):
-        x = np.random.random((10, 1)).astype("float32")
-        y = np.random.random((10, 1)).astype("float32")
-        z = x + y
-        scope = core.Scope()
-        add_op = create_op(scope, "add", {'X': x, 'Y': y}, {'Out': z}, dict())
-        arr = get_numeric_gradient(scope, add_op, {'X': x,
-                                                   'Y': y}, 'X', ['Out'])
-        self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-4)
-
-    def test_softmax_op(self):
-        def stable_softmax(x):
-            """Compute the softmax of vector x in a numerically stable way."""
-            shiftx = x - np.max(x)
-            exps = np.exp(shiftx)
-            return exps / np.sum(exps)
-
-        def label_softmax_grad(Y, dY):
-            dX = Y * 0.0
-            for i in range(Y.shape[0]):
-                d = np.dot(Y[i, :], dY[i, :])
-                dX[i, :] = Y[i, :] * (dY[i, :] - d)
-            return dX
-
-        X = np.random.random((2, 2)).astype("float32")
-        Y = np.apply_along_axis(stable_softmax, 1, X)
-        dY = np.ones(Y.shape)
-        dX = label_softmax_grad(Y, dY)
-
-        scope = core.Scope()
-        softmax_op = create_op(scope, "softmax", {"X": X}, {"Y": Y}, dict())
-
-        arr = get_numeric_gradient(scope, softmax_op, {"X": X}, "X", "Y")
-        np.testing.assert_almost_equal(arr, dX, decimal=1e-2)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_gru_unit_op.py b/python/paddle/v2/framework/tests/test_gru_unit_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..57625362d21905d257f46ff5330841a20438773a
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_gru_unit_op.py
@@ -0,0 +1,115 @@
+import math
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class GRUActivationType(object):  # plain constants holder, not a test case
+    identity = 0
+    sigmoid = 1
+    tanh = 2
+    relu = 3
+
+
+def identity(x):
+    return x  # no-op activation: the input is returned as is
+
+
+def sigmoid(x):
+    return 1. / (1. + np.exp(-x))  # logistic function 1 / (1 + e^-x)
+
+
+def tanh(x):
+    return 2. * sigmoid(2. * x) - 1.  # tanh written via sigmoid() above
+
+
+def relu(x):
+    return np.maximum(x, 0)  # element-wise max(x, 0)
+
+
+class TestGRUUnitOp(OpTest):
+    """gru_unit op checked against a numpy GRU step built in set_outputs."""
+    batch_size = 3
+    frame_size = 5
+    activate = {  # activation code -> numpy reference function
+        GRUActivationType.identity: identity,
+        GRUActivationType.sigmoid: sigmoid,
+        GRUActivationType.tanh: tanh,
+        GRUActivationType.relu: relu,
+    }
+
+    def set_inputs(self):
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        self.op_type = 'gru_unit'
+        self.inputs = {
+            'Input': np.random.uniform(
+                -0.1, 0.1, (batch_size, frame_size * 3)).astype('float32'),
+            'HiddenPrev': np.random.uniform(
+                -0.1, 0.1, (batch_size, frame_size)).astype('float32'),
+            'Weight': np.random.uniform(
+                -1. / math.sqrt(frame_size), 1. / math.sqrt(frame_size),
+                (frame_size, frame_size * 3)).astype('float32'),
+        }
+        self.attrs = {
+            'activation': GRUActivationType.tanh,
+            'gate_activation': GRUActivationType.sigmoid
+        }
+
+    def set_outputs(self):
+        # Reference GRU step in numpy ('Bias' is optional: zeros when absent).
+        batch_size = self.batch_size
+        frame_size = self.frame_size
+        x = self.inputs['Input']
+        h_p = self.inputs['HiddenPrev']
+        w = self.inputs['Weight']
+        b = self.inputs['Bias'] if 'Bias' in self.inputs else np.zeros(
+            (1, frame_size * 3))
+        g = x + np.tile(b, (batch_size, 1))
+        w_u_r = w.flatten()[:frame_size * frame_size * 2].reshape(
+            (frame_size, frame_size * 2))
+        u_r = self.activate[self.attrs['gate_activation']](np.dot(
+            h_p, w_u_r) + g[:, :frame_size * 2])
+        u = u_r[:, :frame_size]  # update gate
+        r = u_r[:, frame_size:frame_size * 2]  # reset gate
+        r_h_p = r * h_p
+        w_c = w.flatten()[frame_size * frame_size * 2:].reshape(
+            (frame_size, frame_size))
+        c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
+                                                    g[:, frame_size * 2:])
+        g = np.hstack((u_r, c))
+        h = u * h_p + (1 - u) * c
+        self.outputs = {'Gate': g, 'ResetHiddenPrev': r_h_p, 'Hidden': h}
+
+    def setUp(self):
+        self.set_inputs()
+        self.set_outputs()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['Input', 'HiddenPrev', 'Weight'], ['Hidden'],
+                        max_relative_error=0.007)
+
+
+class TestGRUUnitOpWithBias(TestGRUUnitOp):
+    def set_inputs(self):
+        # Start from the base inputs, add a (1, 3 * frame_size) 'Bias' and
+        # replace the attrs so that 'activation' is identity.
+        super(TestGRUUnitOpWithBias, self).set_inputs()
+        bias_shape = (1, self.frame_size * 3)
+        self.inputs['Bias'] = np.random.uniform(-0.1, 0.1,
+                                                bias_shape).astype('float32')
+        self.attrs = {
+            'activation': GRUActivationType.identity,
+            'gate_activation': GRUActivationType.sigmoid
+        }
+
+    def test_check_grad(self):
+        grad_inputs = ['Input', 'HiddenPrev', 'Weight', 'Bias']
+        self.check_grad(grad_inputs, ['Hidden'], max_relative_error=0.007)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_identity_op.py b/python/paddle/v2/framework/tests/test_identity_op.py
deleted file mode 100644
index 26cec1fcc3ad003281c9c41571d475b55bd30026..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_identity_op.py
+++ /dev/null
@@ -1,20 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestIdentityOp(OpTest):
-    def setUp(self):
-        self.op_type = "identity"
-        self.inputs = {'X': np.random.random((10, 10)).astype("float32")}
-        self.outputs = {'Y': self.inputs['X']}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad(self):
-        self.check_grad(['X'], 'Y')
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_increment_op.py b/python/paddle/v2/framework/tests/test_increment_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e174272b05b9413cc2bc1e099c4dd17899829e76
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_increment_op.py
@@ -0,0 +1,41 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestIncrementOpPositiveStep(OpTest):
+    """Increment op with a positive `step` attribute (14.8)."""
+
+    def setUp(self):
+        x, step = np.random.random((10, 10)).astype("float32"), 14.8
+        self.op_type = "increment"
+        self.inputs = {'X': x}
+        self.attrs = {'step': step}
+        self.outputs = {'Out': x + step}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestIncrementOpNegativeStep(OpTest):
+    """Increment op with a negative `step` attribute (-3.8)."""
+
+    def setUp(self):
+        x, step = np.random.random((10, 10)).astype("float32"), -3.8
+        self.op_type = "increment"
+        self.inputs = {'X': x}
+        self.attrs = {'step': step}
+        self.outputs = {'Out': x + step}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_infer_shape.py b/python/paddle/v2/framework/tests/test_infer_shape.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cfb9e6687f733353cfdbfbd1ad830c2bed8463b
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_infer_shape.py
@@ -0,0 +1,69 @@
+import unittest
+
+import paddle.v2.framework.core as core
+
+
+class TestInferShape(unittest.TestCase):
+    def test_sum_op(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+
+        shape = [10, 20]
+
+        # declare two LOD_TENSOR inputs that share `shape`, plus output "out"
+        x1 = block.var("x1")
+        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x1.set_shape(shape)
+        x2 = block.var("x2")
+        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x2.set_shape(shape)
+
+        out = block.var("out")  # no shape set here; infer_shape must fill it
+        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
+
+        # append a "sum" op that reads x1 and x2 and writes out
+        sum_op_desc = block.append_op()
+        sum_op_desc.set_type("sum")
+        sum_op_desc.set_input("X", ["x1", "x2"])
+        sum_op_desc.set_output("Out", ["out"])
+
+        sum_op_desc.infer_shape(block)
+        self.assertEqual(out.shape(), shape)  # output shape equals input shape
+
+    def test_mul_op(self):
+        prog = core.ProgramDesc()
+        self.assertIsNotNone(prog)
+        block = prog.block(0)
+        self.assertIsNotNone(block)
+
+        x_shape = [10, 20]
+        y_shape = [20, 30]
+
+        # declare LOD_TENSOR inputs "x" and "y", plus output "out"
+        x1 = block.var("x")
+        x1.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x1.set_shape(x_shape)
+        x2 = block.var("y")
+        x2.set_type(core.VarDesc.VarType.LOD_TENSOR)
+        x2.set_shape(y_shape)
+
+        out = block.var("out")  # no shape set here; infer_shape must fill it
+        out.set_type(core.VarDesc.VarType.LOD_TENSOR)
+
+        # append a "mul" op that reads x and y and writes out
+        mul_op_desc = block.append_op()
+        mul_op_desc.set_type("mul")
+        mul_op_desc.set_input("X", ["x"])
+        mul_op_desc.set_input("Y", ["y"])
+        mul_op_desc.set_output("Out", ["out"])
+        mul_op_desc.set_attr("x_num_col_dims", 1)
+        mul_op_desc.set_attr("y_num_col_dims", 1)
+
+        mul_op_desc.infer_shape(block)
+        self.assertEqual(out.shape(), [x_shape[0], y_shape[1]])  # [10, 30]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_layers.py b/python/paddle/v2/framework/tests/test_layers.py
new file mode 100644
index 0000000000000000000000000000000000000000..7aedb985f98f2d8953e0968d19ece9c70d792246
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_layers.py
@@ -0,0 +1,164 @@
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.nets as nets
+from paddle.v2.framework.framework import Program, g_program
+import paddle.v2.framework.core as core
+import unittest
+
+
+class TestBook(unittest.TestCase):  # each test only builds and prints a Program
+    def test_fit_a_line(self):
+        program = Program()
+        x = layers.data(
+            name='x', shape=[13], data_type='float32', program=program)
+        y_predict = layers.fc(input=x, size=1, act=None, program=program)
+
+        y = layers.data(
+            name='y', shape=[1], data_type='float32', program=program)
+        cost = layers.square_error_cost(
+            input=y_predict, label=y, program=program)
+
+        avg_cost = layers.mean(x=cost, program=program)
+        self.assertIsNotNone(avg_cost)
+        program.append_backward(avg_cost)
+        print(str(program))
+
+    def test_recognize_digits_mlp(self):
+        program = Program()
+
+        # Every layer below is given program=program explicitly.
+        images = layers.data(
+            name='pixel', shape=[784], data_type='float32', program=program)
+        label = layers.data(
+            name='label', shape=[1], data_type='int32', program=program)
+        hidden1 = layers.fc(input=images, size=128, act='relu', program=program)
+        hidden2 = layers.fc(input=hidden1, size=64, act='relu', program=program)
+        predict = layers.fc(input=hidden2,
+                            size=10,
+                            act='softmax',
+                            program=program)
+        cost = layers.cross_entropy(input=predict, label=label, program=program)
+        avg_cost = layers.mean(x=cost, program=program)
+        self.assertIsNotNone(avg_cost)
+        print(str(program))
+
+    def test_simple_conv2d(self):
+        program = Program()
+        images = layers.data(
+            name='pixel', shape=[3, 48, 48], data_type='int32', program=program)
+        layers.conv2d(
+            input=images, num_filters=3, filter_size=[4, 4], program=program)
+
+        print(str(program))
+
+    def test_recognize_digits_conv(self):
+        program = Program()
+
+        images = layers.data(
+            name='pixel',
+            shape=[1, 28, 28],
+            data_type='float32',
+            program=program)
+        label = layers.data(
+            name='label', shape=[1], data_type='int32', program=program)
+        conv_pool_1 = nets.simple_img_conv_pool(
+            input=images,
+            filter_size=5,
+            num_filters=2,
+            pool_size=2,
+            pool_stride=2,
+            act="relu",
+            program=program)
+        conv_pool_2 = nets.simple_img_conv_pool(
+            input=conv_pool_1,
+            filter_size=5,
+            num_filters=4,
+            pool_size=2,
+            pool_stride=2,
+            act="relu",
+            program=program)
+
+        predict = layers.fc(input=conv_pool_2,
+                            size=10,
+                            act="softmax",
+                            program=program)
+        cost = layers.cross_entropy(input=predict, label=label, program=program)
+        avg_cost = layers.mean(x=cost, program=program)
+
+        program.append_backward(avg_cost)
+
+        print(str(program))
+
+    def test_word_embedding(self):
+        program = Program()
+        dict_size = 10000
+        embed_size = 32
+        first_word = layers.data(
+            name='firstw', shape=[1], data_type='int32', program=program)
+        second_word = layers.data(
+            name='secondw', shape=[1], data_type='int32', program=program)
+        third_word = layers.data(
+            name='thirdw', shape=[1], data_type='int32', program=program)
+        forth_word = layers.data(
+            name='forthw', shape=[1], data_type='int32', program=program)
+        next_word = layers.data(
+            name='nextw', shape=[1], data_type='int32', program=program)
+
+        embed_param_attr_1 = {
+            'name': 'shared_w',
+            'init_attr': {
+                'max': 1.0,
+                'type': 'uniform_random',
+                'min': -1.0
+            }
+        }
+        embed_param_attr_2 = {'name': 'shared_w'}
+        # All four embeddings name their parameter 'shared_w'.
+        embed_first = layers.embedding(
+            input=first_word,
+            size=[dict_size, embed_size],
+            data_type='float32',
+            param_attr=embed_param_attr_1,
+            program=program)
+        embed_second = layers.embedding(
+            input=second_word,
+            size=[dict_size, embed_size],
+            data_type='float32',
+            param_attr=embed_param_attr_2,
+            program=program)
+
+        embed_third = layers.embedding(
+            input=third_word,
+            size=[dict_size, embed_size],
+            data_type='float32',
+            param_attr=embed_param_attr_2,
+            program=program)
+        embed_forth = layers.embedding(
+            input=forth_word,
+            size=[dict_size, embed_size],
+            data_type='float32',
+            param_attr=embed_param_attr_2,
+            program=program)
+
+        concat_embed = layers.concat(
+            input=[embed_first, embed_second, embed_third, embed_forth],
+            axis=1,
+            program=program)
+
+        hidden1 = layers.fc(input=concat_embed,
+                            size=256,
+                            act='sigmoid',
+                            program=program)
+        predict_word = layers.fc(input=hidden1,
+                                 size=dict_size,
+                                 act='softmax',
+                                 program=program)
+        cost = layers.cross_entropy(
+            input=predict_word, label=next_word, program=program)
+        avg_cost = layers.mean(x=cost, program=program)
+        self.assertIsNotNone(avg_cost)
+
+        print(str(program))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lookup_table_op.py b/python/paddle/v2/framework/tests/test_lookup_table_op.py
index b259bb67e832adcb31b0ab4e992738be2b85f884..2c48f9bf93b939aa631cd54e8fb14b5cba22f2e0 100644
--- a/python/paddle/v2/framework/tests/test_lookup_table_op.py
+++ b/python/paddle/v2/framework/tests/test_lookup_table_op.py
@@ -8,7 +8,8 @@ class TestLookupTableOp(OpTest):
         self.op_type = "lookup_table"
         table = np.random.random((17, 31)).astype("float32")
         ids = np.random.randint(0, 17, 4).astype("int32")
-        self.inputs = {'W': table, 'Ids': ids}
+        ids_expand = np.expand_dims(ids, axis=1)
+        self.inputs = {'W': table, 'Ids': ids_expand}
         self.outputs = {'Out': table[ids]}
 
     def test_check_output(self):
diff --git a/python/paddle/v2/framework/tests/test_lstm_op.py b/python/paddle/v2/framework/tests/test_lstm_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..93a4e450e916716e27573d192bace73f271733de
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_lstm_op.py
@@ -0,0 +1,189 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+SIGMOID_THRESHOLD_MIN = -40.0
+SIGMOID_THRESHOLD_MAX = 13.0
+EXP_MAX_INPUT = 40.0
+
+
+def identity(x):
+    return x
+
+
+def sigmoid(x):  # logistic function of x clipped to the SIGMOID thresholds
+    y = np.copy(x)  # copy so the clipping below does not modify x
+    y[x < SIGMOID_THRESHOLD_MIN] = SIGMOID_THRESHOLD_MIN
+    y[x > SIGMOID_THRESHOLD_MAX] = SIGMOID_THRESHOLD_MAX
+    return 1. / (1. + np.exp(-y))
+
+
+def tanh(x):  # tanh(x) evaluated as 2 / (1 + exp(-2x)) - 1
+    y = -2. * x
+    y[y > EXP_MAX_INPUT] = EXP_MAX_INPUT  # cap the argument passed to exp
+    return (2. / (1. + np.exp(y))) - 1.
+
+
+def relu(x):
+    return np.maximum(x, 0)
+
+
+ACTVATION = {
+    'identity': identity,
+    'sigmoid': sigmoid,
+    'tanh': tanh,
+    'relu': relu
+}
+
+
+def lstm(
+        input,  # T x 4D
+        lod,  # 1 x N
+        h0=None,  # N x D
+        c0=None,  # N x D
+        w_h=None,  # D x 4D
+        w_b=None,  # 1 x 4D
+        w_c=None,  # 1 x 3D
+        is_reverse=False,
+        act_gate=None,
+        act_cell=None,
+        act_cand=None):
+    def _step(x, w_h, w_c, h_pre, c_pre, act_gate, act_cell, act_cand):
+        g = np.dot(h_pre, w_h)  # 1 x 4D
+        g = g + x
+        g = np.reshape(g, (1, g.size))
+        c_tmp, g_i, g_f, g_o = np.split(g, 4, axis=1)
+        if w_c is None:
+            g_i = act_gate(g_i)  # 1 x D
+            g_f = act_gate(g_f)  # 1 x D
+        else:
+            w_ic, w_fc, w_oc = np.split(w_c, 3, axis=1)
+            g_i = act_gate(g_i + w_ic * c_pre)  # 1 x D
+            g_f = act_gate(g_f + w_fc * c_pre)  # 1 x D
+        c = g_f * c_pre + g_i * act_cand(c_tmp)  # 1 x D
+
+        if w_c is None:
+            g_o = act_gate(g_o)  # 1 x D
+        else:
+            _, _, w_oc = np.split(w_c, 3, axis=1)
+            g_o = act_gate(g_o + w_oc * c)  # 1 x D
+        h = g_o * act_cell(c)
+        bg = np.concatenate((act_cand(c_tmp), g_i, g_f, g_o), axis=1)
+        return h, c, bg
+
+    def _reverse(x, lod):
+        y = np.zeros_like(x)
+        for i in range(len(lod) - 1):
+            b, e = lod[i], lod[i + 1]
+            y[b:e, :] = np.flip(x[b:e, :], 0)
+        return y
+
+    offset = lod[0]
+    batch_size = len(offset) - 1
+    hidden = []
+    cell = []
+    gate = []
+    input = _reverse(input, offset) if is_reverse else input
+    if w_b is not None:
+        input = input + np.tile(w_b, (offset[-1], 1))
+    for i in range(batch_size):
+        # compute one sequence
+        seq_len = offset[i + 1] - offset[i]
+        x = input[offset[i]:offset[i + 1], :]
+        h_pre = h0[i]  # 1 x D
+        c_pre = c0[i]  # 1 x D
+        for j in range(seq_len):
+            # compute one step
+            h_pre, c_pre, g_pre = _step(x[j], w_h, w_c, h_pre, c_pre, act_gate,
+                                        act_cell, act_cand)
+            hidden.append(h_pre.flatten())
+            cell.append(c_pre.flatten())
+            gate.append(g_pre.flatten())
+
+    hidden = np.array(hidden).astype("float64")
+    cell = np.array(cell).astype("float64")
+    gate = np.array(gate).astype("float64")
+
+    hidden = _reverse(hidden, offset) if is_reverse else hidden
+    cell = _reverse(cell, offset) if is_reverse else cell
+
+    assert gate.shape == input.shape
+    assert hidden.shape == (input.shape[0], input.shape[1] / 4)
+    assert cell.shape == (input.shape[0], input.shape[1] / 4)
+    return hidden, cell, gate
+
+
+class TestLstmOp(OpTest):
+    def set_data(self):
+        self.lod = [[0, 2, 6, 9]]
+        self.D = 64
+        self.sort_idx = [2, 6, 0, 3, 7, 1, 4, 8, 5]
+
+        self.act_gate = "sigmoid"
+        self.act_cell = "tanh"
+        self.act_cand = "tanh"
+
+        self.is_reverse = False
+
+    def setUp(self):
+        self.set_data()
+        self.op_type = "lstm"
+        # T: total number of rows (last LoD offset); N: number of sequences.
+        T = self.lod[0][-1]
+        N = len(self.lod[0]) - 1
+
+        x = np.random.normal(size=(T, 4 * self.D)).astype("float64")
+        h0 = np.zeros((N, self.D)).astype("float64")
+        c0 = np.zeros((N, self.D)).astype("float64")
+        w = np.random.normal(size=(self.D, 4 * self.D)).astype("float64")
+        b = np.random.normal(size=(1, 7 * self.D)).astype("float64")
+        # b packs the 4D gate bias followed by the 3D peephole weights.
+        w_b = b[:, 0:4 * self.D]
+        w_c = b[:, 4 * self.D:]
+        h, c, g = lstm(x, self.lod, h0, c0, w, w_b, w_c, self.is_reverse,
+                       ACTVATION[self.act_gate], ACTVATION[self.act_cell],
+                       ACTVATION[self.act_cand])
+        # Reorder the reference gate rows by sort_idx for 'BatchGate'.
+        g_sort = np.zeros_like(x)
+        for i, j in enumerate(self.sort_idx):
+            g_sort[i, :] = g[j, :]
+
+        self.inputs = {
+            'Input': (x, self.lod),
+            'H0': h0,
+            'C0': c0,
+            'Weight': w,
+            'Bias': b
+        }
+        self.outputs = {
+            'Hidden': (h, self.lod),
+            'Cell': (c, self.lod),
+            'BatchGate': g_sort
+        }
+        self.attrs = {
+            'usePeepholes': True,
+            'isReverse': self.is_reverse,
+            'gateActivation': self.act_gate,  # same names as the reference
+            'cellActivation': self.act_cell,
+            'candidateActivation': self.act_cand
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLstmOpRerverse(TestLstmOp):
+    def set_data(self):
+        self.lod = [[0, 2, 6, 9]]
+        self.D = 64
+        self.sort_idx = [2, 6, 0, 3, 7, 1, 4, 8, 5]
+
+        self.act_gate = "sigmoid"
+        self.act_cell = "tanh"
+        self.act_cand = "tanh"
+
+        self.is_reverse = True
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_lstm_unit_op.py b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
index 8ce65bfc31d9fa2d3988759a197e2f497b8161b1..365ee560e14e322cd8cfcdc068a8b004f6e365ad 100644
--- a/python/paddle/v2/framework/tests/test_lstm_unit_op.py
+++ b/python/paddle/v2/framework/tests/test_lstm_unit_op.py
@@ -14,8 +14,8 @@ def tanh_np(x):
 class LstmUnitTest(OpTest):
     def setUp(self):
         self.op_type = "lstm_unit"
-        x_np = np.random.normal(size=(5, 16)).astype("float32")
-        c_np = np.random.normal(size=(5, 4)).astype("float32")
+        x_np = np.random.normal(size=(5, 16)).astype("float64")
+        c_np = np.random.normal(size=(5, 4)).astype("float64")
         i_np, f_np, o_np, j_np = np.split(x_np, 4, axis=1)
         forget_bias_np = 0.
         self.attrs = {'forget_bias': 0.}
@@ -31,7 +31,7 @@ class LstmUnitTest(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['X', 'C_prev'], ['C', 'H'], max_relative_error=0.01)
+        self.check_grad(['X', 'C_prev'], ['C', 'H'])
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py b/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..63378cbc4ec95d7d3c49a92f750b55a8dbc22414
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_margin_rank_loss_op.py
@@ -0,0 +1,39 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMarginRankLossOp(OpTest):
+    def setUp(self):  # build random inputs and the NumPy reference outputs
+        self.op_type = "margin_rank_loss"
+        batch_size = 5
+        margin = 0.5
+        # labels_{i} = {-1, 1}
+        label = 2 * np.random.randint(
+            0, 2, size=(batch_size, 1)).astype("float32") - 1
+        x1 = np.random.random((batch_size, 1)).astype("float32")
+        x2 = np.random.random((batch_size, 1)).astype("float32")
+        # loss = max(0, -label * (x1 - x2) + margin)
+        loss = -label * (x1 - x2) + margin
+        loss = np.where(loss > 0, loss, 0)
+        act = np.where(loss > 0, 1., 0.)  # 1 where the loss is positive
+
+        self.attrs = {'margin': margin}
+        self.inputs = {'Label': label, 'X1': x1, 'X2': x2}
+        self.outputs = {'Activated': act, 'Out': loss}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X1", "X2"], "Out")
+
+    def test_check_grad_ignore_x1(self):  # set('X1') would be {'X', '1'}
+        self.check_grad(["X2"], "Out", no_grad_set=set(['X1']))
+
+    def test_check_grad_ignore_x2(self):  # set('X2') would be {'X', '2'}
+        self.check_grad(["X1"], "Out", no_grad_set=set(['X2']))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_matmul_op.py b/python/paddle/v2/framework/tests/test_matmul_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d51572c8ab7c44fa0c6e83e50b56f05780530c61
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_matmul_op.py
@@ -0,0 +1,119 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def generate_compatible_shapes(dim_X, dim_Y, transpose_X, transpose_Y):
+    BATCH_SIZE = 2
+    M = 3
+    N = 4
+    K = 5
+    if (dim_X == 1 and transpose_X) or (dim_Y == 1 and transpose_Y):
+        K = 1
+    if dim_X == 1:
+        if transpose_X:
+            shape_X = [M]
+        else:
+            shape_X = [K]
+    if dim_Y == 1:
+        if transpose_Y:
+            shape_Y = [N]
+        else:
+            shape_Y = [K]
+    if dim_X >= 2:
+        if transpose_X:
+            shape_X = [K, M]
+        else:
+            shape_X = [M, K]
+    if dim_X == 3:
+        shape_X = [BATCH_SIZE] + shape_X
+    if dim_Y >= 2:
+        if transpose_Y:
+            shape_Y = [N, K]
+        else:
+            shape_Y = [K, N]
+    if dim_Y == 3:
+        shape_Y = [BATCH_SIZE] + shape_Y
+    return shape_X, shape_Y
+
+
+def reference_matmul(X, Y, transpose_X=False, transpose_Y=False):
+    """Reference forward implementation using np.matmul."""
+    # np.matmul does not support the transpose flags, so we manually
+    # transpose X and Y appropriately.
+    if transpose_X:
+        if X.ndim == 1:
+            X = X.reshape((X.size, 1))
+        elif X.ndim == 2:
+            X = X.T
+        elif X.ndim == 3:
+            X = np.transpose(X, (0, 2, 1))
+        else:
+            raise ValueError('X must have between 1 and 3 dimensions')
+    if transpose_Y:
+        if Y.ndim == 1:
+            Y = Y.reshape((1, Y.size))
+        elif Y.ndim == 2:
+            Y = Y.T
+        elif Y.ndim == 3:
+            Y = np.transpose(Y, (0, 2, 1))
+        else:
+            raise ValueError('Y must have between 1 and 3 dimensions')
+    Out = np.matmul(X, Y)
+    if not Out.shape:
+        # We do not support 0-dimensional Tensors (scalars). So where
+        # np.matmul outputs a scalar, we must convert to a Tensor of
+        # shape (1, ) instead.
+        # Everywhere else, we are compatible with np.matmul.
+        Out = np.array([Out], dtype="float32")
+    return Out
+
+
+class Generator(object):
+    def setUp(self):
+        self.op_type = "matmul"
+        X = np.random.random(self.shape_X).astype("float32")
+        Y = np.random.random(self.shape_Y).astype("float32")
+        Out = reference_matmul(X, Y, self.transpose_X, self.transpose_Y)
+        self.inputs = {'X': X, 'Y': Y}
+        self.attrs = {
+            'transpose_X': self.transpose_X,
+            'transpose_Y': self.transpose_Y
+        }
+        self.outputs = {'Out': Out}
+
+    def test_check_output(self):
+        self.check_output(atol=1e-2)
+
+    def test_check_grad_normal(self):
+        self.check_grad(['X', 'Y'], 'Out', max_relative_error=0.5)
+
+    def test_check_grad_ignore_x(self):
+        self.check_grad(
+            ['Y'], 'Out', max_relative_error=0.5, no_grad_set=set("X"))
+
+    def test_check_grad_ignore_y(self):
+        self.check_grad(
+            ['X'], 'Out', max_relative_error=0.5, no_grad_set=set('Y'))
+
+
+# Generate test cases for all possibilities
+for dim_X in [1, 2, 3]:
+    for dim_Y in [1, 2, 3]:
+        for transpose_X in [False, True]:
+            for transpose_Y in [False, True]:
+                test_name = (
+                    'TestMatMulOp_dimX_{}_dim_Y_{}_transX_{}_transY_{}'.format(
+                        dim_X, dim_Y, transpose_X, transpose_Y))
+                shape_X, shape_Y = generate_compatible_shapes(
+                    dim_X, dim_Y, transpose_X, transpose_Y)
+                # No temporary alias: unittest would collect it twice.
+                globals()[test_name] = type(test_name, (Generator, OpTest), {
+                    'shape_X': shape_X,
+                    'shape_Y': shape_Y,
+                    'transpose_X': transpose_X,
+                    'transpose_Y': transpose_Y,
+                })
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_mnist.py b/python/paddle/v2/framework/tests/test_mnist.py
index 169242b5372ebd28f102e0b450495524c712aabe..c8d54b7c94b7815fa79e5a11f4e159657dc2a6cb 100644
--- a/python/paddle/v2/framework/tests/test_mnist.py
+++ b/python/paddle/v2/framework/tests/test_mnist.py
@@ -31,7 +31,7 @@ uniq_id = atomic_id().next
 
 
 def data_layer(name, dims):
-    var = scope.new_var(name)
+    var = scope.var(name)
     tensor = var.get_tensor()
     tensor.set_dims(dims)  # 1 is batch size holder.
     return name
@@ -67,7 +67,7 @@ def sgd_optimizer(net, param_name, learning_rate=0.005):
 
 # should use operator and add these to the init_network
 def init_param(net, param_name, dims):
-    scope.new_var(param_name)
+    scope.var(param_name)
     op = Operator(
         "uniform_random", Out=param_name, dims=dims, min=-0.5, max=0.5, seed=10)
     op.infer_shape(scope)
@@ -104,7 +104,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
     sgd_optimizer(net=optimize_net, param_name=w_name, learning_rate=0.01)
 
     pre_activation = name + ".mul.out"
-    scope.new_var(pre_activation)
+    scope.var(pre_activation)
     mul_op = Operator("mul", X=input, Y=w_name, Out=pre_activation)
     net.append_op(mul_op)
 
@@ -115,7 +115,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
         sgd_optimizer(
             net=optimize_net, param_name=bias_name, learning_rate=0.001)
         bias_out = name + ".rowwise_add.out"
-        scope.new_var(bias_out)
+        scope.var(bias_out)
         rowwise_append_op = Operator(
             "rowwise_add", X=pre_activation, b=bias_name, Out=bias_out)
         net.append_op(rowwise_append_op)
@@ -123,7 +123,7 @@ def fc_layer(net, input, size, act="softmax", bias=True, param=None, name=None):
 
     activation_op = Operator(act, X=pre_activation, Y=name)
     net.append_op(activation_op)
-    scope.new_var(name)
+    scope.var(name)
     net.infer_shape(scope)
     return name
 
@@ -133,7 +133,7 @@ def cross_entropy_layer(net, input, label):
     cross_entropy_op = Operator(
         "cross_entropy", X=input, Label=label, Y=cost_name)
     net.append_op(cross_entropy_op)
-    scope.new_var(cost_name)
+    scope.var(cost_name)
     net.infer_shape(scope)
     return cost_name
 
@@ -141,10 +141,10 @@ def cross_entropy_layer(net, input, label):
 def create_backward_net(forward_net):
     net = core.Operator.backward(forward_net, set())
     for input in net.inputs()["all"]:
-        var = scope.new_var(input)
+        var = scope.var(input)
         var.get_tensor()
     for output in net.outputs()["all"]:
-        var = scope.new_var(output)
+        var = scope.var(output)
         var.get_tensor()
     return net
 
diff --git a/python/paddle/v2/framework/tests/test_momentum_op.py b/python/paddle/v2/framework/tests/test_momentum_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..654d31975aab4578055e7e70ade202bd2c3d93cb
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_momentum_op.py
@@ -0,0 +1,76 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestMomentumOp1(OpTest):
+    def setUp(self):
+        self.op_type = "momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        use_nesterov = False
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {'mu': mu}
+
+        velocity_out = mu * velocity + grad
+        if use_nesterov:
+            param_out = param - grad * learning_rate + \
+                        velocity_out * mu * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestMomentumOp2(OpTest):
+    '''Test Momentum with the useNesterov attribute explicitly set to True
+    '''
+
+    def setUp(self):
+        self.op_type = "momentum"
+
+        param = np.random.random((123, 321)).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        velocity = np.zeros((123, 321)).astype("float32")
+        learning_rate = np.array([0.001]).astype("float32")
+        mu = 0.0001
+        use_nesterov = True
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'Velocity': velocity,
+            'LearningRate': learning_rate
+        }
+
+        self.attrs = {'mu': mu, 'useNesterov': use_nesterov}
+        # NOTE(review): the Nesterov branch adds the mu term - confirm sign.
+        velocity_out = mu * velocity + grad
+        if use_nesterov:
+            param_out = param - grad * learning_rate + \
+                        velocity_out * mu * learning_rate
+        else:
+            param_out = param - learning_rate * velocity_out
+
+        self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py
index b3d95a56b88e510734da54f36ff21ccd7e1baabb..57d6d7e7e095cab2c3afb60d229fc09da98aed8b 100644
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -35,10 +35,10 @@ class TestMulOp2(OpTest):
             'Y': np.random.random((4, 30, 8, 2, 9)).astype("float32")
         }
         self.attrs = {'x_num_col_dims': 2, 'y_num_col_dims': 2}
-        self.outputs = {
-            'Out': np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
-                          self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
-        }
+        result = np.dot(self.inputs['X'].reshape(15 * 4, 12 * 10),
+                        self.inputs['Y'].reshape(4 * 30, 8 * 2 * 9))
+        result = result.reshape(15, 4, 8, 2, 9)
+        self.outputs = {'Out': result}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
index 50cfb855f2b01d8fd32342855d46716da7e07856..8503257feb8e1a5802f3f889f72c559a2aaa583a 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -15,7 +15,7 @@ def fc(X, W, Y):
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
-        op1 = Operator("add", X="X", Y="Y", Out="Out")
+        op1 = Operator("sum", X=["X", "Y"], Out="Out")
         net.append_op(op1)
 
         net2 = core.Net.create()
@@ -26,7 +26,7 @@ class TestNet(unittest.TestCase):
 
         expected = '''
 Op(plain_net), inputs:{all[W, X, Y]}, outputs:{all[Out, fc.out, pre_activation]}.
-    Op(add), inputs:{X[X], Y[Y]}, outputs:{Out[Out]}.
+    Op(sum), inputs:{X[X, Y]}, outputs:{Out[Out]}.
     Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
         Op(plain_net), inputs:{all[W, X]}, outputs:{all[fc.out, pre_activation]}.
             Op(mul), inputs:{X[X], Y[W]}, outputs:{Out[pre_activation]}.
diff --git a/python/paddle/v2/framework/tests/test_op_support_gpu.py b/python/paddle/v2/framework/tests/test_op_support_gpu.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd36c666c440a5c378dfceac4502cd8277417412
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_op_support_gpu.py
@@ -0,0 +1,11 @@
+import unittest
+import paddle.v2.framework.core as core
+
+
+class TestOpSupportGPU(unittest.TestCase):
+    def test_case(self):
+        self.assertEqual(core.is_compile_gpu(), core.op_support_gpu("sum"))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_operator.py b/python/paddle/v2/framework/tests/test_operator.py
index 040556322d79cbb594eb9af585a5b9920d7ab625..98f6b2f5ee639120557cb85b3ada6d2931f7d0d2 100644
--- a/python/paddle/v2/framework/tests/test_operator.py
+++ b/python/paddle/v2/framework/tests/test_operator.py
@@ -193,10 +193,10 @@ class TestOpDescCreationMethod(unittest.TestCase):
 
 class TestOpCreations(unittest.TestCase):
     def test_all(self):
-        add_op = op.Operator("add", X="a", Y="b", Out="z")
+        add_op = op.Operator("sum", X=["a", "b"], Out="z")
         self.assertIsNotNone(add_op)
         # Invoke C++ DebugString()
-        self.assertEqual('Op(add), inputs:{X[a], Y[b]}, outputs:{Out[z]}.',
+        self.assertEqual('Op(sum), inputs:{X[a, b]}, outputs:{Out[z]}.',
                          str(add_op))
 
 
diff --git a/python/paddle/v2/framework/tests/test_operator_desc.py b/python/paddle/v2/framework/tests/test_operator_desc.py
new file mode 100644
index 0000000000000000000000000000000000000000..af4e980b8ed6db6cb9b76de49d8dc0860f07ec80
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_operator_desc.py
@@ -0,0 +1,78 @@
+import unittest
+from paddle.v2.framework.framework import Variable, g_program
+import paddle.v2.framework.core as core
+
+
+class TestOperator(unittest.TestCase):
+    def test_error_type(self):  # append_op must reject bad `type` values
+        block = g_program.create_block()
+        try:
+            block.append_op()
+            self.fail("append_op() without a type should raise ValueError")
+        except ValueError as v_err:
+            self.assertEqual(
+                v_err.message,
+                "`type` to initilized an Operator can not be None.")
+        try:
+            block.append_op(type="no_such_op")
+            self.fail("append_op() with an unknown type should raise")
+        except ValueError as a_err:
+            self.assertEqual(a_err.message,
+                             "Operator \"no_such_op\" has not been registered.")
+
+    def test_op_desc_creation(self):
+        block = g_program.current_block()
+        mul_x = block.create_var(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mul_op = block.append_op(
+            type="mul",
+            inputs={"X": [mul_x],
+                    "Y": mul_y},
+            outputs={"Out": [mul_out]},
+            attrs={"x_num_col_dims": 1})
+
+        self.assertNotEqual(str(mul_op), "")
+        self.assertEqual(mul_op.type, "mul")
+        self.assertEqual(mul_op.input_names, ["X", "Y"])
+        self.assertEqual(mul_op.input("X"), ["mul.x"])
+        self.assertEqual(mul_op.input("Y"), ["mul.y"])
+        self.assertEqual(mul_op.output_names, ["Out"])
+        self.assertEqual(mul_op.output("Out"), ["mul.out"])
+        self.assertEqual(
+            set(mul_op.attr_names), set(["x_num_col_dims", "y_num_col_dims"]))
+        self.assertEqual(mul_op.has_attr("x_num_col_dims"), True)
+        self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT)
+        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
+        self.assertEqual(mul_op.has_attr("y_num_col_dims"), True)
+        self.assertEqual(mul_op.attr_type("y_num_col_dims"), core.AttrType.INT)
+        self.assertEqual(mul_op.attr("y_num_col_dims"), 1)
+        self.assertEqual(mul_out.op, mul_op)
+
+    def test_mult_input(self):
+        block = g_program.current_block()
+        sum_x1 = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.x1")
+        sum_x2 = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.x2")
+        sum_x3 = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.x3")
+        sum_out = block.create_var(
+            dtype="int", shape=[3, 4], lod_level=0, name="sum.out")
+        sum_op = block.append_op(
+            type="sum",
+            inputs={"X": [sum_x1, sum_x2, sum_x3]},
+            outputs={"Out": sum_out})
+        self.assertEqual(sum_op.type, "sum")
+        self.assertEqual(sum_op.input_names, ["X"])
+        self.assertEqual(sum_op.input("X"), ["sum.x1", "sum.x2", "sum.x3"])
+        self.assertEqual(sum_op.output_names, ["Out"])
+        self.assertEqual(sum_op.output("Out"), ["sum.out"])
+        self.assertEqual(sum_out.op, sum_op)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_optimizer.py b/python/paddle/v2/framework/tests/test_optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb5d49bcbafe46ddb5ce96c8565417cf9bedc668
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_optimizer.py
@@ -0,0 +1,164 @@
+import unittest
+
+import paddle.v2.framework.framework as framework
+import paddle.v2.framework.optimizer as optimizer
+from paddle.v2.framework.backward import append_backward_ops
+
+
+class TestOptimizer(unittest.TestCase):
+    def test_sgd_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.01)
+        opts = sgd_optimizer.minimize(mul_out)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "sgd")
+
+
+class TestMomentumOptimizer(unittest.TestCase):
+    class MockMomentum(optimizer.MomentumOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_velocity_str(self):
+            return self._velocity_acc_str
+
+    def test_momentum_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        momentum_optimizer = self.MockMomentum(learning_rate=0.01, momentum=0.2)
+        params_grads = append_backward_ops(mul_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
+        opts = momentum_optimizer.create_optimization_pass(params_grads,
+                                                           mul_out)
+        self.assertEqual(len(opts), 1)
+        sgd_op = opts[0]
+        self.assertEqual(sgd_op.type, "momentum")
+
+        # Check accumulators
+        accumulators = momentum_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(momentum_optimizer.get_velocity_str() in accumulators)
+        velocity_acc = accumulators[momentum_optimizer.get_velocity_str()]
+        self.assertEqual(len(velocity_acc), 1)
+        self.assertTrue(mul_x.name in velocity_acc)
+
+
+class TestAdagradOptimizer(unittest.TestCase):
+    class MockAdagrad(optimizer.AdagradOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment_str(self):
+            return self._moment_acc_str
+
+    def test_adagrad_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        adagrad_optimizer = self.MockAdagrad(learning_rate=0.01, epsilon=1.0e-6)
+        params_grads = append_backward_ops(mul_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
+        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out)
+        self.assertEqual(len(opts), 1)
+        adagrad_op = opts[0]
+        self.assertEqual(adagrad_op.type, "adagrad")
+
+        # check accumulators
+        accumulators = adagrad_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 1)
+        self.assertTrue(adagrad_optimizer.get_moment_str() in accumulators)
+        moment_acc = accumulators[adagrad_optimizer.get_moment_str()]
+        self.assertEqual(len(moment_acc), 1)
+        self.assertTrue(mul_x.name in moment_acc)
+
+
+class TestAdamOptimizer(unittest.TestCase):
+    class MockAdam(optimizer.AdamOptimizer):
+        def get_accumulators(self):
+            return self._accumulators
+
+        def get_moment1_str(self):
+            return self._moment1_acc_str
+
+        def get_moment2_str(self):
+            return self._moment2_acc_str
+
+    def test_adam_optimizer(self):
+        program = framework.Program()
+        block = program.global_block()
+        mul_x = block.create_parameter(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        block.append_op(
+            type="mul",
+            inputs={"X": mul_x,
+                    "Y": mul_y},
+            outputs={"Out": mul_out},
+            attrs={"x_num_col_dims": 1})
+        adam_optimizer = self.MockAdam(
+            learning_rate=0.01, beta1=0.9, beta2=0.999)
+        params_grads = append_backward_ops(mul_out)
+        self.assertEqual(len(params_grads), 1)
+        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
+        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out)
+        self.assertEqual(len(opts), 3)
+        adam_op = opts[0]
+        self.assertEqual(adam_op.type, "adam")
+
+        # Check accumulators
+        accumulators = adam_optimizer.get_accumulators()
+        self.assertEqual(len(accumulators), 2)
+        self.assertTrue(adam_optimizer.get_moment1_str() in accumulators)
+        self.assertTrue(adam_optimizer.get_moment2_str() in accumulators)
+        moment1_acc = accumulators[adam_optimizer.get_moment1_str()]
+        moment2_acc = accumulators[adam_optimizer.get_moment2_str()]
+        self.assertEqual(len(moment1_acc), 1)
+        self.assertEqual(len(moment2_acc), 1)
+        self.assertTrue(mul_x.name in moment1_acc)
+        self.assertTrue(mul_x.name in moment2_acc)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pad_op.py b/python/paddle/v2/framework/tests/test_pad_op.py
index 9052e63b5683801da7c73be4de23013c949add98..55f1774e5755c846f60a2f1df3e705444a81192b 100644
--- a/python/paddle/v2/framework/tests/test_pad_op.py
+++ b/python/paddle/v2/framework/tests/test_pad_op.py
@@ -27,7 +27,7 @@ class TestPadOp(OpTest):
     def initTestCase(self):
         self.shape = (16, 16)
         self.paddings = [(0, 1), (2, 3)]
-        self.pad_value = 0
+        self.pad_value = 0.0
 
 
 class TestCase1(TestPadOp):
@@ -41,7 +41,7 @@ class TestCase2(TestPadOp):
     def initTestCase(self):
         self.shape = (2, 2, 2)
         self.paddings = [(0, 0), (0, 0), (1, 2)]
-        self.pad_value = 1
+        self.pad_value = 1.0
 
 
 class TestCase3(TestPadOp):
diff --git a/python/paddle/v2/framework/tests/test_parameter.py b/python/paddle/v2/framework/tests/test_parameter.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ac0cdd99f1b7c15d64ae9d2c465d5a9d563bd80
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_parameter.py
@@ -0,0 +1,27 @@
+import unittest
+from paddle.v2.framework.framework import g_program
+import paddle.v2.framework.core as core
+
+
+class TestParameter(unittest.TestCase):  # create_parameter() smoke test
+    def test_param(self):
+        b = g_program.create_block()  # block created on g_program
+        param = b.create_parameter(
+            name='fc.w',
+            shape=[784, 100],
+            dtype='float32',
+            initialize_attr={
+                'type': 'uniform_random',
+                'seed': 13,
+                'min': -5.0,
+                'max': 5.0
+            })
+        self.assertIsNotNone(param)
+        self.assertEqual('fc.w', param.name)
+        self.assertEqual((784, 100), param.shape)  # passed as list, tuple here
+        self.assertEqual(core.DataType.FP32, param.data_type)
+        self.assertEqual(0, param.block.idx)  # presumably global block; confirm
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool2d_op.py b/python/paddle/v2/framework/tests/test_pool2d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fcd8941d4f8a8638db0009b368734c234e702f6
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool2d_op.py
@@ -0,0 +1,144 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    """Reference max pooling over the last two axes of a 4-D array."""
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]  # global pooling: the window covers the whole map
+    # Floor division keeps the output sizes integral under true division too.
+    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in range(H_out):
+        r_start = np.max((i * strides[0] - paddings[0], 0))
+        r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+        for j in range(W_out):
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+    return out
+
+
+def avg_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    """Reference avg pooling over the last two axes; padding is not averaged."""
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]  # global pooling: the window covers the whole map
+    # Floor division keeps the output sizes integral under true division too.
+    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    for i in range(H_out):
+        r_start = np.max((i * strides[0] - paddings[0], 0))
+        r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+        for j in range(W_out):
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / (
+                (r_end - r_start) * (c_end - c_start))
+    return out
+
+
+class TestPool2d_Op(OpTest):  # base case; subclasses override initTestCase()
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool2D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+        # Attributes handed to the operator named by self.op_type.
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
+        }
+        # The numpy reference result is the expected operator output.
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "max":  # gradient is not checked for max pooling
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]  # the reference swaps in [H, W] when global_pool
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase1(TestPool2d_Op):  # avg pooling, 3x3 window, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase2(TestPool2d_Op):  # avg pooling, 3x3 window, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "avg"
+        self.pool2D_forward_naive = avg_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+class TestCase3(TestPool2d_Op):  # global max pooling on a 5x5 input
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool2d"
+        self.pool_type = "max"  # "max": the base test_check_grad does nothing
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]  # the reference swaps in [H, W] when global_pool
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase4(TestPool2d_Op):  # max pooling, 3x3 window, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"  # "max": the base test_check_grad does nothing
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [0, 0]
+
+
+class TestCase5(TestPool2d_Op):  # max pooling, 3x3 window, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool2d"
+        self.pool_type = "max"  # "max": the base test_check_grad does nothing
+        self.pool2D_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool3d_op.py b/python/paddle/v2/framework/tests/test_pool3d_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4e938041fa0ae9d0760023afdbf2f3052b244ea
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool3d_op.py
@@ -0,0 +1,152 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0, 0],
+                             global_pool=0):
+    """Reference max pooling over the last three axes of a 5-D array."""
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]  # global pooling: the window covers the whole volume
+    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in range(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in range(H_out):  # height: index 1 of ksize/strides/paddings
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in range(W_out):  # width: index 2
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+    return out
+
+
+def avg_pool3D_forward_naive(x, ksize, strides, paddings=[0, 0, 0],
+                             global_pool=0):
+    """Reference avg pooling over the last three axes; padding not averaged."""
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]  # global pooling: the window covers the whole volume
+    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    for k in range(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in range(H_out):  # height: index 1 of ksize/strides/paddings
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in range(W_out):  # width: index 2
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / (
+                    (d_end - d_start) * (h_end - h_start) * (w_end - w_start))
+    return out
+
+
+class TestPool3d_Op(OpTest):  # base case; subclasses override initTestCase()
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output = self.pool3D_forward_naive(input, self.ksize, self.strides,
+                                           self.paddings, self.global_pool)
+        self.inputs = {'X': input}
+        # Attributes handed to the operator named by self.op_type.
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'pooling_type': self.pool_type,
+            'global_pooling': self.global_pool,
+        }
+        # The numpy reference result is the expected operator output.
+        self.outputs = {'Out': output}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        if self.pool_type != "max":  # gradient is not checked for max pooling
+            self.check_grad(set(['X']), 'Out', max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]  # the reference swaps in [D, H, W] when global
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase1(TestPool3d_Op):  # avg pooling, 3x3x3 window, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase2(TestPool3d_Op):  # avg pooling, 3x3x3 window, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "avg"
+        self.pool3D_forward_naive = avg_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase3(TestPool3d_Op):  # global max pooling on a 5x5x5 input
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "pool3d"
+        self.pool_type = "max"  # "max": the base test_check_grad does nothing
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]  # the reference swaps in [D, H, W] when global
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase4(TestPool3d_Op):  # max pooling, 3x3x3 window, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"  # "max": the base test_check_grad does nothing
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase5(TestPool3d_Op):  # max pooling, 3x3x3 window, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "pool3d"
+        self.pool_type = "max"  # "max": the base test_check_grad does nothing
+        self.pool3D_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_pool_max_op.py b/python/paddle/v2/framework/tests/test_pool_max_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b78f9bba05c5af38806f6cabb0e53379f8aa0526
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_pool_max_op.py
@@ -0,0 +1,212 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def max_pool3D_forward_naive(x,
+                             ksize,
+                             strides,
+                             paddings=[0, 0, 0],
+                             global_pool=0):
+    """Reference 3-D max pooling; returns (out, mask) for a 5-D input."""
+    N, C, D, H, W = x.shape
+    if global_pool == 1:
+        ksize = [D, H, W]
+    D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1
+    out = np.zeros((N, C, D_out, H_out, W_out))
+    mask = np.zeros((N, C, D_out, H_out, W_out))
+    for k in range(D_out):
+        d_start = np.max((k * strides[0] - paddings[0], 0))
+        d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D))
+        for i in range(H_out):
+            # Height uses index 1 and width index 2 of ksize/strides/paddings.
+            h_start = np.max((i * strides[1] - paddings[1], 0))
+            h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H))
+            for j in range(W_out):
+                w_start = np.max((j * strides[2] - paddings[2], 0))
+                w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W))
+                x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end]
+                out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4))
+                # mask holds the flat D*H*W offset of the first maximum.
+                for n in range(N):
+                    for c in range(C):
+                        arr = x_masked[n, c, :, :, :]
+                        index = np.where(arr == np.max(arr))
+                        sub_deep = index[0][0]
+                        sub_row = index[1][0]
+                        sub_col = index[2][0]
+                        index = ((d_start + sub_deep) * H +
+                                 (h_start + sub_row)) * W + w_start + sub_col
+                        mask[n, c, k, i, j] = index
+
+    return out, mask
+
+
+def max_pool2D_forward_naive(x, ksize, strides, paddings=[0, 0], global_pool=0):
+    """Reference 2-D max pooling; returns (out, mask) for a 4-D input."""
+    N, C, H, W = x.shape
+    if global_pool == 1:
+        ksize = [H, W]
+    # Floor division keeps the output sizes integral under true division too.
+    H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1
+    W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1
+    out = np.zeros((N, C, H_out, W_out))
+    mask = np.zeros((N, C, H_out, W_out))
+    for i in range(H_out):
+        r_start = np.max((i * strides[0] - paddings[0], 0))
+        r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H))
+        for j in range(W_out):
+            c_start = np.max((j * strides[1] - paddings[1], 0))
+            c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W))
+            x_masked = x[:, :, r_start:r_end, c_start:c_end]
+            out[:, :, i, j] = np.max(x_masked, axis=(2, 3))
+            # mask holds the flat H*W offset of the first maximum.
+            for n in range(N):
+                for c in range(C):
+                    arr = x_masked[n, c, :, :]
+                    index = np.where(arr == np.max(arr))
+                    sub_row = index[0][0]
+                    sub_col = index[1][0]
+                    index = (r_start + sub_row) * W + c_start + sub_col
+                    mask[n, c, i, j] = index
+
+    return out, mask
+
+
+class TestMaxPoolWithIndex_Op(OpTest):  # base case for the *_with_index ops
+    def setUp(self):
+        self.initTestCase()
+        input = np.random.random(self.shape).astype("float32")
+        output, mask = self.pool_forward_naive(input, self.ksize, self.strides,
+                                               self.paddings, self.global_pool)
+        # Attributes handed to the operator named by self.op_type.
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'global_pooling': self.global_pool,
+        }
+        # Both the pooled values and the argmax offsets are expected outputs.
+        self.inputs = {'X': input}
+        self.outputs = {'Out': output, "Mask": mask}
+
+    def test_check_output(self):
+        self.check_output()
+    # NOTE(review): the gradient check below is disabled; no reason is given.
+    # def test_check_grad(self):
+    #     self.check_grad(set(['X']), ['Out'], max_relative_error=0.07)
+
+    def initTestCase(self):
+        self.global_pool = True
+        self.index = "max_pool3d_with_index"
+        self.op_type = "%s" % self.index  # same string as self.index
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]  # the reference swaps in [D, H, W] when global
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase1(TestMaxPoolWithIndex_Op):  # 3-D global pooling, padding 1
+    def initTestCase(self):  # NOTE(review): same settings as the base class
+        self.global_pool = True
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase2(TestMaxPoolWithIndex_Op):  # 3-D, stride 1, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase3(TestMaxPoolWithIndex_Op):  # 3-D, stride 2, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 7, 7, 7]
+        self.ksize = [3, 3, 3]
+        self.strides = [2, 2, 2]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase4(TestMaxPoolWithIndex_Op):  # 3-D global pooling, padding 1
+    def initTestCase(self):  # NOTE(review): identical to TestCase1
+        self.global_pool = True
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [1, 1, 1]
+        self.paddings = [1, 1, 1]
+
+
+class TestCase5(TestMaxPoolWithIndex_Op):  # 3-D global pooling, stride 2
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "max_pool3d_with_index"
+        self.pool_forward_naive = max_pool3D_forward_naive
+        self.shape = [2, 3, 5, 5, 5]
+        self.ksize = [3, 3, 3]
+        self.strides = [2, 2, 2]
+        self.paddings = [0, 0, 0]
+
+
+class TestCase6(TestMaxPoolWithIndex_Op):  # 2-D, stride 1, padding 1
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "max_pool2d_with_index"
+        self.pool_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+class TestCase7(TestMaxPoolWithIndex_Op):  # 2-D, stride 2, no padding
+    def initTestCase(self):
+        self.global_pool = False
+        self.op_type = "max_pool2d_with_index"
+        self.pool_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 7, 7]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+class TestCase8(TestMaxPoolWithIndex_Op):  # 2-D global pooling, padding 1
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "max_pool2d_with_index"
+        self.pool_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [1, 1]
+        self.paddings = [1, 1]
+
+
+class TestCase9(TestMaxPoolWithIndex_Op):  # 2-D global pooling, stride 2
+    def initTestCase(self):
+        self.global_pool = True
+        self.op_type = "max_pool2d_with_index"
+        self.pool_forward_naive = max_pool2D_forward_naive
+        self.shape = [2, 3, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_prelu_op.py b/python/paddle/v2/framework/tests/test_prelu_op.py
index 676fd9f7c555fd5c8544e760345ab954cd137dc5..7be932ac8f6b82283fecd32ac4b3b7bb9aff0338 100644
--- a/python/paddle/v2/framework/tests/test_prelu_op.py
+++ b/python/paddle/v2/framework/tests/test_prelu_op.py
@@ -17,7 +17,7 @@ class PReluTest(OpTest):
 
         x_np_sign = np.sign(x_np)
         x_np = x_np_sign * np.maximum(x_np, .005)
-        alpha_np = np.array([.1])
+        alpha_np = np.array([.1], dtype="float32")
         self.inputs = {'X': x_np, 'Alpha': alpha_np}
         out_np = np.maximum(self.inputs['X'], 0.)
         out_np = out_np + np.minimum(self.inputs['X'],
diff --git a/python/paddle/v2/framework/tests/test_program.py b/python/paddle/v2/framework/tests/test_program.py
new file mode 100644
index 0000000000000000000000000000000000000000..c55dd8de7282d4c941777054ad9d6437c87f0bc6
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_program.py
@@ -0,0 +1,103 @@
+import unittest
+
+import paddle.v2.framework.core as core
+from paddle.v2.framework.framework import Program
+from paddle.v2.framework.framework import g_program
+
+
+class TestProgram(unittest.TestCase):  # block stack, clone, append_backward
+    def test_program(self):
+        b = g_program.current_block()
+        self.assertEqual(-1, b.parent_idx)  # -1 expected for the first block
+        self.assertEqual(0, b.idx)
+        # Each create_block() below nests under the current block.
+        b = g_program.create_block()
+        self.assertEqual(1, b.idx)
+        self.assertEqual(0, b.parent_idx)
+
+        b = g_program.create_block()
+        self.assertEqual(2, b.idx)
+        self.assertEqual(1, b.parent_idx)
+
+        g_program.rollback()  # current block is block 1 again (checked below)
+
+        b = g_program.current_block()
+        self.assertEqual(1, b.idx)
+        self.assertEqual(0, b.parent_idx)
+
+        b = g_program.create_block()  # idx keeps counting up after rollback
+        self.assertEqual(3, b.idx)
+        self.assertEqual(1, b.parent_idx)
+
+        g_program.rollback()
+        b = g_program.current_block()
+        self.assertEqual(1, b.idx)
+        self.assertEqual(0, b.parent_idx)
+
+    def test_program_clone(self):
+        prog = Program()
+        # Build a one-op program (Out = mul(X, Y)) to clone.
+        x = prog.global_block().create_var(
+            name='X', shape=[1000, 784], dtype='float32')
+
+        y = prog.global_block().create_var(
+            name='Y', shape=[784, 100], dtype='float32')
+        out = prog.global_block().create_var(name='Out', dtype='float32')
+        prog.global_block().append_op(
+            type="mul", inputs={'X': [x],
+                                'Y': [y]}, outputs={'Out': [out]})
+
+        # FIXME(yuyang18): We manually compare the printed output, since the
+        # order of variables could change.
+        print prog  # Python 2 print statement; nothing is asserted here
+        print prog.clone()
+
+    def test_append_backward(self):
+        prog = Program()
+        block = prog.global_block()
+        # Forward graph: add.out = elementwise_add(mul(mul.x, mul.y), add.y)
+        mul_x = block.create_var(
+            dtype="float32", shape=[5, 10], lod_level=0, name="mul.x")
+        mul_y = block.create_var(
+            dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+        mul_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+        mul_op = block.append_op(  # NOTE(review): mul_op is never read
+            type="mul",
+            inputs={"X": [mul_x],
+                    "Y": mul_y},
+            outputs={"Out": [mul_out]},
+            attrs={"x_num_col_dims": 1})
+
+        add_y = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="add.y")
+        add_out = block.create_var(
+            dtype="float32", shape=[5, 8], lod_level=0, name="add.out")
+        add_op = block.append_op(  # NOTE(review): add_op is never read
+            type="elementwise_add",
+            inputs={"X": mul_out,
+                    "Y": add_y},
+            outputs={"Out": add_out},
+            attrs={"x_num_col_dims": 1})  # NOTE(review): mul attr on add op?
+
+        param_to_grad = prog.append_backward(add_out, set())
+
+        def grad_name(name):
+            return name + "@GRAD"
+
+        for var_name in ("mul.x", "mul.y", "mul.out", "add.y", "add.out"):
+            self.assertEqual(param_to_grad[var_name][0], grad_name(var_name))
+            self.assertEqual(param_to_grad[var_name][1], 0)  # block idx? confirm
+        # Forward ops first, then fill_constant and the grad ops in reverse.
+        expect_ops = [
+            "mul", "elementwise_add", "fill_constant", "elementwise_add_grad",
+            "mul_grad"
+        ]
+        actual_ops = []
+        for op in block.ops:
+            actual_ops.append(op.type)
+        self.assertEqual(actual_ops, expect_ops)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_protobuf_descs.py b/python/paddle/v2/framework/tests/test_protobuf_descs.py
index 2b7ba6688a65c466d5bc656178f2991da8dfe016..2fd3d5d165ada5026510e0dc3e2c55b6e0596ff3 100644
--- a/python/paddle/v2/framework/tests/test_protobuf_descs.py
+++ b/python/paddle/v2/framework/tests/test_protobuf_descs.py
@@ -4,7 +4,7 @@ import paddle.v2.framework.core as core
 
 class TestOpDesc(unittest.TestCase):
     def test_op_desc(self):
-        prog = core.ProgramDesc.__create_program_desc__()
+        prog = core.ProgramDesc()
         self.assertIsNotNone(prog)
         block = prog.block(0)
         self.assertIsNotNone(block)
@@ -53,21 +53,27 @@ class TestOpDesc(unittest.TestCase):
         self.assertEqual(8, len(op.attr_names()))
 
         op.set_block_attr("block_attr", prog.block(0))
-        self.assertEqual(0, op.get_block_attr("block_attr"))
+        self.assertEqual(0, op.block_attr("block_attr"))
+
+        mul_op = block.append_op()
+        mul_op.set_type("mul")
+        mul_op.check_attrs()
+        self.assertEqual(mul_op.attr("x_num_col_dims"), 1)
+        self.assertEqual(mul_op.attr("y_num_col_dims"), 1)
 
 
 class TestProgramDesc(unittest.TestCase):
     def test_instance(self):
-        program_desc = core.ProgramDesc.__create_program_desc__()
+        program_desc = core.ProgramDesc()
         self.assertIsNotNone(program_desc)
         del program_desc
-        program_desc = core.ProgramDesc.instance()
+        program_desc = core.ProgramDesc()
         self.assertIsNotNone(program_desc)
         self.assertIsNotNone(program_desc.block(0))
         del program_desc
 
     def test_append_block(self):
-        prog_desc = core.ProgramDesc.__create_program_desc__()
+        prog_desc = core.ProgramDesc()
         self.assertIsNotNone(prog_desc)
         block_root = prog_desc.block(0)
         self.assertIsNotNone(block_root)
@@ -85,45 +91,51 @@ class TestProgramDesc(unittest.TestCase):
 
 class TestVarDesc(unittest.TestCase):
     def test_shape(self):
-        program_desc = core.ProgramDesc.__create_program_desc__()
+        program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.new_var('my_var')
+        var = block.var('my_var')
+        var.set_type(core.VarDesc.VarType.SELECTED_ROWS)
         src_shape = [3, 2, 10, 8]
         var.set_shape(src_shape)
         res_shape = var.shape()
         self.assertEqual(src_shape, res_shape)
+        self.assertEqual(core.VarDesc.VarType.SELECTED_ROWS, var.type())
 
     def test_data_type(self):
-        program_desc = core.ProgramDesc.__create_program_desc__()
+        program_desc = core.ProgramDesc()
         block = program_desc.block(0)
-        var = block.new_var('my_var')
+        var = block.var('my_var')
+        var.set_type(core.VarDesc.VarType.LOD_TENSOR)
         var.set_data_type(core.DataType.INT32)
         self.assertEqual(core.DataType.INT32, var.data_type())
+        self.assertEqual(core.VarDesc.VarType.LOD_TENSOR, var.type())
 
 
 class TestBlockDesc(unittest.TestCase):
     def test_add_var(self):
-        prog = core.ProgramDesc.__create_program_desc__()
+        prog = core.ProgramDesc()
         self.assertIsNotNone(prog)
         block = prog.block(0)
         self.assertIsNotNone(block)
-        var1 = block.new_var("var1")
-        var2 = block.new_var("var2")
-        var3 = block.new_var("var3")
+        var1 = block.var("var1")
+        var2 = block.var("var2")
+        var3 = block.var("var3")
         all_vars = block.all_vars()
-        self.assertEqual(set(all_vars), set([var1, var2, var3]))
-        var2_re = block.var("var2")
+        self.assertEqual(set(all_vars), {var1, var2, var3})
+        var2_re = block.find_var("var2")
         self.assertEqual(var2_re, var2)
 
     def test_add_op(self):
-        prog = core.ProgramDesc.__create_program_desc__()
+        prog = core.ProgramDesc()
         self.assertIsNotNone(prog)
         block = prog.block(0)
         self.assertIsNotNone(block)
         op1 = block.append_op()
         op2 = block.append_op()
         op0 = block.prepend_op()
-        all_ops = block.all_ops()
+        all_ops = []
+        for idx in xrange(0, block.op_size()):
+            all_ops.append(block.op(idx))
         self.assertEqual(all_ops, [op0, op1, op2])
 
 
diff --git a/python/paddle/v2/framework/tests/test_proximal_gd_op.py b/python/paddle/v2/framework/tests/test_proximal_gd_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ca79ce6b3b710244e4f65db70b305231a9f3fcf
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_proximal_gd_op.py
@@ -0,0 +1,33 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestProximalGDOp(OpTest):
+    def setUp(self):
+        # Expected ParamOut is computed in numpy from random Param and Grad.
+        self.op_type = "proximal_gd"
+        param = np.random.random((102, 105)).astype("float32")
+        grad = np.random.random((102, 105)).astype("float32")
+        rate = np.array([0.1]).astype("float32")
+        l1, l2 = 0.1, 0.2
+        self.inputs = {'Param': param, 'Grad': grad, 'LearningRate': rate}
+        self.attrs = {'l1': l1, 'l2': l2}
+
+        stepped = param - rate * grad
+        shrink = 1.0 + rate * l2
+        if l1 > 0.0:
+            # Soft-threshold by rate * l1, then apply the l2 shrinkage.
+            magnitude = np.abs(stepped) - rate * l1
+            magnitude[magnitude < 0] = 0
+            expected = np.sign(stepped) * (magnitude / shrink)
+        else:
+            expected = stepped / shrink
+        self.outputs = {'ParamOut': expected}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_conv.py b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b305213df424dd097bf4238aa14320a2f7da45d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_conv.py
@@ -0,0 +1,92 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.nets as nets
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()
+program = Program()
+# Network: two conv+pool stages, then a 10-way softmax fc layer.
+images = layers.data(
+    name='pixel',
+    shape=[1, 28, 28],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+label = layers.data(
+    name='label',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+conv_pool_1 = nets.simple_img_conv_pool(
+    input=images,
+    filter_size=5,
+    num_filters=20,
+    pool_size=2,
+    pool_stride=2,
+    act="relu",
+    program=program,
+    init_program=init_program)
+conv_pool_2 = nets.simple_img_conv_pool(
+    input=conv_pool_1,
+    filter_size=5,
+    num_filters=50,
+    pool_size=2,
+    pool_stride=2,
+    act="relu",
+    program=program,
+    init_program=init_program)
+
+predict = layers.fc(input=conv_pool_2,
+                    size=10,
+                    act="softmax",
+                    program=program,
+                    init_program=init_program)
+cost = layers.cross_entropy(
+    input=predict, label=label, program=program, init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program)  # no init_program passed here
+# Loss is the mean cross entropy; SGD ops are appended by minimize().
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 50
+PASS_NUM = 1
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = core.CPUPlace()
+exe = Executor(place)
+# Run init_program once before the training loop.
+exe.run(init_program, feed={}, fetch_list=[])
+
+for pass_id in range(PASS_NUM):
+    count = 0  # NOTE(review): never read
+    for data in train_reader():
+        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
+                                data)).astype("float32")
+        y_data = np.array(map(lambda x: x[1], data)).astype("int32")
+        y_data = y_data.reshape([BATCH_SIZE, 1])  # assumes a full batch
+
+        tensor_img = core.LoDTensor()
+        tensor_y = core.LoDTensor()
+        tensor_img.set(img_data, place)
+        tensor_y.set(y_data, place)
+
+        outs = exe.run(program,
+                       feed={"pixel": tensor_img,
+                             "label": tensor_y},
+                       fetch_list=[avg_cost])
+
+        loss = np.array(outs[0])
+
+        if loss < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)  # reached only when no batch produced loss < 10.0
diff --git a/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..a985d1f3d38fcaa8372a70edd519b873d47f554a
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recognize_digits_mlp.py
@@ -0,0 +1,83 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()
+program = Program()
+image = layers.data(
+    name='x',
+    shape=[784],
+    data_type='float32',
+    program=program,
+    init_program=init_program)
+
+hidden1 = layers.fc(input=image,
+                    size=128,
+                    act='relu',
+                    program=program,
+                    init_program=init_program)
+hidden2 = layers.fc(input=hidden1,
+                    size=64,
+                    act='relu',
+                    program=program,
+                    init_program=init_program)
+
+predict = layers.fc(input=hidden2,
+                    size=10,
+                    act='softmax',
+                    program=program,
+                    init_program=init_program)
+
+label = layers.data(
+    name='y',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+
+cost = layers.cross_entropy(
+    input=predict, label=label, program=program, init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 128
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=8192),
+    batch_size=BATCH_SIZE)
+
+place = core.CPUPlace()
+exe = Executor(place)
+
+exe.run(init_program, feed={}, fetch_list=[])
+
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+        y_data = np.array(map(lambda x: x[1], data)).astype("int32")
+        y_data = np.expand_dims(y_data, axis=1)
+
+        tensor_x = core.LoDTensor()
+        tensor_x.set(x_data, place)
+
+        tensor_y = core.LoDTensor()
+        tensor_y.set(y_data, place)
+
+        outs = exe.run(program,
+                       feed={'x': tensor_x,
+                             'y': tensor_y},
+                       fetch_list=[avg_cost])
+        out = np.array(outs[0])
+        if out[0] < 5.0:
+            exit(0)  # avg cost below 5.0 means the code is considered good.
+exit(1)
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
index 92161ae5dd93d34d898a2027435cc5e55611bcd0..cc4008c0d8e73a3f7d9a9be2a4aacfd120ecd522 100644
--- a/python/paddle/v2/framework/tests/test_recurrent_op.py
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -16,14 +16,17 @@ class PySimpleRNN(object):
     '''
 
     def __init__(self, input_dim=30, batch_size=50, weight_dim=15, sent_len=11):
-        self.x = np.random.normal(size=(sent_len, batch_size, input_dim))
-        self.W = np.random.normal(size=(input_dim, input_dim))
-        self.U = np.random.normal(size=(input_dim, input_dim))
-        self.h_boot = np.random.normal(size=(batch_size, input_dim))
+        self.x = np.random.normal(size=(sent_len, batch_size,
+                                        input_dim)).astype("float32")
+        self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32")
+        self.h_boot = np.random.normal(size=(batch_size,
+                                             input_dim)).astype("float32")
 
         # memories
         self.mems = [
-            np.zeros(shape=(batch_size, input_dim)) for i in range(sent_len)
+            np.zeros(shape=(batch_size, input_dim)).astype("float32")
+            for i in range(sent_len)
         ]
 
     def forward(self):
@@ -36,7 +39,7 @@ class PySimpleRNN(object):
         return [self.x[i] for i in range(self.x.shape[0])]
 
     def concat_outputs(self):
-        return np.array(self.mems)
+        return np.array(self.mems).astype("float32")
 
     def step(self, step_id, x):
         '''
@@ -47,8 +50,8 @@ class PySimpleRNN(object):
             pre_mem = self.mems[step_id - 1]
         else:
             pre_mem = self.h_boot
-        xW = np.matmul(x, self.W)
-        hU = np.matmul(pre_mem, self.U)
+        xW = np.matmul(x, self.W).astype("float32")
+        hU = np.matmul(pre_mem, self.U).astype("float32")
 
         sum = xW + hU
         self.mems[step_id] = py_sigmoid(sum)
@@ -63,7 +66,7 @@ class PySimpleRNNTest(unittest.TestCase):
 
 
 def create_tensor(scope, name, shape, np_data):
-    tensor = scope.new_var(name).get_tensor()
+    tensor = scope.var(name).get_tensor()
     tensor.set_dims(shape)
     tensor.set(np_data, core.CPUPlace())
     return tensor
@@ -102,7 +105,8 @@ class RecurrentOpTest(unittest.TestCase):
         self.create_step_net()
         ctx = core.DeviceContext.create(core.CPUPlace())
         self.rnnop.run(self.scope, ctx)
-        return np.array(self.scope.find_var("h@mem").get_tensor())
+        return np.array(self.scope.find_var("h@mem").get_tensor()).astype(
+            "float32")
 
     def create_global_variables(self):
         # create inlink
@@ -121,28 +125,28 @@ class RecurrentOpTest(unittest.TestCase):
         h_boot_np_data = self.py_rnn.h_boot
         create_tensor(self.scope, "h_boot", [self.batch_size, self.input_dim],
                       h_boot_np_data)
-        self.scope.new_var("step_scopes")
-        self.scope.new_var("h@mem")
+        self.scope.var("step_scopes")
+        self.scope.var("h@mem")
 
     def create_rnn_op(self):
         # create RNNOp
         self.rnnop = RecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
+            inputs=["x"],
+            initial_states=["h_boot"],
             step_net="stepnet",
             # outputs
-            outlinks=["h@mem"],
+            outputs=["h@mem"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@mem"])
+            ex_states=["h@pre"],
+            states=["h@mem"])
 
     def create_step_net(self):
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@mem")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
@@ -165,21 +169,21 @@ class RecurrentGradientOpTest(unittest.TestCase):
     def create_forward_op(self):
         self.forward_op = RecurrentOp(
             # inputs
-            inlinks=["x"],
-            boot_memories=["h_boot"],
+            inputs=["x"],
+            initial_states=["h_boot"],
             step_net="stepnet",
             # outputs
-            outlinks=["h"],
+            outputs=["h"],
             step_scopes="step_scopes",
             # attributes
-            pre_memories=["h@pre"],
-            memories=["h@alias"])
+            ex_states=["h@pre"],
+            states=["h@alias"])
 
         # create a stepnet for RNN
         stepnet = core.Net.create()
         x_fc_op = Operator("mul", X="x@alias", Y="W", Out="Wx")
         h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh")
-        sum_op = Operator("add", X="Wx", Y="Uh", Out="sum")
+        sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum")
         sig_op = Operator("sigmoid", X="sum", Y="h@alias")
 
         for op in [x_fc_op, h_fc_op, sum_op, sig_op]:
@@ -197,7 +201,4 @@ class RecurrentGradientOpTest(unittest.TestCase):
 
 
 if __name__ == '__main__':
-    exit(
-        0
-    )  # FIXME(yuyang18): InferShape has been removed, this unittest may error
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rmsprop_op.py b/python/paddle/v2/framework/tests/test_rmsprop_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..237bcfccceee89f62fc05e4c6c972a76d1875367
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_rmsprop_op.py
@@ -0,0 +1,89 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestRmspropOp1(OpTest):
+    ''' Test RMSProp with explicit inputs
+    '''
+
+    def setUp(self):
+        self.op_type = "rmsprop"
+
+        param = np.random.random((123, 321)).astype("float32")
+        mean_square = np.random.random((123, 321)).astype("float32")
+        learning_rate = np.array([0.01]).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+
+        epsilon = 1e-6
+        decay = 0.9
+        momentum = 0.0
+
+        self.inputs = {
+            'Param': param,
+            'MeanSquare': mean_square,
+            'LearningRate': learning_rate,
+            'Grad': grad,
+            'Moment': moment,
+        }
+
+        self.attrs = {'epsilon': epsilon, 'decay': decay, 'momentum': momentum}
+
+        ms_out = decay * mean_square + (1 - decay) * grad * grad
+        moment_out = momentum * moment + \
+            learning_rate * grad / np.sqrt(ms_out + epsilon)
+        param_out = param - moment_out
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'MeanSquareOut': ms_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestRmspropOp2(OpTest):
+    '''Test RMSProp with default values for attributes
+    '''
+
+    def setUp(self):
+        self.op_type = "rmsprop"
+
+        param = np.random.random((123, 321)).astype("float32")
+        mean_square = np.random.random((123, 321)).astype("float32")
+        learning_rate = np.array([0.01]).astype("float32")
+        grad = np.random.random((123, 321)).astype("float32")
+        moment = np.zeros((123, 321)).astype("float32")
+
+        epsilon = 1.0e-10
+        decay = 0.9
+        momentum = 0.0
+
+        self.inputs = {
+            'Param': param,
+            'MeanSquare': mean_square,
+            'LearningRate': learning_rate,
+            'Grad': grad,
+            'Moment': moment,
+        }
+
+        ms_out = decay * mean_square + (1 - decay) * grad * grad
+        moment_out = momentum * moment + \
+            learning_rate * grad / np.sqrt(ms_out + epsilon)
+        param_out = param - moment_out
+
+        self.outputs = {
+            'ParamOut': param_out,
+            'MomentOut': moment_out,
+            'MeanSquareOut': ms_out
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rnn_helpers.py b/python/paddle/v2/framework/tests/test_rnn_helpers.py
new file mode 100644
index 0000000000000000000000000000000000000000..be0ecfb129aa181229bc42d8d6818ad860991965
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_rnn_helpers.py
@@ -0,0 +1,38 @@
+import unittest
+from paddle.v2.framework.layers import *
+from paddle.v2.framework.framework import g_program
+
+
+class TestRNN(unittest.TestCase):
+    def test_rnn(self):
+        img = data(
+            shape=[
+                80,  # sequence length
+                22,  # image height
+                22
+            ],  # image width
+            data_type='float32',
+            name='image')
+        hidden = fc(input=img, size=100, act='sigmoid', num_flatten_dims=2)
+        self.assertEqual((-1, 80, 100), hidden.shape)
+        hidden = fc(input=hidden, size=100, act='sigmoid', num_flatten_dims=2)
+        self.assertEqual((-1, 80, 100), hidden.shape)
+
+        rnn = StaticRNN()
+        with rnn.step():
+            hidden = rnn.step_input(hidden)
+            self.assertEqual((-1, 100), hidden.shape)
+            memory = rnn.memory(shape=(-1, 32), dtype='float32', init_value=0.0)
+
+            rnn_out = fc(input=[hidden, memory], size=32, act='sigmoid')
+            self.assertEqual((-1, 32), rnn_out.shape)
+            rnn.update_memory(memory, rnn_out)
+            rnn.output(rnn_out)
+
+        out = rnn()
+        self.assertEqual((-1, 80, 32), out.shape)
+        print g_program
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
deleted file mode 100644
index 336645bd993ff743cbe20bb5cae5cd278db57ce7..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import unittest
-import numpy as np
-from op_test import OpTest
-
-
-class TestRowwiseAddOp(OpTest):
-    def setUp(self):
-        self.op_type = "rowwise_add"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [5, 10]).astype("float32"),
-            'b': np.random.uniform(0.1, 1, [10]).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'b'], 'Out')
-
-    def test_check_grad_ingore_b(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('b'))
-
-    def test_check_grad_ingore_x(self):
-        self.check_grad(['b'], 'Out', no_grad_set=set('X'))
-
-
-class TestRowwiseAddOp2(OpTest):
-    def setUp(self):
-        self.op_type = "rowwise_add"
-        self.inputs = {
-            'X': np.random.uniform(0.1, 1, [2, 3, 2, 5]).astype("float32"),
-            'b': np.random.uniform(0.1, 1, [2, 5]).astype("float32")
-        }
-        self.outputs = {'Out': np.add(self.inputs['X'], self.inputs['b'])}
-
-    def test_check_output(self):
-        self.check_output()
-
-    def test_check_grad_normal(self):
-        self.check_grad(['X', 'b'], 'Out')
-
-    def test_check_grad_ignore_b(self):
-        self.check_grad(['X'], 'Out', no_grad_set=set('b'))
-
-    def test_check_grad_ignore_x(self):
-        self.check_grad(['b'], 'Out', no_grad_set=set('X'))
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_save_restore_op.py b/python/paddle/v2/framework/tests/test_save_restore_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a36d03f62a7ad50f656e5c3fdb8c87548a120e8
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_save_restore_op.py
@@ -0,0 +1,71 @@
+import paddle.v2.framework.core as core
+import paddle.v2.framework.framework as framework
+import paddle.v2.framework.executor as executor
+
+import numpy as np
+import unittest
+import os
+import sys
+import shutil
+
+FOLDER_PATH = "./tmp_test_dir"
+
+
+class TestSaveRestoreOp(unittest.TestCase):
+    def test_save_restore_op(self):
+        tensor_1_val = np.random.rand(3, 9).astype("float32")
+        tensor_2_val = np.random.randint(0, 20, size=(4, 2)).astype("int32")
+        place = core.CPUPlace()
+
+        program = framework.Program()
+        block = program.global_block()
+        v_a = block.create_var(
+            dtype="float32", shape=[3, 9], lod_level=0, name="tensor_1")
+        v_b = block.create_var(
+            dtype="int32", shape=[4, 2], lod_level=0, name="tensor_2")
+
+        t_1 = core.LoDTensor()
+        t_1.set(tensor_1_val, place)
+        t_2 = core.LoDTensor()
+        t_2.set(tensor_2_val, place)
+        block.append_op(
+            type="save",
+            inputs={"X": [v_a, v_b]},
+            attrs={"folderPath": FOLDER_PATH})
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": [v_a]},
+            attrs={"shape": [2, 2],
+                   "value": 0.0})
+        block.append_op(
+            type="fill_constant",
+            outputs={"Out": [v_b]},
+            attrs={"shape": [2, 2],
+                   "value": 0.0})
+        block.append_op(
+            type="restore",
+            outputs={"Out": [v_a, v_b]},
+            attrs={"folderPath": FOLDER_PATH})
+
+        if os.path.exists(FOLDER_PATH):
+            shutil.rmtree(FOLDER_PATH)
+        os.makedirs(FOLDER_PATH)
+
+        exe = executor.Executor(place)
+        out = exe.run(program,
+                      feed={"tensor_1": t_1,
+                            "tensor_2": t_2},
+                      fetch_list=[v_a, v_b])
+
+        self.assertTrue(os.path.isdir(FOLDER_PATH))
+        self.assertTrue(os.path.isfile(FOLDER_PATH + "/__tensor_1__"))
+        self.assertTrue(os.path.isfile(FOLDER_PATH + "/__tensor_2__"))
+
+        self.assertTrue(np.array_equal(np.array(out[0]), tensor_1_val))
+        self.assertTrue(np.array_equal(np.array(out[1]), tensor_2_val))
+
+        shutil.rmtree(FOLDER_PATH)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scatter_op.py b/python/paddle/v2/framework/tests/test_scatter_op.py
index 33c73c52631a09ea0fefdeb9467991ae9c04321c..1032269d5dfb02e3518b9ef2820d5d0dcc8a51a0 100644
--- a/python/paddle/v2/framework/tests/test_scatter_op.py
+++ b/python/paddle/v2/framework/tests/test_scatter_op.py
@@ -10,7 +10,7 @@ class TestScatterOp(OpTest):
         index_np = np.array([1, 2]).astype("int32")
         updates_np = np.random.random((2, 3)).astype("float32")
         output_np = np.copy(ref_np)
-        output_np[index_np] += updates_np
+        output_np[index_np] = updates_np
         self.inputs = {'Ref': ref_np, 'Index': index_np, 'Updates': updates_np}
         self.outputs = {'Out': output_np}
 
@@ -18,7 +18,7 @@ class TestScatterOp(OpTest):
         self.check_output()
 
     def test_check_grad(self):
-        self.check_grad(['Updates', 'Ref'], 'Out', in_place=True)
+        self.check_grad(['Updates'], 'Out', in_place=True)
 
 
 if __name__ == "__main__":
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py
index 1ce9454067f91f39f01d9eb4c912857464a3c1cb..14743654792716e4a7ebce5238b142addc86337e 100644
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -18,7 +18,7 @@ class TestScope(unittest.TestCase):
     def test_create_var_get_var(self):
         paddle_c = paddle.v2.framework.core
         scope = paddle_c.Scope()
-        var_a = scope.new_var("var_a")
+        var_a = scope.var("var_a")
         self.assertIsNotNone(var_a)
         self.assertIsNotNone(scope.find_var('var_a'))
         scope2 = scope.new_scope()
@@ -27,7 +27,7 @@ class TestScope(unittest.TestCase):
     def test_var_get_int(self):
         paddle_c = paddle.v2.framework.core
         scope = paddle_c.Scope()
-        var = scope.new_var("test_int")
+        var = scope.var("test_int")
         var.set_int(10)
         self.assertTrue(var.is_int())
         self.assertEqual(10, var.get_int())
diff --git a/python/paddle/v2/framework/tests/test_selected_rows.py b/python/paddle/v2/framework/tests/test_selected_rows.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8a930cb08c42b48f678bdd7bdb7698923535d4f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_selected_rows.py
@@ -0,0 +1,38 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+
+
+class TestSelectedRows(unittest.TestCase):
+    def test_selected_rows(self):
+        place = core.CPUPlace()
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+        selected_rows = core.SelectedRows(rows, height)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+        tensor = selected_rows.get_tensor()
+        tensor.set(np_array, place)
+
+        # compare rows
+        self.assertEqual(0, selected_rows.rows()[0])
+        self.assertEqual(4, selected_rows.rows()[1])
+        self.assertEqual(7, selected_rows.rows()[2])
+
+        # compare height
+        self.assertEqual(10, selected_rows.height())
+
+        # compare tensor
+        self.assertAlmostEqual(2.0,
+                               selected_rows.get_tensor().get_float_element(0))
+        self.assertAlmostEqual(1.0,
+                               selected_rows.get_tensor().get_float_element(1))
+        self.assertAlmostEqual(
+            4.0,
+            selected_rows.get_tensor().get_float_element(2 * row_numel + 8))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_seq_concat_op.py b/python/paddle/v2/framework/tests/test_seq_concat_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..abd2ebf0b21a953b76155eb04c57a7b65ac53cbc
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_seq_concat_op.py
@@ -0,0 +1,79 @@
+import unittest
+import numpy as np
+import sys
+from op_test import OpTest
+
+
+class TestConcatOp(OpTest):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((4, 8, 3)).astype('float32')
+        lod1 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        axis = 1
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        outs = []
+        for i in range(4):
+            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
+            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
+            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
+
+        self.outputs = {'Out': np.concatenate(outs, axis=0)}
+
+    def setUp(self):
+        self.op_type = "sequence_concat"
+        self.set_data()
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['x0'], 'Out')
+
+
+class TestConcatOpDiffLod(TestConcatOp):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 6, 3)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((5, 6, 3)).astype('float32')
+        lod1 = [[0, 3, 5], [0, 1, 2, 3, 5]]
+        axis = 0
+        level = 1
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        outs = []
+        for i in range(4):
+            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
+            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
+            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
+
+        self.outputs = {'Out': np.concatenate(outs, axis=0)}
+
+
+class TestConcatOpLevelZero(TestConcatOp):
+    def set_data(self):
+        # two level, batch size is 3
+        x0 = np.random.random((4, 3, 4)).astype('float32')
+        lod0 = [[0, 2, 4], [0, 1, 2, 3, 4]]
+        x1 = np.random.random((5, 3, 4)).astype('float32')
+        lod1 = [[0, 3, 5], [0, 1, 3, 4, 5]]
+        axis = 0
+        level = 0
+        self.inputs = {'X': [('x0', (x0, lod0)), ('x1', (x1, lod1))]}
+        self.attrs = {'axis': axis, 'level': level}
+        outs = []
+        for i in range(2):
+            sub_x0 = x0[lod0[level][i]:lod0[level][i + 1], :]
+            sub_x1 = x1[lod1[level][i]:lod1[level][i + 1], :]
+            outs.append(np.concatenate((sub_x0, sub_x1), axis=axis))
+
+        self.outputs = {'Out': np.concatenate(outs, axis=0)}
+
+
+if __name__ == '__main__':
+    sys.exit(0)  # NOTE(review): exits before unittest.main(); tests never run
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_seq_pool.py b/python/paddle/v2/framework/tests/test_seq_pool.py
index 211086e5f4de32b996f0fa27c2eb52670c2b1e11..0ebf78bf8f02b4b2e5935e3177373b2d3ded7818 100644
--- a/python/paddle/v2/framework/tests/test_seq_pool.py
+++ b/python/paddle/v2/framework/tests/test_seq_pool.py
@@ -82,5 +82,70 @@ class TestSeqSumPool2D(TestSeqAvgPool2D):
             out[i] = np.reshape(sub_x.sum(axis=0), (3, 17))
 
 
+class TestSeqSqrtPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SQRT}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            len = lod[0][i + 1] - lod[0][i]
+            out[i] = sub_x.sum(axis=0) / np.sqrt(len)
+
+
+class TestSeqSqrtPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.SQRT}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            len = lod[0][i + 1] - lod[0][i]
+            out[i] = np.reshape(sub_x.sum(axis=0) / np.sqrt(len), (3, 17))
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.06)
+
+
+class TestSeqLastPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.LAST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x[-1, :]
+
+
+class TestSeqLastPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.LAST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x[-1, :], (3, 17))
+
+
+class TestSeqFirstPool(TestSeqAvgPool):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.FIRST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            out[i] = sub_x[0, :]
+
+
+class TestSeqFirstPool2D(TestSeqAvgPool2D):
+    def compute(self):
+        self.attrs = {'strategy': SeqPoolType.FIRST}
+        x, lod = self.inputs['X']
+        out = self.outputs['Out']
+        for i in range(4):
+            sub_x = np.reshape(x[lod[0][i]:lod[0][i + 1], :], (-1, 3 * 17))
+            out[i] = np.reshape(sub_x[0, :], (3, 17))
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sequence_softmax_op.py b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..b54a56aa6d3f76baa4d1fc6ba8f963332deba002
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_sequence_softmax_op.py
@@ -0,0 +1,38 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    shiftx = x - np.max(x).clip(-64.)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+
+
+class TestSequenceSoftmaxOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_softmax"
+        x = np.random.uniform(0.1, 1, (11, 1)).astype("float32")
+        lod = [[0, 4, 5, 8, 11]]
+
+        out = np.zeros((11, 1)).astype("float32")
+        for i in range(4):
+            sub_x = x[lod[0][i]:lod[0][i + 1], :]
+            sub_x = sub_x.reshape(1, lod[0][i + 1] - lod[0][i])
+            sub_out = stable_softmax(sub_x)
+            out[lod[0][i]:lod[0][i + 1], :] = sub_out.reshape(
+                lod[0][i + 1] - lod[0][i], 1)
+
+        self.inputs = {"X": (x, lod)}
+        self.outputs = {"Out": out}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out", max_relative_error=0.01)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
index 64e54d1500c1bc134cc1efe33d41a16dbc08f2d4..01262bba4d43adaed179baef88ccab6e69b0884b 100644
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -1,5 +1,7 @@
 import unittest
 import numpy as np
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
 from op_test import OpTest
 
 
@@ -8,15 +10,79 @@ class TestSGDOp(OpTest):
         self.op_type = "sgd"
         w = np.random.random((102, 105)).astype("float32")
         g = np.random.random((102, 105)).astype("float32")
-        lr = 0.1
+        lr = np.array([0.1]).astype("float32")
 
-        self.inputs = {'param': w, 'grad': g}
-        self.attrs = {'learning_rate': lr}
-        self.outputs = {'param_out': w - lr * g}
+        self.inputs = {'Param': w, 'Grad': g, 'LearningRate': lr}
+        self.outputs = {'ParamOut': w - lr * g}
 
     def test_check_output(self):
         self.check_output()
 
 
+class TestSparseSGDOp(unittest.TestCase):
+    def check_with_place(self, place):
+        scope = core.Scope()
+
+        # create and initialize Grad Variable
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        np_array = np.ones((len(rows), row_numel)).astype("float32")
+        np_array[0, 0] = 2.0
+        np_array[2, 8] = 4.0
+
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(np_array, place)
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param.set(param_array, place)
+
+        # create and initialize LearningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and run sgd operator
+        sgd_op = Operator(
+            "sgd",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            LearningRate='LearningRate')
+        ctx = core.DeviceContext.create(place)
+        sgd_op.run(scope, ctx)
+
+        # get and compare result
+        result_array = np.array(param)
+
+        # rows[0] = 0, 5.0 - 2.0 * 2.0
+        self.assertAlmostEqual(1.0, result_array[rows[0], 0])
+        # rows[0] = 0, 5.0 - 2.0 * 1.0
+        self.assertAlmostEqual(3.0, result_array[rows[0], 2])
+        # 5.0 - 2.0 * 0.0
+        self.assertAlmostEqual(5.0, result_array[1, 0])
+        # rows[1] = 4, 5.0 - 2.0 * 1.0
+        self.assertAlmostEqual(3.0, result_array[rows[1], 10])
+        # 5.0 - 2.0 * 0.0
+        self.assertAlmostEqual(5.0, result_array[5, 8])
+        # rows[2] = 7, 5.0 - 2.0 * 1.0
+        self.assertAlmostEqual(3.0, result_array[rows[2], 1])
+        # rows[2] = 7, 5.0 - 2.0 * 4.0
+        self.assertAlmostEqual(-3.0, result_array[rows[2], 8])
+
+    def test_sparse_sgd(self):
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.GPUPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e53856b38aa5ddd6061b350a66e9fe86bc23923c
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -0,0 +1,66 @@
+import numpy as np
+from op_test import OpTest
+from scipy.special import logit
+from scipy.special import expit
+
+
+class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
+    '''Test sigmoid_cross_entropy_with_logits op with binary (0/1) labels
+    '''
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Labels': np.random.randint(0, 2, (batch_size, num_classes))
+            .astype("float32")
+        }
+
+        # Reference forward pass: elementwise sigmoid followed by elementwise
+        # logistic loss, i.e.
+        # Out = Labels * -log(sigmoid(X)) + (1 - Labels) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
+    '''Test sigmoid_cross_entropy_with_logits op with probabilistic labels
+    '''
+
+    def setUp(self):
+        self.op_type = "sigmoid_cross_entropy_with_logits"
+        batch_size = 64
+        num_classes = 20
+        self.inputs = {
+            'X': logit(
+                np.random.uniform(0, 1, (batch_size, num_classes))
+                .astype("float32")),
+            'Labels': np.random.uniform(0, 1, (batch_size, num_classes))
+            .astype("float32")
+        }
+
+        # Reference forward pass: elementwise sigmoid followed by elementwise
+        # logistic loss, i.e.
+        # Out = Labels * -log(sigmoid(X)) + (1 - Labels) * -log(1 - sigmoid(X))
+        sigmoid_X = expit(self.inputs['X'])
+        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
+        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        self.outputs = {'Out': -term1 - term2}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
diff --git a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
index 428395b76c8fbcbc07b19ee1979419f0e64aca85..05ba954c0b8655b92b12f9cc686ef048c4d84bbc 100644
--- a/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_with_cross_entropy_op.py
@@ -43,7 +43,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
     def setUp(self):
         self.op_type = "softmax_with_cross_entropy"
         batch_size = 2
-        class_num = 17
+        class_num = 37
 
         logits = np.random.uniform(0.1, 1.0,
                                    [batch_size, class_num]).astype("float32")
@@ -57,7 +57,7 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
 
         self.inputs = {"Logits": logits, "Label": labels}
         self.outputs = {"Softmax": softmax, "Loss": cross_entropy}
-        self.attrs = {"softLabel": True}
+        self.attrs = {"soft_label": True}
 
     def test_check_output(self):
         self.check_output()
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py
index 8cd93b35d7d1cb7d3b4a19e0e402ef576f1c0982..e0cd2fa8aaf2db2991ad2b9a3053f0d00b509cd4 100644
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -6,7 +6,7 @@ import numpy
 class TestTensor(unittest.TestCase):
     def test_int_tensor(self):
         scope = core.Scope()
-        var = scope.new_var("test_tensor")
+        var = scope.var("test_tensor")
         place = core.CPUPlace()
 
         tensor = var.get_tensor()
@@ -25,7 +25,7 @@ class TestTensor(unittest.TestCase):
 
     def test_float_tensor(self):
         scope = core.Scope()
-        var = scope.new_var("test_tensor")
+        var = scope.var("test_tensor")
         place = core.CPUPlace()
 
         tensor = var.get_tensor()
@@ -46,7 +46,7 @@ class TestTensor(unittest.TestCase):
     def test_int_lod_tensor(self):
         place = core.CPUPlace()
         scope = core.Scope()
-        var_lod = scope.new_var("test_lod_tensor")
+        var_lod = scope.var("test_lod_tensor")
         lod_tensor = var_lod.get_tensor()
 
         lod_tensor.set_dims([4, 4, 6])
@@ -68,7 +68,7 @@ class TestTensor(unittest.TestCase):
     def test_float_lod_tensor(self):
         place = core.CPUPlace()
         scope = core.Scope()
-        var_lod = scope.new_var("test_lod_tensor")
+        var_lod = scope.var("test_lod_tensor")
 
         lod_tensor = var_lod.get_tensor()
         lod_tensor.set_dims([5, 2, 3, 4])
diff --git a/python/paddle/v2/framework/tests/test_tensor_array.py b/python/paddle/v2/framework/tests/test_tensor_array.py
new file mode 100644
index 0000000000000000000000000000000000000000..50b3e09162a24201ee45cbd017dfef8a60f0da78
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_tensor_array.py
@@ -0,0 +1,106 @@
+import logging
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+
+
+class TestTensorArray(unittest.TestCase):
+    def setUp(self):
+        self.ta = core.TensorArray()
+
+        self.batch_size = 10
+        self.dim = 2
+
+        # create a LoDTensor
+        self.scope = core.Scope()
+        var = self.scope.var("test_tensor")
+        self.place = core.CPUPlace()
+        tensor = var.get_tensor()
+        tensor.set_dims([self.batch_size, self.dim])
+        tensor.alloc_float(self.place)
+        tensor_array = np.array(tensor)
+        tensor_array[0, 0] = 0
+        tensor_array[1, 0] = 1
+        tensor_array[2, 0] = 2
+        tensor_array[3, 0] = 3
+        tensor_array[4, 0] = 4
+        tensor_array[5, 0] = 5
+        tensor_array[6, 0] = 6
+        tensor_array[7, 0] = 7
+        tensor_array[8, 0] = 8
+        tensor_array[9, 0] = 9
+
+        lod_py = [[0, 2, 5, 10]]
+        lod_tensor = core.LoDTensor(lod_py)
+        lod_tensor.set(tensor_array, self.place)
+
+        self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]]
+
+        self.tensor = lod_tensor
+
+    def test_unstack(self):
+        self.ta.unstack(self.tensor)
+        self.assertEqual(self.tensor.get_dims()[0], self.ta.size())
+
+    def test_read(self):
+        self.ta.unstack(self.tensor)
+        for i in range(self.batch_size):
+            tensor = self.ta.read(i)
+
+    def test_write(self):
+        self.ta.unstack(self.tensor)
+
+        # create a tensor with shape of [1, self.dim]
+        var = self.scope.var("hell")
+        tensor = var.get_tensor()
+        tensor.set_dims([1, self.dim])
+        tensor.alloc_float(self.place)
+        tensor_array = np.array(tensor)
+        for i in range(self.dim):
+            tensor_array[0, i] = i
+        tensor.set(tensor_array, self.place)
+
+        self.ta.write(2, tensor)
+
+        ta_tensor = self.ta.read(2)
+        ta_tensor_array = np.array(ta_tensor)
+        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
+        self.assertTrue((tensor_array == ta_tensor_array).all())
+
+    def test_write_shared(self):
+        self.ta.unstack(self.tensor)
+
+        # create a tensor with shape of [1, self.dim]
+        var = self.scope.var("hell")
+        tensor = var.get_tensor()
+        tensor.set_dims([1, self.dim])
+        tensor.alloc_float(self.place)
+        tensor_array = np.array(tensor)
+        for i in range(self.dim):
+            tensor_array[0, i] = i
+        tensor.set(tensor_array, self.place)
+
+        self.ta.write_shared(2, tensor)
+
+        ta_tensor = self.ta.read(2)
+        ta_tensor_array = np.array(ta_tensor)
+        self.assertEqual(ta_tensor.get_dims(), [1, self.dim])
+        self.assertTrue((tensor_array == ta_tensor_array).all())
+
+    def test_unpack(self):
+        meta = self.ta.unpack(self.tensor, 0, True)
+        self.assertEqual(self.ta.size(), 5)
+        self.assertEqual(meta, self.py_seq_meta)
+
+    def test_pack(self):
+        # Round trip: unpack the LoD tensor, then pack it back, and check
+        # that both the data and the LoD match the original tensor.
+        meta = self.ta.unpack(self.tensor, 0, True)
+        tensor = self.ta.pack(0, meta, self.tensor.lod())
+        packed_equal = (np.array(self.tensor) == np.array(tensor)).all()
+        self.assertTrue(packed_equal)
+        self.assertEqual(tensor.lod(), self.tensor.lod())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_uniform_random_op.py b/python/paddle/v2/framework/tests/test_uniform_random_op.py
index 30c59789d395b2b8d4b3019cf769c5bae029d91e..ded777105e0fc64eb82bf4013bfba7ba9d0ddefa 100644
--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
@@ -14,12 +14,12 @@ class TestUniformRandomOp(unittest.TestCase):
 
     def uniform_random_test(self, place):
         scope = core.Scope()
-        scope.new_var('X').get_tensor()
+        scope.var('X').get_tensor()
 
         op = Operator(
             "uniform_random",
             Out='X',
-            dims=[1000, 784],
+            shape=[1000, 784],
             min=-5.0,
             max=10.0,
             seed=10)
diff --git a/python/paddle/v2/framework/tests/test_variable.py b/python/paddle/v2/framework/tests/test_variable.py
new file mode 100644
index 0000000000000000000000000000000000000000..c670ca19afbd778747303cb002666aa2a5e62c37
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_variable.py
@@ -0,0 +1,48 @@
+import unittest
+from paddle.v2.framework.framework import Variable, g_program, Program
+import paddle.v2.framework.core as core
+import numpy as np
+
+
+class TestVariable(unittest.TestCase):
+    def test_np_dtype_convert(self):
+        DT = core.DataType
+        convert = Variable._convert_np_dtype_to_dtype_
+        self.assertEqual(DT.FP32, convert(np.float32))
+        self.assertEqual(DT.FP16, convert("float16"))
+        self.assertEqual(DT.FP64, convert("float64"))
+        self.assertEqual(DT.INT32, convert("int32"))
+        self.assertEqual(DT.INT16, convert("int16"))
+        self.assertEqual(DT.INT64, convert("int64"))
+        self.assertEqual(DT.BOOL, convert("bool"))
+        self.assertRaises(ValueError, lambda: convert("int8"))
+
+    def test_var(self):
+        b = g_program.current_block()
+        w = b.create_var(
+            dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
+        self.assertNotEqual(str(w), "")
+        self.assertEqual(core.DataType.FP64, w.data_type)
+        self.assertEqual((784, 100), w.shape)
+        self.assertEqual("fc.w", w.name)
+        self.assertEqual(0, w.lod_level)
+
+        w = b.create_var(name='fc.w')
+        self.assertEqual(core.DataType.FP64, w.data_type)
+        self.assertEqual((784, 100), w.shape)
+        self.assertEqual("fc.w", w.name)
+        self.assertEqual(0, w.lod_level)
+
+        self.assertRaises(ValueError,
+                          lambda: b.create_var(name="fc.w", shape=(24, 100)))
+
+    def test_step_scopes(self):
+        prog = Program()
+        b = prog.current_block()
+        var = b.create_var(
+            name='step_scopes', type=core.VarDesc.VarType.STEP_SCOPES)
+        self.assertEqual(core.VarDesc.VarType.STEP_SCOPES, var.type)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_word2vec.py b/python/paddle/v2/framework/tests/test_word2vec.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5d98035156c425ab97d2bf75f8f09c71884368f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_word2vec.py
@@ -0,0 +1,165 @@
+import paddle.v2 as paddle
+import paddle.v2.framework.layers as layers
+import paddle.v2.framework.core as core
+import paddle.v2.framework.optimizer as optimizer
+
+from paddle.v2.framework.framework import Program, g_program
+from paddle.v2.framework.executor import Executor
+
+import numpy as np
+
+init_program = Program()
+program = Program()
+
+embed_size = 32
+hidden_size = 256
+N = 5
+batch_size = 32
+
+word_dict = paddle.dataset.imikolov.build_dict()
+dict_size = len(word_dict)
+
+first_word = layers.data(
+    name='firstw',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+second_word = layers.data(
+    name='secondw',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+third_word = layers.data(
+    name='thirdw',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+forth_word = layers.data(
+    name='forthw',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+next_word = layers.data(
+    name='nextw',
+    shape=[1],
+    data_type='int32',
+    program=program,
+    init_program=init_program)
+
+embed_param_attr_1 = {
+    'name': 'shared_w',
+    'init_attr': {
+        'max': 1.0,
+        'type': 'uniform_random',
+        'min': -1.0
+    }
+}
+embed_param_attr_2 = {'name': 'shared_w'}
+
+embed_first = layers.embedding(
+    input=first_word,
+    size=[dict_size, embed_size],
+    data_type='float32',
+    param_attr=embed_param_attr_1,
+    program=program,
+    init_program=init_program)
+embed_second = layers.embedding(
+    input=second_word,
+    size=[dict_size, embed_size],
+    data_type='float32',
+    param_attr=embed_param_attr_2,
+    program=program,
+    init_program=init_program)
+
+embed_third = layers.embedding(
+    input=third_word,
+    size=[dict_size, embed_size],
+    data_type='float32',
+    param_attr=embed_param_attr_2,
+    program=program,
+    init_program=init_program)
+embed_forth = layers.embedding(
+    input=forth_word,
+    size=[dict_size, embed_size],
+    data_type='float32',
+    param_attr=embed_param_attr_2,
+    program=program,
+    init_program=init_program)
+
+concat_embed = layers.concat(
+    input=[embed_first, embed_second, embed_third, embed_forth],
+    axis=1,
+    program=program,
+    init_program=init_program)
+
+hidden1 = layers.fc(input=concat_embed,
+                    size=hidden_size,
+                    act='sigmoid',
+                    program=program,
+                    init_program=init_program)
+predict_word = layers.fc(input=hidden1,
+                         size=dict_size,
+                         act='softmax',
+                         program=program,
+                         init_program=init_program)
+cost = layers.cross_entropy(
+    input=predict_word,
+    label=next_word,
+    program=program,
+    init_program=init_program)
+avg_cost = layers.mean(x=cost, program=program, init_program=init_program)
+
+sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001)
+opts = sgd_optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.dataset.imikolov.train(word_dict, N), batch_size)
+
+place = core.CPUPlace()
+exe = Executor(place)
+
+exe.run(init_program, feed={}, fetch_list=[])
+PASS_NUM = 100
+for pass_id in range(PASS_NUM):
+    for data in train_reader():
+        input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
+        input_data = map(lambda x: np.array(x).astype("int32"), input_data)
+        input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
+        # input_data[i] holds the i-th word of every sample in the batch
+        first_data = input_data[0]
+        first_tensor = core.LoDTensor()
+        first_tensor.set(first_data, place)
+
+        second_data = input_data[1]
+        second_tensor = core.LoDTensor()
+        second_tensor.set(second_data, place)
+
+        third_data = input_data[2]
+        third_tensor = core.LoDTensor()
+        third_tensor.set(third_data, place)
+
+        forth_data = input_data[3]
+        forth_tensor = core.LoDTensor()
+        forth_tensor.set(forth_data, place)
+
+        next_data = input_data[4]
+        next_tensor = core.LoDTensor()
+        next_tensor.set(next_data, place)
+
+        outs = exe.run(program,
+                       feed={
+                           'firstw': first_tensor,
+                           'secondw': second_tensor,
+                           'thirdw': third_tensor,
+                           'forthw': forth_tensor,
+                           'nextw': next_tensor
+                       },
+                       fetch_list=[avg_cost])
+        out = np.array(outs[0])
+        if out[0] < 10.0:
+            exit(0)  # if avg cost less than 10.0, we think our code is good.
+exit(1)
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index e80456d9bbeb3c34ac9eab873a84dbf8f06e34df..9148cb56cf78e1ebb994f4a4a34d4a1b6e2e6ef4 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -96,6 +96,9 @@ class Inference(object):
             for i, item in enumerate(result):
                 retv[i].append(item)
 
+        if retv == None:
+            return []
+
         if flatten_result:
             retv = [numpy.concatenate(out) for out in retv]
 
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index 4cfd91882e2d5f0098d27b8897359152ddd94dda..bd97dc1199fedc8ac91c1c6086957e8cce88bdc4 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -101,6 +101,10 @@ class Parameters(object):
 
         self.__param_conf__[param_conf.name] = param_conf
 
+    def update_param_conf(self, model_config):
+        for p in model_config.parameters:
+            self.__param_conf__[p.name] = p
+
     def keys(self):
         """
         keys are the names of each parameter.
@@ -322,6 +326,17 @@ class Parameters(object):
         self.set(name, arr.reshape(self.get_shape(name)))
 
     def to_tar(self, f):
+        """
+        Save parameters to a tar file.
+
+        WARNING: You should use `paddle.v2.trainer.SGD.save_parameter_to_tar(f)`
+            to save parameters most of the time. Otherwise, some settings such
+            as model average will not take effect.
+
+        :param f: file object that the tar archive is written to.
+        :type f: file
+        :return:
+        """
         tar = tarfile.TarFile(fileobj=f, mode='w')
         for nm in self.names():
             buf = cStringIO.StringIO()
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
index b7791559594321a85f41b508b69efeb077d69595..b4333ed530ce464095ec38d72706949cc464fbe4 100644
--- a/python/paddle/v2/tests/CMakeLists.txt
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -5,3 +5,4 @@ py_test(test_topology SRCS test_topology.py)
 py_test(test_rnn_layer SRCS test_rnn_layer.py)
 py_test(test_parameters SRCS test_parameters.py)
 py_test(test_data_feeder SRCS test_data_feeder.py)
+py_test(test_paramconf_order SRCS test_paramconf_order.py)
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
index 83da678da387ed1c86868847f140c6c09fbec3b5..63905c04cf737d0f1d226a4a5a27777351dbf5a3 100644
--- a/python/paddle/v2/tests/test_data_feeder.py
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -97,7 +97,7 @@ class DataFeederTest(unittest.TestCase):
             each_sample.append(zip(a, b))
             data.append(each_sample)
 
-        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+        feeder = DataFeeder([('input', data_type.sparse_float_vector(dim))],
                             {'input': 0})
         arg = feeder(data)
         output = arg.getSlotValue(0)
diff --git a/python/paddle/v2/tests/test_paramconf_order.py b/python/paddle/v2/tests/test_paramconf_order.py
new file mode 100644
index 0000000000000000000000000000000000000000..41fea64122b81948d57cce07f00d764e4889da66
--- /dev/null
+++ b/python/paddle/v2/tests/test_paramconf_order.py
@@ -0,0 +1,85 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import math
+import paddle.v2 as paddle
+
+
+def wordemb(inlayer):
+    # Table projection of size 5 over `inlayer`; every call passes the same
+    # parameter name "_proj".
+    proj_attr = paddle.attr.Param(
+        name="_proj", initial_std=0.001, learning_rate=1, l2_rate=0)
+    return paddle.layer.table_projection(
+        input=inlayer, size=5, param_attr=proj_attr)
+
+
+def train():
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    # Every layer takes integer value of range [0, dict_size)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(name="fc1",
+                              input=contextemb,
+                              size=128,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(5 * 8),
+                                  learning_rate=1,
+                                  l2_rate=6e-4))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    return paddle.layer.classification_cost(input=predictword, label=nextword)
+
+
+class TestParamConfOrder(unittest.TestCase):
+    def test_param_conf_order(self):
+        paddle.init()
+        cost = train()
+        parameters = paddle.parameters.create(cost)
+        adagrad = paddle.optimizer.AdaGrad(
+            learning_rate=3e-3,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+
+        trainer = paddle.trainer.SGD(cost, parameters, adagrad)
+        for p in trainer.get_topology_proto().parameters:
+            if p.name == "_fc1.w0":
+                self.assertEqual(p.decay_rate, 6e-4)
+            else:
+                self.assertEqual(p.decay_rate, 8e-4)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index 2db66be2505dde38a501edf45984e1f36beb351d..923ccecb0bf1236b4a3768fdc07dc3027e2863b7 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -19,6 +19,7 @@ import paddle.trainer_config_helpers as conf_helps
 import layer as v2_layer
 import config_base
 import cPickle
+from paddle.trainer import config_parser as cp
 
 __all__ = ['Topology']
 
@@ -50,6 +51,35 @@ class Topology(object):
 
         assert isinstance(self.__model_config__, ModelConfig)
 
+    def update_from_default(self):
+        # HACK(typhoonzero): refresh each ParameterConfig(proto) with the
+        # global defaults, in case optimizers are defined after layers, or
+        # between layers. Must be called from trainer.__init__()
+        for param in self.__model_config__.parameters:
+            # A field is overwritten only while it still equals the value
+            # tested in its condition below.
+            if cp.g_default_momentum and param.momentum == 0.0:
+                param.momentum = cp.g_default_momentum
+            if cp.g_default_decay_rate and param.decay_rate == 0.0:
+                param.decay_rate = cp.g_default_decay_rate
+            if param.initial_mean == 0.0:
+                param.initial_mean = cp.g_default_initial_mean
+            if param.initial_std == 0.01:
+                param.initial_std = cp.g_default_initial_std
+            if param.initial_strategy == 0:
+                param.initial_strategy = cp.g_default_initial_strategy
+            if not param.initial_smart:
+                param.initial_smart = cp.g_default_initial_smart
+            batches_reg = cp.g_default_num_batches_regularization
+            if batches_reg and param.num_batches_regularization == 1:
+                param.num_batches_regularization = batches_reg
+            clip_threshold = cp.g_default_gradient_clipping_threshold
+            if clip_threshold and param.gradient_clipping_threshold == 0.0:
+                param.gradient_clipping_threshold = clip_threshold
+            if cp.g_default_device and param.device == -1:
+                param.device = cp.g_default_device
+            # FIXME(typhoonzero): ignored: update_hooks, g_default_compact_func
+
     def use_sparse_updater(self):
         """
         check if any parameter require to use sparse_update
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index ca95ef13bd440ac0ba3d46f6e4680d4d7aa94c42..b68fd0d5a97a7993ddd0a1d947304fa5428c01b8 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -64,6 +64,11 @@ class SGD(object):
                             "paddle.v2.optimizer.Optimizer")
         import py_paddle.swig_paddle as api
         topology = Topology(cost, extra_layers=extra_layers)
+        # HACK(typhoonzero): update ParameterConfig(proto) in case of optimizers
+        # are defined after layers, or between layers.
+        topology.update_from_default()
+        parameters.update_param_conf(topology.proto())
+
         self.__optimizer__ = update_equation
         self.__topology__ = topology
         self.__parameters__ = parameters
@@ -91,6 +96,9 @@ class SGD(object):
         self.__parameters__.append_gradient_machine(gm)
         self.__parameter_updater__ = None
 
+    def get_topology_proto(self):
+        return self.__topology_in_proto__
+
     def __use_remote_sparse_updater__(self):
         return self.__use_sparse_updater__ and not self.__is_local__
 
@@ -164,11 +172,18 @@ class SGD(object):
                                                           pass_type)
                 self.__gradient_machine__.eval(pass_evaluator)
                 self.__gradient_machine__.eval(batch_evaluator)
+                event_handler(
+                    v2_event.EndForwardBackward(
+                        pass_id=pass_id,
+                        batch_id=batch_id,
+                        gm=self.__gradient_machine__))
                 for each_param in self.__gradient_machine__.getNonStaticParameters(
                 ):
                     self.__parameter_updater__.update(each_param)
                 cost_sum = out_args.sum()
                 cost = cost_sum / len(data_batch)
+                self.__parameter_updater__.finishBatch(cost)
+                batch_evaluator.finish()
                 event_handler(
                     v2_event.EndIteration(
                         pass_id=pass_id,
@@ -176,8 +191,6 @@ class SGD(object):
                         cost=cost,
                         evaluator=batch_evaluator,
                         gm=self.__gradient_machine__))
-                self.__parameter_updater__.finishBatch(cost)
-                batch_evaluator.finish()
 
             self.__parameter_updater__.finishPass()
             pass_evaluator.finish()
diff --git a/v1_api_demo/README.md b/v1_api_demo/README.md
index 9442f76941287a710220f07cf7dbb29ebcadabdc..0460a85fae078800332982751a5d4a9644c50bd6 100644
--- a/v1_api_demo/README.md
+++ b/v1_api_demo/README.md
@@ -1,4 +1,4 @@
-The examples in v1_api_demo are using v1_api now, and will be upgraded into v2_api later.
+The examples in v1_api_demo are using v1_api currently, and will be upgraded to v2_api later.
 Thus, v1_api_demo is a temporary directory. We decide not to maintain it and will delete it in future.
 
 Please go to [PaddlePaddle/book](https://github.com/PaddlePaddle/book) and