diff --git a/.gitignore b/.gitignore
index 275173b9677bffe028152fe8eadb3384329aeb5a..c84b2fc8c79d6e2c9c83e2b830ab176295846fd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,9 @@ third_party/
 
 # clion workspace.
 cmake-build-*
+
+# generated while compiling
+python/paddle/v2/framework/core.so
+CMakeFiles
+cmake_install.cmake
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 61b989dc698798eca932516e558c63f107ef2754..2ca988c406ae2987e26ca37dbc17cc0a2af43743 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,14 +17,20 @@
     -   id: detect-private-key
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
     -   id: end-of-file-fixer
--   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
-    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
+-   repo: local
     hooks:
-    -   id: clang-formater
--   repo: https://github.com/dnephin/pre-commit-golang
-    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    -   id: clang-format
+        name: clang-format
+        description: Format files with ClangFormat.
+        entry: clang-format -i
+        language: system
+        files: \.(c|cc|cxx|cpp|h|hpp|hxx)$
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
+    sha: 8337620115c25ff8333f1b1a493bd031049bd7c0
     hooks:
-      -   id: go-fmt
-          files: (.*\.go)
-      -   id: go-lint
-          files: (.*\.go)
+    -   id: go-fmt
+        types:
+        - go
+    -   id: gometalinter
+        types:
+        - go
diff --git a/.travis.yml b/.travis.yml
index 498674469b27f585af798b95f30b74ebed99e32c..376c693602b56fe719decfeb41c217497e143e12 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
@@ -11,6 +12,7 @@ os:
 env:
   - JOB=build_doc
   - JOB=check_style
+  - JOB=build_android
 addons:
   apt:
     packages:
@@ -39,6 +41,8 @@ before_install:
   - pip install rarfile
   - curl https://glide.sh/get | bash
   - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15a7c6b07417adfacd461e95c0b92f658e1e11cc..c7d743e193e7d32dbc0b56f3bcb05b6c61f85f1d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,7 +13,6 @@
 # limitations under the License
 
 cmake_minimum_required(VERSION 3.0)
-
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})
@@ -28,13 +27,17 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-find_package(Boost QUIET)
+if(NOT ANDROID)
+    find_package(Boost QUIET)
+endif()
 
 include(simd)
 
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -73,6 +76,10 @@ if(ANDROID)
         "Disable PYTHON when cross-compiling for Android" FORCE)
     set(WITH_RDMA OFF CACHE STRING
         "Disable RDMA when cross-compiling for Android" FORCE)
+    set(WITH_MKLDNN OFF CACHE STRING
+        "Disable MKLDNN when cross-compiling for Android" FORCE)
+    set(WITH_MKLML OFF CACHE STRING
+        "Disable MKLML package when cross-compiling for Android" FORCE)
 endif(ANDROID)
 
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -86,6 +93,7 @@ endif()
 
 ########################################################################################
 
+include(external/mklml)     # download mklml package
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@@ -93,10 +101,12 @@ include(external/gtest)     # download, build, install gtest
 include(external/protobuf)  # download, build, install protobuf
 include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
+include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
+include(external/pybind11)    # download pybind11
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
@@ -133,12 +143,21 @@ if(WITH_GPU)
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 
+if(WITH_MKLDNN)
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
+endif()
+
 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 
 add_subdirectory(proto)
 
+# "add_subdirectory(go)" should be placed after the following loine,
+# because it depends on paddle/optimizer.
+add_subdirectory(paddle/optimizer)
+
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
 if(WITH_GOLANG)
@@ -146,7 +165,9 @@ if(WITH_GOLANG)
 endif(WITH_GOLANG)
 
 add_subdirectory(paddle)
-add_subdirectory(python)
+if(WITH_PYTHON)
+  add_subdirectory(python)
+endif()
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/Dockerfile b/Dockerfile
index ed5910d93b41dba8d50b2ba01c59c635797edd29..5dd9b0be4f7e0a304108abfdfb089fea4faa4d38 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,9 +25,9 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y \
     git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev  \
-    python-numpy python-matplotlib gcc g++ \
+    python-numpy python-matplotlib gcc-4.8 g++-4.8 \
     automake locales clang-format-3.8 swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
diff --git a/Dockerfile.android b/Dockerfile.android
index fa24f6f06c4e76444c83bcf13fe312afdcb6c348..c0fa58c384f9ebcae60477ffce49ea4ffa929db9 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -14,6 +14,17 @@ RUN apt-get update && \
     wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
     apt-get clean -y
 
+# Install Go and glide
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
diff --git a/README.md b/README.md
index fa16cc3cf2ef9c1200a19e03192c94c65fc08679..2a6beeb342b34f8e91ef509d7d41f286a666480c 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -61,35 +61,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org/develop/doc/) and
-[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
+[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
 
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
 
   You might want to start from the this online interactive book that can run in Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
 
    You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
 
    We appreciate your contributions!
 
+
 ## Ask Questions
 
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 913f711afff3b8f9f77b8da978a3b9e7165d0077..854066fd1d205c337fbdbe08997d88251095c799 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -15,23 +15,44 @@
 
 set(CBLAS_FOUND OFF)
 
-## Find MKL First.
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
-set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
+## Find MKLML First.
+if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER MKLML)
+  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
+  set(CBLAS_LIBRARIES ${MKLML_LIB})
+
+  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found cblas and lapack in MKLML "
+    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  return()
+endif()
+
+## Then find MKL.
+set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
+set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
+
+set(MKL_INCLUDE_SEARCH_PATHS
+  ${MKL_ROOT}/include
+  ${INTEL_MKL_ROOT}/include)
+set(MKL_LIB_SEARCH_PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64
+  ${INTEL_MKL_ROOT}/lib
+  ${INTEL_MKL_ROOT}/lib/intel64)
 
 find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 
 if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_FOUND ON)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index a4f98ec7d4af652d0dd0650f4906696ff3a4efb9..69220e03fe8e337205f31cb1f45e3e19ae4f5d1e 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -67,6 +67,30 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if(WITH_MKLDNN)
+    add_definitions(-DPADDLE_USE_MKLDNN)
+    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
+        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
+        set(OPENMP_FLAGS "-fopenmp")
+        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+    else()
+        find_package(OpenMP)
+        if(OPENMP_FOUND)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        else()
+            message(WARNING "Can not find OpenMP."
+                 "Some performance features in MKLDNN may not be available")
+        endif()
+    endif()
+
+endif(WITH_MKLDNN)
+
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
@@ -102,12 +126,19 @@ if(WITH_GOLANG)
       message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
     endif()
 
-    add_custom_target(go_vendor)
-    add_custom_command(TARGET go_vendor
+    # this command will only run when the file it depends is missing
+    # or has changed, or the output is missing.
+    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
       COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+      COMMAND touch ${CMAKE_BINARY_DIR}/glide
+      DEPENDS ${PROJ_ROOT}/go/glide.lock
       WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
-    )
-    add_dependencies(go_vendor go_path)
+      )
+
+    # depends on the custom command which outputs
+    # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
+    # run every time this target is built.
+    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
   endif()
 
 endif(WITH_GOLANG)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 6bbcd730e1b5ac49415cac676352e6df00eb6eb5..656e1a0803c6e389d70f37f592c3aa2e95a2bcd4 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -27,7 +27,8 @@ set(IGNORE_PATTERN
     .*cblas\\.h.*
     .*\\.pb\\.txt
     .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*MultiDataProvider.*
+    .*pb.*)
 
 # add_style_check_target
 #
@@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
                 endif()
             endforeach()
             if(LINT MATCHES ON)
+                # cpplint code style
                 get_filename_component(base_filename ${filename} NAME)
                 set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(OUTPUT ${CUR_GEN}
-                    PRE_BUILD
-                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}"
-                                "--write-success=${CUR_GEN}" ${filename}
-                    DEPENDS ${filename}
+                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                            "--filter=${STYLE_FILTER}"
+                            "--write-success=${CUR_GEN}" ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
             endif()
         endforeach()
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
index 9724c16122ab2e6be55864c8716698c9b9d7c3f0..5e3e437a8da9624df35a5c754fe77be73f20361d 100644
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -106,6 +106,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
                 SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
             ENDIF()
         ENDIF()
+        IF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+        ENDIF()
         SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
     ENDIF()
 
@@ -162,6 +166,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
         ENDIF()
     ENDIF()
 
+    IF(ANDROID_ABI STREQUAL "arm64-v8a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+    ENDIF()
+
     STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
     STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
 
@@ -186,6 +194,10 @@ ELSE()
         SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
     ENDIF()
     SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
-    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 3e6cedbb0d718cfd4454f95dedf7e02a24f2981b..f7483f6be9169eb58f0148cd3a956a8c881e1fe3 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)
 ExternalProject_Add(
     extern_eigen3
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    # for latest version, please get from official website
-    # URL            "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
-    # URL_MD5        "1a47e78efe365a97de0c022d127607c3"
-
-    # for no-ssl http support, please get from bazel's mirror
-    # URL           "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz"
-    # URL_MD5       "4645c66075982da6fa0bcf6b20f3e8f7"
-
-    # get from github mirror
     GIT_REPOSITORY  "https://github.com/RLovelett/eigen.git"
-    GIT_TAG         "a46d2e7337c4656f00abe54a8115f6d76153a048"
+    GIT_TAG         "master"
     PREFIX          ${EIGEN_SOURCE_DIR}
     UPDATE_COMMAND  ""
     CONFIGURE_COMMAND ""
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index bd401faa6eb8a583bce542db68852f8571681daf..8a594a825abdca6a0f989b94fa42f97d6df5e10a 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
 
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
-ADD_DEPENDENCIES(glog extern_glog)
+ADD_DEPENDENCIES(glog extern_glog gflags)
+LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 77e06e983e9f8bfaf6320e3c67b85b692ed877fc..e3970073a1a0b946fa1db6642799719d7a9fcf4f 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -34,9 +34,15 @@ IF(WITH_TESTING)
             "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
     ENDIF(WIN32)
 
+    IF(WITH_MKLML)
+        # wait for mklml downloading completed
+        SET(GTEST_DEPENDS   ${MKLML_PROJECT})
+    ENDIF()
+
     ExternalProject_Add(
         extern_gtest
         ${EXTERNAL_PROJECT_LOG_ARGS}
+        DEPENDS         ${GTEST_DEPENDS}
         GIT_REPOSITORY  "https://github.com/google/googletest.git"
         GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..eff15de73f23db6dea3a7b79006bfec90d712ae5
--- /dev/null
+++ b/cmake/external/mkldnn.cmake
@@ -0,0 +1,72 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLDNN})
+  return()
+ENDIF(NOT ${WITH_MKLDNN})
+
+INCLUDE(ExternalProject)
+
+SET(MKLDNN_PROJECT        "extern_mkldnn")
+SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
+SET(MKLDNN_INSTALL_ROOT   ${CMAKE_INSTALL_PREFIX})
+IF(NOT "$ENV{HOME}" STREQUAL "/root")
+    SET(MKLDNN_INSTALL_ROOT  "$ENV{HOME}")
+ENDIF()
+
+SET(MKLDNN_INSTALL_DIR    "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
+SET(MKLDNN_INCLUDE_DIR    "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
+
+IF(WIN32)
+    MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
+      "Force WITH_MKLDNN=OFF")
+    SET(WITH_MKLDNN OFF)
+    return()
+ELSE(WIN32)
+    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+    MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
+    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+
+IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
+    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
+    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
+    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
+    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
+ENDIF()
+
+ExternalProject_Add(
+    ${MKLDNN_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS             ${MKLDNN_DEPENDS}
+    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_TAG             "v0.9"
+    PREFIX              ${MKLDNN_SOURCES_DIR}
+    CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
+    BUILD_COMMAND       cd <SOURCE_DIR>/build
+                        && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
+                        && $(MAKE)
+    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
+    UPDATE_COMMAND      ""
+)
+
+ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
+LIST(APPEND external_project_dependencies mkldnn)
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3f940756a4abb79aba7d3561db19db8532a0b673
--- /dev/null
+++ b/cmake/external/mklml.cmake
@@ -0,0 +1,64 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLML})
+  return()
+ENDIF(NOT ${WITH_MKLML})
+
+INCLUDE(ExternalProject)
+
+SET(MKLML_PROJECT       "extern_mklml")
+SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
+SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
+SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")
+SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
+IF(NOT "$ENV{HOME}" STREQUAL "/root")
+    SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
+ENDIF()
+
+SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
+SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
+SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
+SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
+SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+
+SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
+FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
+                              "cmake_minimum_required(VERSION 3.0)\n"
+                              "install(DIRECTORY ${MKLML_VER}\n"
+                              "        DESTINATION ${MKLML_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${MKLML_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${MKLML_SOURCE_DIR}
+    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
+                          && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} 
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
+ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+LIST(APPEND external_project_dependencies mklml)
diff --git a/paddle/function/nnpack/nnpack.cmake b/cmake/external/nnpack.cmake
similarity index 54%
rename from paddle/function/nnpack/nnpack.cmake
rename to cmake/external/nnpack.cmake
index 7182730ae8f133bdc4f73bfc46fa8acbe5f3b603..d42bcb0f329041462bd8b568052fbb8226d18e4e 100644
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/cmake/external/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
 
 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
   set(NNPACK_FOUND ON)
   INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
   message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 5b9d9844ed21ceb507a8e01676c3533f4e3dd8fb..60a1041936437775e0994157b8ffcb7c52b7ab87 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND})
             # arm_soft_fp_abi branch of OpenBLAS to support softfp
             #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
             SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+            IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+                SET(TARGET "ARMV7")
+            ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+                SET(TARGET "ARMV8")
+            ENDIF()
+            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
         ELSEIF(RPI)
             # use hardfp
             SET(OPENBLAS_COMMIT "v0.2.19")
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9391c285c7544669a5b1a078b7473d7a656c1bb4
--- /dev/null
+++ b/cmake/external/pybind11.cmake
@@ -0,0 +1,30 @@
+INCLUDE(ExternalProject)
+
+SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+
+ExternalProject_Add(
+        extern_pybind
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
+        GIT_TAG         "v2.1.1"
+        PREFIX          ${PYBIND_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(pybind STATIC ${dummyfile})
+else()
+    add_library(pybind INTERFACE)
+endif()
+
+add_dependencies(pybind extern_pybind)
+
+LIST(APPEND external_project_dependencies pybind)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 6546b2c83bc8f81f89e4018a2216f191bbeb0d21..67a359d4b5f4cca8fc8e74eab4d4acb4cc12baed 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,6 +18,9 @@ INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
 IF(WITH_PYTHON)
     FIND_PACKAGE(PythonLibs 2.7)
+    # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 ENDIF(WITH_PYTHON)
 
 SET(py_env "")
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7a996dea92b13bdac054a987a004a3d54ff02da2..d00a9bb3a30cfb16623e073414088059481c3e1a 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,6 +9,11 @@ function(CheckCompilerCXX11Flag)
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
             message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
         endif()
+        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
+        # Use Debug mode instead for now.
+        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
+            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        endif()
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
         # Apple Clang is a different compiler than upstream Clang which havs different version numbers.
@@ -109,7 +114,9 @@ set(COMMON_FLAGS
     -Wno-unused-function
     -Wno-error=literal-suffix
     -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=parentheses-equality # Warnings in Pybind11
+)
 
 set(GPU_COMMON_FLAGS
     -fPIC
@@ -122,6 +129,7 @@ set(GPU_COMMON_FLAGS
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
     -Wno-error=unused-function  # Warnings in Numpy Header.
+    -Wno-error=array-bounds # Warnings in Eigen::array
 )
 
 if (APPLE)
@@ -150,7 +158,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
+LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread)
 LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
 
 if(CMAKE_BUILD_TYPE  STREQUAL "Debug")
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 725cf28037182bfd0a27d49491bc8a479a9b4440..534be0abe246ac70950d85ad05441825c8ca768a 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -90,10 +90,11 @@
 # including binary directory for generated headers.
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
-if(NOT APPLE)
+if(NOT APPLE AND NOT ANDROID)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
-endif(NOT APPLE)
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+endif(NOT APPLE AND NOT ANDROID)
 
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
@@ -103,6 +104,7 @@ function(merge_static_libs TARGET_NAME)
   foreach(lib ${libs})
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
+  list(REMOVE_DUPLICATES libs_deps)
 
   if(APPLE) # Use OSX's libtool to merge archives
     # To produce a library we need at least one source file.
@@ -126,7 +128,7 @@ function(merge_static_libs TARGET_NAME)
       # Get the file names of the libraries to be merged
       set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
     endforeach()
-		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
   else() # general UNIX: use "ar" to extract objects and re-add to a common lib
@@ -144,11 +146,11 @@ function(merge_static_libs TARGET_NAME)
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
+      # Empty dummy source file that goes into merged library		
+      set(mergebase ${lib}.mergebase.c)		
+      add_custom_command(OUTPUT ${mergebase}		
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
+        DEPENDS ${objlistfile})		
 
       list(APPEND mergebases "${mergebase}")
     endforeach()
@@ -183,6 +185,10 @@ function(cc_library TARGET_NAME)
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
+    
+    # cpplint code style
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+
   else(cc_library_SRCS)
     if (cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -284,8 +290,22 @@ function(go_library TARGET_NAME)
     set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
   endif()
 
-  # Add dummy code to support `make target_name` under Terminal Command
   set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  # This custom command will always run since it depends on a not
+  # existing file.
+  add_custom_command(
+    OUTPUT dummy_rebulid_${TARGET_NAME}
+    COMMAND cmake -E touch ${dummyfile}
+    )
+  # Create a custom target that depends on the custom command output
+  # file, so the custom command can be referenced as a dependency by
+  # `add_dependencies`.
+  add_custom_target(rebuild_${TARGET_NAME}
+    DEPENDS dummy_rebulid_${TARGET_NAME}
+    )
+
+  # Add dummy code to support `make target_name` under Terminal Command
   file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
   if (go_library_SHARED OR go_library_shared)
     add_library(${TARGET_NAME} SHARED ${dummyfile})
@@ -296,11 +316,17 @@ function(go_library TARGET_NAME)
     add_dependencies(${TARGET_NAME} ${go_library_DEPS})
   endif(go_library_DEPS)
 
+  # The "source file" of the library is `${dummyfile}` which never
+  # change, so the target will never rebuild. Make the target depends
+  # on the custom command that touches the library "source file", so
+  # rebuild will always happen.
+  add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME})
+
   set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}")
 
   file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
   string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-  # FIXME: link path
+
   add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
     COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
     # Golang build source code
@@ -308,7 +334,7 @@ function(go_library TARGET_NAME)
     -o "${${TARGET_NAME}_LIB_PATH}"
     "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
     # must run under GOPATH
-  WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
   add_dependencies(${TARGET_NAME} go_vendor)
 endfunction(go_library)
 
@@ -319,14 +345,11 @@ function(go_binary TARGET_NAME)
   cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
-  # FIXME: link path
   add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-      COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
-      GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
     -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
     "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  # TODO: don't know what ${TARGET_NAME}_link does
   add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
   install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 endfunction(go_binary)
@@ -334,15 +357,18 @@ endfunction(go_binary)
 function(go_test TARGET_NAME)
   set(options OPTIONAL)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs DEPS)
   cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
+  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
     -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ${go_test_SRCS}
+    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_test(NAME ${TARGET_NAME}
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
-  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
 
 function(proto_library TARGET_NAME)
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 4f4a9187bcbe8ef902e923622552909808b121d6..372272a53c12c314fc80eebbce5eae9fcabc55ba 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -104,6 +104,11 @@ cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
     :noindex:
+
+row_l2_norm
+-----------
+..  autoclass:: paddle.v2.layer.row_l2_norm
+    :noindex:
     
 Recurrent Layers
 ================
@@ -198,6 +203,10 @@ identity_projection
 ..  autoclass:: paddle.v2.layer.identity_projection
     :noindex:
 
+slice_projection
+-------------------
+..  autoclass:: paddle.v2.layer.slice_projection
+    :noindex:
 
 table_projection
 ----------------
@@ -316,6 +325,11 @@ scaling
 ..  autoclass:: paddle.v2.layer.scaling
     :noindex:
 
+clip
+----
+..  autoclass:: paddle.v2.layer.clip
+    :noindex:
+
 slope_intercept
 ---------------
 ..  autoclass:: paddle.v2.layer.slope_intercept
@@ -474,6 +488,11 @@ prelu
 ..  autoclass:: paddle.v2.layer.prelu
     :noindex:
 
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+
 Detection output Layer
 ======================
 
diff --git a/doc/design/cluster_train/save_model.md b/doc/design/cluster_train/save_model.md
index b70f00176b6701ef487ef88ac0933b9b227037ea..b755185c81ad617b9c85c47de0f5f65d2201c658 100644
--- a/doc/design/cluster_train/save_model.md
+++ b/doc/design/cluster_train/save_model.md
@@ -75,10 +75,11 @@ snapshot to a model will be a TODO for future.
 ### Trainer Election
 
 One trainer will be elected as the one to save the model. When using
-etcd, trainer ID is a randomly generated UUID, we will utilize etcd to
-elect one trainer. When not using etcd, unique trainer IDs will be
-given by the administrator, the trainer whose ID is "0" is elected to
-save the model.
+etcd, trainer ID is a randomly generated UUID, the trainer will
+contact the master server requesting to save the model, and find out
+if itself is elected. When the master server is not used, unique
+trainer IDs will be given by the administrator, the trainer whose ID
+is "0" is elected to save the model.
 
 ### Model Save Path
 
diff --git a/doc/design/scope.md b/doc/design/scope.md
index afe6bc028cafc5ee24b0041905857af58d3f5790..c9e0be716b606f6c7bf0373e0c6e632647e07a6f 100644
--- a/doc/design/scope.md
+++ b/doc/design/scope.md
@@ -37,8 +37,8 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
 ```cpp
 class Scope {
  public:
-  Variable* CreateVariable(const std::string& name);
-  const Variable* GetVariable(const std::string& name) const;
+  Variable* NewVar(const std::string& name);
+  const Variable* FindVar(const std::string& name) const;
 
  private:
     std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
@@ -58,12 +58,12 @@ class Scope {
  public:
   Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
 
-  Variable* GetVariable(const std::string& name) const {
+  Variable* FindVar(const std::string& name) const {
     auto it = vars_.find(name);
     if (it != vars_.end()) {
       return it->second.get();
     } else if (parent_ != nullptr) {
-      return parent_->GetVariable(name);
+      return parent_->FindVar(name);
     } else {
       return nullptr;
     }
@@ -95,10 +95,10 @@ class Scope {
   static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
 
   // return nullptr if not found.
-  Variable* GetVariable(const std::string& name) const;
+  Variable* FindVar(const std::string& name) const;
 
   // return if already contains same name variable.
-  Variable* CreateVariable(const std::string& name);
+  Variable* NewVar(const std::string& name);
 
  private:
   std::shared_ptr<Scope> parent_;
@@ -107,11 +107,11 @@ class Scope {
 ```
 ## Only scope can create a variable
 
-To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `CreateVariable` can construct `Variable`.
+To ensure `only scope can create a variable`, we should mark `Variable`'s constructor as a private member function, and Scope is a friend class of Variable. And then only `NewVar` can construct `Variable`.
 
 ## When scope destroyed, all variables inside this scope should be destroyed together
 
-The scope hold unique pointers for all variables. User can `GetVariable` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.
+The scope hold unique pointers for all variables. User can `FindVar` from scope, but he should not hold this pointer as a member variable. Because when scope is destroyed, all variables inside this scope will be destroyed together.
 
 ## Sharing a parent scope
 
@@ -121,4 +121,4 @@ Also, as the parent scope is a `shared_ptr`, we can only `Create()` a scope shar
 
 ## Orthogonal interface
 
-`GetVariable` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `CreateVariable` will return a `Error` when there is a name conflict locally. Combine `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily.
+`FindVar` will return `nullptr` when `name` is not found. It can be used as `Contains` method. `NewVar` will return a `Error` when there is a name conflict locally. Combine `FindVar` and `NewVar`, we can implement `NewVar` easily.
diff --git a/doc/design/simple_op_design.md b/doc/design/simple_op_design.md
index 49ca5db5da9e400fd2c54eb8903b0dd2eb832d44..5e07c29c56d21728599195d420d3222213d77e7c 100644
--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
@@ -49,6 +49,7 @@ message AttrProto {
 message VarProto {
 	required string name = 1;
 	required string comment = 2;
+	required bool is_tensor = 3;
 };
 
 message OpProto {
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index c14160d55ec8fdb9fc552da33f3a3dac13c1a764..138efb566e43fa71952f057829c2afbca96cadc9 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -311,3 +311,13 @@ Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异
 * 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。
 
 主要的解决办法是减小学习律或者对数据进行归一化处理。
+
+15. 编译安装后执行 import paddle.v2 as paddle 报ImportError: No module named v2
+------------------------------------------------------------------------
+先查看一下是否曾经安装过paddle v1版本，有的话需要先卸载：
+
+pip uninstall py_paddle paddle
+
+然后安装paddle的python环境, 在build目录下执行
+
+pip install python/dist/paddle*.whl && pip install ../paddle/dist/py_paddle*.whl
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
index 9489a921c70ad6ee5709f46445554f5d9640162c..75037e693b32f923ee7dc9dfec322495fe4ce10a 100644
--- a/doc/howto/dev/new_layer_cn.rst
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -37,7 +37,7 @@
 
    \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-假设 :math:`z = f(W^T x + b)` ，那么
+假设 :math:`z = W^T x + b` ，那么
 
 .. math::
 
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It
 
 where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu.
 
-The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. 
+The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
 
 Suppose our loss function is :math:`c(y)`, then
 
@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then
 
    \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then
 
 .. math::
 
@@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.
 Then, for fully connected layer, we need to compute:
 
 .. math::
-  
+
    \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
 
 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
@@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
                       /* weight */ true);
       }
     }
-    
+
 If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
 
 .. code-block:: bash
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index d536f53abc031e9d279ace0e231a381a2f1e81b6..36e5d420c986fc8d88eefee4aa221dba0a0480f2 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使
 
     python -c "import py_paddle"
 
-如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
 注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
 
 如果提示正确，可以执行以下命令编译生成文档，即
@@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程
 如何更新www.paddlepaddle.org文档
 ================================
 
-开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
-`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
+`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
 
 
 
diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt
index f00c70a0589a4f41a23164a95d505d4310d9157b..29ce909c6442014fa0b64c6ca018a61b92c840e9 100644
--- a/go/CMakeLists.txt
+++ b/go/CMakeLists.txt
@@ -17,3 +17,7 @@ add_subdirectory(pserver/client/c)
 add_subdirectory(cmd/pserver)
 add_subdirectory(cmd/master)
 add_subdirectory(master/c)
+add_subdirectory(master)
+add_subdirectory(pserver)
+add_subdirectory(pserver/client)
+add_subdirectory(utils/networkhelper)
diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt
index 1058ffa86b3f00b5e9525edca39a843da9b62db8..9e149967e71c9439bb00b973aa8723a809604aaf 100644
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-go_binary(master SRC master.go DEPS paddle_go_optimizer)
+go_binary(master SRC master.go)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 54fa254863156455f66fa87de9077042a45f9735..739c4c01e02b10f46c36b997f8c4700150da2a26 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main
 
 import (
@@ -5,12 +19,15 @@ import (
 	"net"
 	"net/http"
 	"net/rpc"
+	"os"
+	"os/signal"
 	"strconv"
 	"strings"
 	"time"
 
 	"github.com/namsral/flag"
 	log "github.com/sirupsen/logrus"
+	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +37,18 @@ func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
 	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
 
+	level, e := log.ParseLevel(*logLevel)
+	candy.Must(e)
+
+	log.SetLevel(level)
+
 	if *endpoints == "" {
 		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
 	}
@@ -46,6 +70,20 @@ func main() {
 		store = &master.InMemStore{}
 	}
 
+	shutdown := func() {
+		log.Infoln("shutting down gracefully")
+		err := store.Shutdown()
+		if err != nil {
+			log.Errorln(err)
+		}
+	}
+
+	// Guaranteed to run even panic happens.
+	defer shutdown()
+
+	c := make(chan os.Signal, 1)
+	signal.Notify(c, os.Interrupt)
+
 	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
 	if err != nil {
 		log.Fatal(err)
@@ -62,8 +100,12 @@ func main() {
 		log.Fatal(err)
 	}
 
-	err = http.Serve(l, nil)
-	if err != nil {
-		log.Fatal(err)
-	}
+	go func() {
+		err = http.Serve(l, nil)
+		if err != nil {
+			log.Fatal(err)
+		}
+	}()
+
+	<-c
 }
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index 31ef450f032f756fb32a0444a7e94a18ec2918a0..f9cd8f87e8f2e715c87834ee08482be0f511f681 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -1,13 +1,30 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main
 
 import (
 	"net"
 	"net/http"
 	"net/rpc"
+	"os"
+	"os/signal"
 	"strconv"
 	"time"
 
 	"github.com/namsral/flag"
+	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	log "github.com/sirupsen/logrus"
@@ -18,49 +35,70 @@ func main() {
 	index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
 	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
 		"comma separated endpoint string for pserver to connect to etcd")
-	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
+	dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout")
+	etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
 
 	level, err := log.ParseLevel(*logLevel)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
+
 	log.SetLevel(level)
 
 	var idx int
+
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
 	if *index >= 0 {
 		idx = *index
 	} else {
-		timeout := time.Second * time.Duration((*etcdTimeout))
-		e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
-		idx, err = e.Register()
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL)
+		idx, err = e.Register(*port)
+		candy.Must(err)
+
+		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
 		if err != nil {
-			panic(err)
+			if err == pserver.ErrCheckpointNotFound {
+				log.Infof("Could not find the pserver checkpoint.")
+			} else {
+				log.Errorf("Fetch checkpoint failed, %s", err)
+			}
 		}
 	}
 
-	s, err := pserver.NewService(idx)
-	if err != nil {
-		panic(err)
+	shutdown := func() {
+		log.Infoln("shutting down gracefully")
+		sErr := e.Shutdown()
+		if sErr != nil {
+			log.Errorln(sErr)
+		}
 	}
+
+	// Guaranteed to run even panic happens.
+	defer shutdown()
+
+	c := make(chan os.Signal, 1)
+	signal.Notify(c, os.Interrupt)
+
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
+	candy.Must(err)
+
 	err = rpc.Register(s)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
 
 	rpc.HandleHTTP()
 	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
 
-	log.Infof("start pserver at port %d", *port)
-	err = http.Serve(l, nil)
+	go func() {
+		log.Infof("start pserver at port %d", *port)
+		err = http.Serve(l, nil)
+		candy.Must(err)
+	}()
 
-	if err != nil {
-		panic(err)
-	}
+	<-c
 }
diff --git a/go/connection/conn.go b/go/connection/conn.go
index 977e8cc123707dbcf055bb77399adbc232c575a0..ffa8db689da307277f0943a0a71c7ace5ab21887 100644
--- a/go/connection/conn.go
+++ b/go/connection/conn.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package connection
 
 import (
diff --git a/go/glide.lock b/go/glide.lock
index 190a222338b24b7edac76c72d07df0b2cbd7d9be..1f16abdf66422abcd0ab7987cab3499d02cf1b9c 100644
--- a/go/glide.lock
+++ b/go/glide.lock
@@ -1,15 +1,105 @@
-hash: b8f18ce6784bd3fadd9fed0b8443e7b658234ea785ae1f220723ae2c1f652aa7
-updated: 2017-06-27T14:05:48.925262819+08:00
+hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c
+updated: 2017-07-29T07:34:48.722757905+08:00
 imports:
+- name: github.com/beorn7/perks
+  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
+  subpackages:
+  - quantile
+- name: github.com/boltdb/bolt
+  version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9
+- name: github.com/cockroachdb/cmux
+  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
 - name: github.com/coreos/etcd
-  version: 61fc123e7a8b14a0a258aa3f5c4159861b1ec2e7
+  version: c31bec0f29facff13f7c3e3d948e55dd6689ed42
   subpackages:
+  - alarm
+  - auth
   - auth/authpb
+  - client
   - clientv3
   - clientv3/concurrency
+  - compactor
+  - discovery
+  - embed
+  - error
+  - etcdserver
+  - etcdserver/api
+  - etcdserver/api/v2http
+  - etcdserver/api/v2http/httptypes
+  - etcdserver/api/v3client
+  - etcdserver/api/v3election
+  - etcdserver/api/v3election/v3electionpb
+  - etcdserver/api/v3election/v3electionpb/gw
+  - etcdserver/api/v3lock
+  - etcdserver/api/v3lock/v3lockpb
+  - etcdserver/api/v3lock/v3lockpb/gw
+  - etcdserver/api/v3rpc
   - etcdserver/api/v3rpc/rpctypes
+  - etcdserver/auth
   - etcdserver/etcdserverpb
+  - etcdserver/etcdserverpb/gw
+  - etcdserver/membership
+  - etcdserver/stats
+  - lease
+  - lease/leasehttp
+  - lease/leasepb
+  - mvcc
+  - mvcc/backend
   - mvcc/mvccpb
+  - pkg/adt
+  - pkg/contention
+  - pkg/cors
+  - pkg/cpuutil
+  - pkg/crc
+  - pkg/debugutil
+  - pkg/fileutil
+  - pkg/httputil
+  - pkg/idutil
+  - pkg/ioutil
+  - pkg/logutil
+  - pkg/monotime
+  - pkg/netutil
+  - pkg/pathutil
+  - pkg/pbutil
+  - pkg/runtime
+  - pkg/schedule
+  - pkg/srv
+  - pkg/tlsutil
+  - pkg/transport
+  - pkg/types
+  - pkg/wait
+  - proxy/grpcproxy/adapter
+  - raft
+  - raft/raftpb
+  - rafthttp
+  - snap
+  - snap/snappb
+  - store
+  - version
+  - wal
+  - wal/walpb
+- name: github.com/coreos/go-semver
+  version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6
+  subpackages:
+  - semver
+- name: github.com/coreos/go-systemd
+  version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6
+  subpackages:
+  - daemon
+  - journal
+  - util
+- name: github.com/coreos/pkg
+  version: 3ac0863d7acf3bc44daf49afef8919af12f704ef
+  subpackages:
+  - capnslog
+- name: github.com/dgrijalva/jwt-go
+  version: d2709f9f1f31ebcda9651b03077758c1f3a0018c
+- name: github.com/ghodss/yaml
+  version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7
+- name: github.com/gogo/protobuf
+  version: 909568be09de550ed094403c2bf8a261b5bb730a
+  subpackages:
+  - proto
 - name: github.com/golang/protobuf
   version: 4bd1920723d7b7c925de087aa32e2187708897f7
   subpackages:
@@ -17,12 +107,61 @@ imports:
   - proto
 - name: github.com/golang/snappy
   version: 553a641470496b2327abcac10b36396bd98e45c9
+- name: github.com/google/btree
+  version: 925471ac9e2131377a91e1595defec898166fe49
+- name: github.com/grpc-ecosystem/go-grpc-prometheus
+  version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0
+- name: github.com/grpc-ecosystem/grpc-gateway
+  version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676
+  subpackages:
+  - runtime
+  - runtime/internal
+  - utilities
+- name: github.com/jonboulle/clockwork
+  version: 2eee05ed794112d45db504eb05aa693efd2b8b09
+- name: github.com/matttproud/golang_protobuf_extensions
+  version: c12348ce28de40eed0136aa2b644d0ee0650e56c
+  subpackages:
+  - pbutil
 - name: github.com/namsral/flag
   version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
 - name: github.com/PaddlePaddle/recordio
-  version: edfb82af0739c84f241c87390ec5649c7b28c129
+  version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81
+- name: github.com/prometheus/client_golang
+  version: c5b7fccd204277076155f10851dad72b76a49317
+  subpackages:
+  - prometheus
+- name: github.com/prometheus/client_model
+  version: 6f3806018612930941127f2a7c6c453ba2c527d2
+  subpackages:
+  - go
+- name: github.com/prometheus/common
+  version: 49fee292b27bfff7f354ee0f64e1bc4850462edf
+  subpackages:
+  - expfmt
+  - internal/bitbucket.org/ww/goautoneg
+  - model
+- name: github.com/prometheus/procfs
+  version: a1dba9ce8baed984a2495b658c82687f8157b98f
+  subpackages:
+  - xfs
 - name: github.com/sirupsen/logrus
-  version: 202f25545ea4cf9b191ff7f846df5d87c9382c2b
+  version: a3f95b5c423586578a4e099b11a46c2479628cac
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
+- name: github.com/ugorji/go
+  version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74
+  subpackages:
+  - codec
+- name: github.com/xiang90/probing
+  version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2
+- name: golang.org/x/crypto
+  version: 1351f936d976c60a0a48d728281922cf63eafb8d
+  repo: https://github.com/golang/crypto.git
+  vcs: git
+  subpackages:
+  - bcrypt
+  - blowfish
 - name: golang.org/x/net
   version: c8c74377599bd978aee1cf3b9b63a8634051cec2
   subpackages:
@@ -34,11 +173,15 @@ imports:
   - lex/httplex
   - trace
 - name: golang.org/x/sys
-  version: f7928cfef4d09d1b080aa2b6fd3ca9ba1567c733
+  version: 0f826bdd13b500be0f1d4004938ad978fcc6031e
+  repo: https://github.com/golang/sys.git
+  vcs: git
   subpackages:
   - unix
 - name: golang.org/x/text
-  version: 4e9ab9ee170f2a39bd66c92b3e0a47ff47a4bc77
+  version: 836efe42bb4aa16aaa17b9c155d8813d336ed720
+  repo: https://github.com/golang/text.git
+  vcs: git
   subpackages:
   - secure/bidirule
   - transform
@@ -58,4 +201,23 @@ imports:
   - stats
   - tap
   - transport
-testImports: []
+- name: gopkg.in/yaml.v2
+  version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b
+testImports:
+- name: github.com/davecgh/go-spew
+  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
+  subpackages:
+  - spew
+- name: github.com/docker/docker
+  version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e
+  subpackages:
+  - pkg/ioutils
+  - pkg/longpath
+- name: github.com/pmezard/go-difflib
+  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
+  subpackages:
+  - difflib
+- name: github.com/stretchr/testify
+  version: 05e8a0eda380579888eb53c394909df027f06991
+  subpackages:
+  - assert
diff --git a/go/glide.yaml b/go/glide.yaml
index 05c5d15ca22b6a3d85bee8e1f31d222034ce5314..bc23fa6ebf2c3db61e2d63e5f7e7ddcb595dfed0 100644
--- a/go/glide.yaml
+++ b/go/glide.yaml
@@ -6,7 +6,19 @@ import:
   subpackages:
   - clientv3
   - clientv3/concurrency
+  - embed
+  - etcdserver
 - package: github.com/namsral/flag
   version: ^1.7.4-pre
 - package: github.com/sirupsen/logrus
   version: ^1.0.0
+- package: github.com/topicai/candy
+- package: golang.org/x/crypto
+  vcs: git
+  repo: https://github.com/golang/crypto.git
+- package: golang.org/x/sys
+  vcs: git
+  repo: https://github.com/golang/sys.git
+- package: golang.org/x/text
+  vcs: git
+  repo: https://github.com/golang/text.git
diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..93efa4eaf7da8d502a17ec617823d08195c5e9ee
--- /dev/null
+++ b/go/master/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(master_test)
+endif()
diff --git a/go/master/c/CMakeLists.txt b/go/master/c/CMakeLists.txt
index d900850be04e3f385cc7fbf341ef0bb9fe53e789..082d9f3f597db14d0731e0292d3b66d92a49d6c1 100644
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
@@ -1 +1,15 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 go_library(paddle_master SHARED DEPS paddle_go_optimizer)
diff --git a/go/master/c/client.go b/go/master/c/client.go
index 31f431197454c2ec6a25624d37b60876d99f0087..b5759c30b1d7f7dc33e162e959c7de165e02e1da 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -1,13 +1,29 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main
 
 /*
 #include <stdlib.h>
 #include <string.h>
 #include <stdio.h>
-
 #define PADDLE_MASTER_OK    0
 #define PADDLE_MASTER_ERROR -1
 
+#define PADDLE_SAVE_MODEL_OK   1
+#define PADDLE_SAVE_MODEL_SKIP 0
+
 typedef int paddle_master_client;
 */
 import "C"
@@ -19,11 +35,9 @@ import (
 	"unsafe"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/coreos/etcd/clientv3"
 	log "github.com/sirupsen/logrus"
 )
 
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -52,32 +66,32 @@ func remove(client C.paddle_master_client) *master.Client {
 }
 
 //export paddle_new_etcd_master_client
+//
+// bufSize is the record buffer size.
 func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client {
 	p := C.GoString(etcdEndpoints)
-	cli, err := clientv3.New(clientv3.Config{
-		Endpoints:   strings.Split(p, ","),
-		DialTimeout: time.Second * time.Duration(timeout),
-	})
-	if err != nil {
-		panic(err)
-	}
-	ch := make(chan string, 1)
-	a, err := master.GetKey(cli, master.DefaultAddrPath, timeout)
+	endpoints := strings.Split(p, ",")
+	c, err := master.NewClient(
+		master.WithEtcd(endpoints, time.Duration(timeout)*time.Second),
+		master.WithBuffer(bufSize),
+	)
 	if err != nil {
 		panic(err)
 	}
-	ch <- a
-	go master.WatchKey(cli, master.DefaultAddrPath, ch)
-	c := master.NewClient(ch, bufSize)
+
 	return add(c)
 }
 
 //export paddle_new_master_client
+//
+// bufSize is the record buffer size.
 func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
 	a := C.GoString(addr)
-	ch := make(chan string, 1)
-	ch <- a
-	c := master.NewClient(ch, bufSize)
+	c, err := master.NewClient(master.WithAddr(a), master.WithBuffer(bufSize))
+	if err != nil {
+		panic(err)
+	}
+
 	return add(c)
 }
 
@@ -86,6 +100,12 @@ func paddle_release_master_client(client C.paddle_master_client) {
 	remove(client)
 }
 
+//export paddle_start_get_records
+func paddle_start_get_records(client C.paddle_master_client, pass C.int) {
+	c := get(client)
+	c.StartGetRecords(int(pass))
+}
+
 //export paddle_set_dataset
 func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
 	c := get(client)
@@ -104,23 +124,28 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	return C.PADDLE_MASTER_OK
 }
 
-// return value:
-//     0:ok
-//    -1:error
+// paddle_next_record gets the nexts training record.
+//
+// returns number of bytes of the records if success, -1 if failed, -2 if pass end.
+//
 //export paddle_next_record
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
 	r, err := c.NextRecord()
 	if err != nil {
-		// Error
-		// TODO: return the type of error?
-		*record = (*C.uchar)(nullPtr)
+		// NOTE: use errors to indicate pass ends
+		if err.Error() == master.ErrAllTaskFailed.Error() ||
+			err.Error() == master.ErrNoMoreAvailable.Error() ||
+			err.Error() == master.ErrPassBefore.Error() {
+			return -2
+		}
+		*record = (*C.uchar)(nil)
 		return -1
 	}
 
 	if len(r) == 0 {
 		// Empty record
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return 0
 	}
 
@@ -130,6 +155,29 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	return C.int(size)
 }
 
+// paddle_request_save_model requests the master server to approve the
+// caller to save the model.
+//
+// returns 1 if the save the model request is approved, 0 if the
+// request is rejected because other trainer is saving the model, -1
+// if error happened.
+//
+//export paddle_request_save_model
+func paddle_request_save_model(client C.paddle_master_client, trainerID string, blockMS int) C.int {
+	c := get(client)
+	need, err := c.RequestSaveModel(trainerID, time.Duration(blockMS)*time.Millisecond)
+	if err != nil {
+		log.Errorln(err)
+		return C.PADDLE_MASTER_ERROR
+	}
+
+	if need {
+		return C.PADDLE_SAVE_MODEL_OK
+	}
+
+	return C.PADDLE_SAVE_MODEL_SKIP
+}
+
 //export mem_free
 func mem_free(p unsafe.Pointer) {
 	// "free" may be a better name for this function, but doing so
diff --git a/go/master/client.go b/go/master/client.go
index 05383f1bf40c0e2b1f974e802ee8fd6aac109b00..62801b9b7fe85fe27147b12160f48d988623d547 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -1,17 +1,34 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import (
 	"os"
+	"time"
 
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
+	"github.com/coreos/etcd/clientv3"
 	log "github.com/sirupsen/logrus"
 )
 
 // Client is the client of the master server.
 type Client struct {
-	conn *connection.Conn
-	ch   chan record
+	conn    *connection.Conn
+	ch      chan record
+	bufSize int
 }
 
 type record struct {
@@ -19,33 +36,104 @@ type record struct {
 	err error
 }
 
-// NewClient creates a new Client.
+// WithBuffer sets the client to buffer the training record.
 //
 // bufSize is the record buffer size. NextRecord will read from this
 // buffer.
-func NewClient(addrCh <-chan string, bufSize int) *Client {
+func WithBuffer(bufSize int) func(*Client) error {
+	return func(c *Client) error {
+		if bufSize <= 0 {
+			return nil
+		}
+		c.bufSize = bufSize
+		return nil
+	}
+}
+
+// WithAddr sets the client to use fixed master address.
+func WithAddr(addr string) func(c *Client) error {
+	return func(c *Client) error {
+		ch := make(chan string, 1)
+		ch <- addr
+		go c.monitorMaster(ch)
+		return nil
+	}
+}
+
+// WithEtcd sets the client to use etcd for master discovery.
+func WithEtcd(endpoints []string, timeout time.Duration) func(*Client) error {
+	return func(c *Client) error {
+		cli, err := clientv3.New(clientv3.Config{
+			Endpoints:   endpoints,
+			DialTimeout: timeout,
+		})
+		if err != nil {
+			return err
+		}
+
+		ch := make(chan string, 1)
+		a, err := GetKey(cli, DefaultAddrPath, timeout)
+		if err != nil {
+			return err
+		}
+
+		if a != "" {
+			// Master is registered, send to the master address
+			// channel.
+			ch <- a
+		}
+
+		go watchKey(cli, DefaultAddrPath, ch)
+		go c.monitorMaster(ch)
+		return nil
+	}
+}
+
+// NewClient creates a new Client.
+func NewClient(opts ...func(*Client) error) (*Client, error) {
 	c := &Client{}
 	c.conn = connection.New()
-	c.ch = make(chan record, bufSize)
-	go c.monitorMaster(addrCh)
-	go c.getRecords()
-	return c
+
+	for _, opt := range opts {
+		err := opt(c)
+		if err != nil {
+			return nil, err
+		}
+	}
+	c.ch = make(chan record, c.bufSize)
+	// FIXME: connection is created asyncrosly in monitorMaster go routine,
+	//        ensure the connection is ready for use before calling c.addClient.
+	time.Sleep(time.Second)
+	return c, nil
 }
 
-func (c *Client) getRecords() {
+// StartGetRecords must be called at beginning of each pass
+func (c *Client) StartGetRecords(passID int) {
+	go c.getRecords(passID)
+}
+
+func (c *Client) getRecords(passID int) {
 	for {
-		t, err := c.getTask()
+		t, err := c.getTask(passID)
 		if err != nil {
-			// TODO(helin): wait before move on with next
-			// getTask call.
-			log.Errorln(err)
-			continue
+			if err.Error() == ErrPassBefore.Error() ||
+				err.Error() == ErrNoMoreAvailable.Error() ||
+				err.Error() == ErrAllTaskFailed.Error() {
+				c.ch <- record{nil, err}
+				break
+			}
+			if err.Error() == ErrPassAfter.Error() {
+				// wait util last pass finishes
+				time.Sleep(time.Second * 3)
+				continue
+			}
+			log.Errorf("getTask error: %s", err)
 		}
 
 		for _, chunk := range t.Chunks {
-			f, err := os.Open(chunk.Path)
-			if err != nil {
-				log.Errorln(err)
+			f, e := os.Open(chunk.Path)
+			if e != nil {
+				log.Errorln(e)
 				continue
 			}
 
@@ -68,7 +156,10 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.ID)
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Errorln(err)
+		}
 	}
 }
 
@@ -98,18 +189,21 @@ func (c *Client) monitorMaster(addrCh <-chan string) {
 	}
 }
 
-// SetDataset set dataset for the master server to dispatch.
+// SetDataset sets dataset to dispatch for the master server.
 //
-// SetDataset can be call multiple times from different nodes. But
-// only the first call will be honored.
+// SetDataset can be call multiple times at one pass. But only the first call
+// will be honored.
+//
+// After all tasks are done, another call of SetDataset will start another pass.
 func (c *Client) SetDataset(globPaths []string) error {
-	return c.conn.Call("Service.SetDataset", globPaths, nil)
+	err := c.conn.Call("Service.SetDataset", globPaths, nil)
+	return err
 }
 
 // getTask gets a new task from the master server.
-func (c *Client) getTask() (Task, error) {
+func (c *Client) getTask(passID int) (Task, error) {
 	var t Task
-	err := c.conn.Call("Service.GetTask", 0, &t)
+	err := c.conn.Call("Service.GetTask", passID, &t)
 	return t, err
 }
 
@@ -118,6 +212,11 @@ func (c *Client) taskFinished(taskID int) error {
 	return c.conn.Call("Service.TaskFinished", taskID, nil)
 }
 
+// TaskFailed tell the master server as task is failed.
+func (c *Client) taskFailed(meta TaskMeta) error {
+	return c.conn.Call("Service.TaskFailed", meta, nil)
+}
+
 // NextRecord returns next record in the dataset.
 //
 // NextRecord will block until the next record is available. It is
@@ -126,3 +225,11 @@ func (c *Client) NextRecord() ([]byte, error) {
 	r := <-c.ch
 	return r.r, r.err
 }
+
+// RequestSaveModel requests the master server to approve the caller
+// to save the model.
+func (c *Client) RequestSaveModel(trainerID string, blockDur time.Duration) (bool, error) {
+	var need bool
+	err := c.conn.Call("Service.RequestSaveModel", SaveModelRequest{TrainerID: trainerID, BlockDur: blockDur}, &need)
+	return need, err
+}
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
index 364dce7b58cf6366af711bde9107559a762563a4..d5f3d79464655540a29eaa6395057aa5795c4615 100644
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import (
@@ -40,22 +54,22 @@ func TestGetFinishTask(t *testing.T) {
 		panic(err)
 	}
 	go func(l net.Listener) {
-		s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
-		if err != nil {
-			panic(err)
+		s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1)
+		if sErr != nil {
+			panic(sErr)
 		}
 
 		server := rpc.NewServer()
-		err = server.Register(s)
-		if err != nil {
-			panic(err)
+		sErr = server.Register(s)
+		if sErr != nil {
+			panic(sErr)
 		}
 
 		mux := http.NewServeMux()
 		mux.Handle(rpc.DefaultRPCPath, server)
-		err = http.Serve(l, mux)
-		if err != nil {
-			panic(err)
+		sErr = http.Serve(l, mux)
+		if sErr != nil {
+			panic(sErr)
 		}
 	}(l)
 
@@ -66,11 +80,21 @@ func TestGetFinishTask(t *testing.T) {
 
 	for i := 0; i < totalTask*chunkPerTask; i++ {
 		w := recordio.NewWriter(f, -1, -1)
-		w.Write(nil)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
+
 		// call Close to force RecordIO writing a chunk.
-		w.Close()
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	f.Close()
 
 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
@@ -79,42 +103,56 @@ func TestGetFinishTask(t *testing.T) {
 	ch := make(chan string, 1)
 	ch <- addr
 	go c.monitorMaster(ch)
-	c.SetDataset([]string{path})
+
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
+
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {
-			task, err := c.getTask()
-			if err != nil {
-				t.Fatalf("Error: %v, pass: %d\n", err, i)
+			task, cErr := c.getTask(i)
+			if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
+				t.Fatalf("error: %v, pass: %d\n", cErr, i)
 			}
 			tasks = append(tasks, task)
 		}
 
-		_, err = c.getTask()
-		if err == nil {
+		// getting task before task finishes should return error
+		_, cErr := c.getTask(i)
+		if cErr == nil {
 			t.Fatalf("Should get error, pass: %d\n", i)
 		}
 
-		err = c.taskFinished(tasks[0].ID)
-		if err != nil {
-			t.Fatalf("Error: %v, pass: %d\n", err, i)
+		cErr = c.taskFinished(tasks[0].Meta.ID)
+		if cErr != nil {
+			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
 		}
+		// call taskFailed once won't put the task to failed queue, just ensure
+		// the call
+		cErr = c.taskFailed(tasks[0].Meta)
+		if cErr != nil {
+			t.Fatalf("Error: %v, pass: %d\n", cErr, i)
+		}
+
 		tasks = tasks[1:]
-		task, err := c.getTask()
-		if err != nil {
-			t.Fatal(err)
+		_, cErr = c.getTask(i)
+		if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() {
+			t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr)
 		}
-		tasks = append(tasks, task)
 
 		for _, task := range tasks {
-			err = c.taskFinished(task.ID)
-			if err != nil {
-				t.Fatalf("Error: %v, pass: %d\n", err, i)
+			cErr = c.taskFinished(task.Meta.ID)
+			if cErr != nil {
+				t.Fatal(cErr)
 			}
 		}
 	}
 
 	for i := 0; i < 10; i++ {
+		// init pass data
+		c.StartGetRecords(i)
 		checkOnePass(i)
 	}
 }
diff --git a/go/master/client_test.go b/go/master/client_test.go
index 6666d3860c412daa8711fbfa2d729a261b3fd887..79b9cc844d1ff938915a622bf19a7d772682becf 100644
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master_test
 
 import (
@@ -6,8 +20,10 @@ import (
 	"net/http"
 	"net/rpc"
 	"os"
+	"runtime"
 	"strconv"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 
@@ -15,6 +31,18 @@ import (
 	"github.com/PaddlePaddle/recordio"
 )
 
+// tool function for testing output goroutine ids
+func goid() int {
+	var buf [64]byte
+	n := runtime.Stack(buf[:], false)
+	idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0]
+	id, err := strconv.Atoi(idField)
+	if err != nil {
+		panic(fmt.Sprintf("cannot get goroutine id: %v", err))
+	}
+	return id
+}
+
 func TestNextRecord(t *testing.T) {
 	const (
 		path  = "/tmp/master_client_TestFull"
@@ -31,7 +59,7 @@ func TestNextRecord(t *testing.T) {
 		panic(err)
 	}
 	go func(l net.Listener) {
-		s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1)
+		s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1)
 		if err != nil {
 			panic(err)
 		}
@@ -55,32 +83,67 @@ func TestNextRecord(t *testing.T) {
 		panic(err)
 	}
 
-	w := recordio.NewWriter(f, -1, -1)
+	w := recordio.NewWriter(f, 1, -1)
 	for i := 0; i < total; i++ {
-		w.Write([]byte{byte(i)})
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	err = w.Close()
+	if err != nil {
+		panic(err)
 	}
-	w.Close()
-	f.Close()
-	curAddr := make(chan string, 1)
-	curAddr <- fmt.Sprintf(":%d", p)
-	c := master.NewClient(curAddr, 10)
-	c.SetDataset([]string{path})
-	for pass := 0; pass < 50; pass++ {
-		received := make(map[byte]bool)
-		for i := 0; i < total; i++ {
-			r, err := c.NextRecord()
-			if err != nil {
-				t.Fatal(pass, i, "Read error:", err)
-			}
 
-			if len(r) != 1 {
-				t.Fatal(pass, i, "Length should be 1.", r)
+	err = f.Close()
+	if err != nil {
+		panic(err)
+	}
+
+	// start several client to test task fetching
+	var wg sync.WaitGroup
+	for i := 0; i < 4; i++ {
+		wg.Add(1)
+		// test for multiple concurrent clients
+		go func() {
+			defer wg.Done()
+			// each go-routine needs a single client connection instance
+			c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1))
+			if e != nil {
+				t.Fatal(e)
 			}
+			e = c.SetDataset([]string{path})
+			if e != nil {
+				panic(e)
+			}
+			// test for n passes
+			for pass := 0; pass < 10; pass++ {
+				c.StartGetRecords(pass)
 
-			if received[r[0]] {
-				t.Fatal(pass, i, "Received duplicate.", received, r)
+				received := make(map[byte]bool)
+				taskid := 0
+				for {
+					r, e := c.NextRecord()
+					if e != nil {
+						// ErrorPassAfter will wait, else break for next pass
+						if e.Error() == master.ErrPassBefore.Error() ||
+							e.Error() == master.ErrNoMoreAvailable.Error() {
+							break
+						}
+						t.Fatal(pass, taskid, "Read error:", e)
+					}
+					if len(r) != 1 {
+						t.Fatal(pass, taskid, "Length should be 1.", r)
+					}
+					if received[r[0]] {
+						t.Fatal(pass, taskid, "Received duplicate.", received, r)
+					}
+					taskid++
+					received[r[0]] = true
+				}
 			}
-			received[r[0]] = true
-		}
+		}()
 	}
+	wg.Wait()
 }
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
index 04c1394e963d1eb541b80b91407fb55b0d1e1f2a..94848d887e8bc4b055a7c8b89b9b7f26a39229d1 100644
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import (
@@ -25,15 +39,12 @@ type EtcdClient struct {
 	statePath string
 	client    *clientv3.Client
 	lock      *concurrency.Mutex
+	sess      *concurrency.Session
 }
 
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
 	log.Debugf("Connecting to etcd at %v", endpoints)
-	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
-	// store holds a etcd lock, even though the lock will expire
-	// when the lease timeout, we need to implement graceful
-	// shutdown to release the lock.
 	cli, err := clientv3.New(clientv3.Config{
 		Endpoints:   endpoints,
 		DialTimeout: dialTimeout,
@@ -53,14 +64,14 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	// one master running, but split-brain problem may cause
 	// multiple master servers running), and the cluster management
 	// software will kill one of them.
-	log.Debugf("Trying to acquire lock at %s.", lockPath)
+	log.Infof("Trying to acquire lock at %s.", lockPath)
 	err = lock.Lock(context.TODO())
 	if err != nil {
 		return nil, err
 	}
-	log.Debugf("Successfully acquired lock at %s.", lockPath)
+	log.Infof("Successfully acquired lock at %s.", lockPath)
 
-	put := clientv3.OpPut(addrPath, string(addr))
+	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
 	if err != nil {
 		return nil, err
@@ -75,6 +86,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 		statePath: statePath,
 		client:    cli,
 		lock:      lock,
+		sess:      sess,
 	}
 
 	return e, nil
@@ -143,9 +155,24 @@ func (e *EtcdClient) Load() ([]byte, error) {
 	return state, nil
 }
 
+// Shutdown shuts down the etcd client gracefully.
+func (e *EtcdClient) Shutdown() error {
+	err := e.sess.Close()
+	newErr := e.client.Close()
+	if newErr != nil {
+		if err == nil {
+			err = newErr
+		} else {
+			log.Errorln(newErr)
+		}
+	}
+
+	return err
+}
+
 // GetKey gets the value by the specify key.
-func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	resp, err := c.Get(ctx, key)
 	cancel()
 	if err != nil {
@@ -159,8 +186,8 @@ func GetKey(c *clientv3.Client, key string, timeout int) (string, error) {
 	return string(v), nil
 }
 
-// WatchKey watches the specify key and send to valChan if there is some event.
-func WatchKey(c *clientv3.Client, key string, valChan chan<- string) {
+// watchKey watches the specify key and send to valChan if there is some event.
+func watchKey(c *clientv3.Client, key string, valChan chan<- string) {
 	rch := c.Watch(context.Background(), key)
 	for wresp := range rch {
 		for _, ev := range wresp.Events {
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
index bcd549b20e46381783bad11caa08cb7f4ba40add..a5bd2d4fe150cd34c699ccfae1f3d3e0fb2ef3d6 100644
--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -1,10 +1,24 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import "sync"
 
 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
 	mu  sync.Mutex
 	buf []byte
@@ -26,3 +40,8 @@ func (m *InMemStore) Load() ([]byte, error) {
 
 	return m.buf, nil
 }
+
+// Shutdown shuts down the in mem store.
+func (m *InMemStore) Shutdown() error {
+	return nil
+}
diff --git a/go/master/service.go b/go/master/service.go
index 58e68e744859933aa74cac231356d4ff9dfb8d7b..d30e9a33229c0aff354417771b5bf2ae6a781715 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import (
@@ -5,6 +19,7 @@ import (
 	"compress/gzip"
 	"encoding/gob"
 	"errors"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"sync"
@@ -19,10 +34,23 @@ const (
 	dialTimeout = 5 * time.Second
 )
 
+// ErrAllTaskFailed occur when tasks are in done or failed state.
+var ErrAllTaskFailed = errors.New("all task finished")
+
+// ErrNoMoreAvailable occur when no task in todo and yet not all done or fail.
+var ErrNoMoreAvailable = errors.New("no more available task")
+
+// ErrPassBefore client side pass number does not match with master counter.
+var ErrPassBefore = errors.New("pass number smaller than master")
+
+// ErrPassAfter client side pass number does not match with master counter.
+var ErrPassAfter = errors.New("pass number larger than master")
+
 // Store is the interface for save and load the master state.
 type Store interface {
 	Save([]byte) error
 	Load() ([]byte, error)
+	Shutdown() error
 }
 
 // Chunk is a chunk of data consisted of several data instances.
@@ -31,40 +59,56 @@ type Chunk struct {
 	Index recordio.Index // chunk index
 }
 
+// TaskMeta is a struct which stores task's meta info.
+type TaskMeta struct {
+	ID    int
+	Epoch int
+}
+
 // Task is the basic unit of data instances assigned to trainers.
 type Task struct {
-	ID     int
+	Meta   TaskMeta
 	Chunks []Chunk
 }
 
 type taskEntry struct {
-	Epoch      int
-	NumTimeout int
-	Task       Task
+	Task Task
+	// A task fails if it's timeout or trainer reports it exits unnormally.
+	NumFailure int
 }
 
 type taskQueues struct {
 	Todo    []taskEntry
 	Pending map[int]taskEntry // map from task ID to task entry
 	Done    []taskEntry
-	Failed  []Task
+	Failed  []taskEntry
 }
 
 // Service is the master server service.
 type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
-	timeoutMax    int
-	ready         chan struct{}
+	failureMax    int
 	store         Store
 
+	ready    chan struct{}
+	initDone bool
+
 	mu         sync.Mutex
-	initDone   bool
 	taskQueues taskQueues
+	currPass   int
+	jobTasks   []taskEntry
+
+	savingTrainer string
 }
 
 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
-	id := 0
+	// generate uniq id across job using nanosecond + randint + counter
+	// FIXME(typhoonzero): this is a workaround, use uuid
+	randStart := rand.Int()
+	counter := 0
+	timestamp := time.Now().Nanosecond()
+	id := timestamp + randStart + counter
 	if chunksPerTask <= 0 {
 		chunksPerTask = 1
 	}
@@ -73,8 +117,9 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	var cur taskEntry
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.ID = id
-			id++
+			cur.Task.Meta.ID = id
+			counter++
+			id = timestamp + randStart + counter
 			result = append(result, cur)
 			cur.Task.Chunks = nil
 		}
@@ -83,7 +128,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	}
 
 	if len(cur.Task.Chunks) > 0 {
-		cur.Task.ID = id
+		cur.Task.Meta.ID = id
 		result = append(result, cur)
 	}
 
@@ -91,11 +136,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 
 // NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
 	s := &Service{}
 	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
-	s.timeoutMax = timeoutMax
+	s.failureMax = failureMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
 	s.ready = make(chan struct{})
@@ -154,7 +199,7 @@ func (s *Service) recover() (bool, error) {
 
 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TOOD(helin): etcd request has a size limit, so the snapshot
+	// TODO(helin): etcd request has a size limit, so the snapshot
 	// size is limited by the max request size. We should either
 	// divide the snapshot into smaller chunks and save under
 	// different keys, or configure the request size to be big
@@ -209,6 +254,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}
 
 		count := index.NumChunks()
+		log.Infof("readChunks: file %s has %d chunks", path, count)
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path:  path,
@@ -225,7 +271,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 //
 // SetDataset can be call multiple times. But only the first call will
 // be honored.
-func (s *Service) SetDataset(globPaths []string, dummy *int) error {
+func (s *Service) SetDataset(globPaths []string, _ *int) error {
 	if len(globPaths) == 0 {
 		return errors.New("no dataset specified")
 	}
@@ -244,19 +290,49 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
 		return err
 	}
 
-	s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
+	s.jobTasks = partition(chunks, s.chunksPerTask)
+	s.taskQueues.Todo = s.jobTasks
 
 	err = s.snapshot()
 	if err != nil {
 		log.Errorln(err)
 		return err
 	}
-
 	close(s.ready)
 	s.initDone = true
 	return nil
 }
 
+// processFailedTask retry s.failureMax times for failed task.
+// return true if all task are done or failed.
+func (s *Service) processFailedTask(t taskEntry, epoch int) {
+	if t.Task.Meta.Epoch != epoch {
+		// new epoch, task launched after the
+		// schedule of this timeout check or failed status report.
+		return
+	}
+
+	defer func() {
+		err := s.snapshot()
+		if err != nil {
+			log.Errorln(err)
+		}
+	}()
+
+	delete(s.taskQueues.Pending, t.Task.Meta.ID)
+
+	t.NumFailure++
+	if t.NumFailure > s.failureMax {
+		log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+		s.taskQueues.Failed = append(s.taskQueues.Failed, t)
+		return
+	}
+
+	log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
+	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	return
+}
+
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 	return func() {
 		s.mu.Lock()
@@ -267,30 +343,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 			return
 		}
 
-		if t.Epoch != epoch {
-			// new epoch, task launched after the
-			// schedule of this timeout check.
-			return
-		}
-
-		defer func() {
-			err := s.snapshot()
-			if err != nil {
-				log.Errorln(err)
-			}
-		}()
-
-		delete(s.taskQueues.Pending, t.Task.ID)
-
-		t.NumTimeout++
-		if t.NumTimeout > s.timeoutMax {
-			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
-			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-			return
-		}
-
-		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
-		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+		s.processFailedTask(t, epoch)
 	}
 }
 
@@ -305,52 +358,45 @@ func (s *Service) logFields() log.Fields {
 }
 
 // GetTask gets a new task from the service.
-func (s *Service) GetTask(dummy int, task *Task) error {
+// passID is the client side pass count
+func (s *Service) GetTask(passID int, task *Task) error {
 	select {
 	case <-s.ready:
 	}
 
 	s.mu.Lock()
 	defer s.mu.Unlock()
+	if passID < s.currPass {
+		return ErrPassBefore
+	}
+	if passID > s.currPass {
+		// Client may get run to pass after master when one client faster than the
+		// other
+		return ErrPassAfter
+	}
 
 	if len(s.taskQueues.Todo) == 0 {
-		if len(s.taskQueues.Done) == 0 {
-			if len(s.taskQueues.Pending) == 0 {
-				err := errors.New("all task failed")
-				log.WithFields(s.logFields()).Warningln("All tasks failed.")
-				return err
-			}
-
-			// TODO(helin): client need to retry in this
-			// error case. Gotcha: RPC client can't
-			// compare returned error with predefined
-			// errors like io.EOF, because the error
-			// instance deserialized from RPC is a
-			// different instance than the error defined
-			// in package. So we need to figure out a way
-			// for client to check this error correctly.
-			err := errors.New("no more available task")
-			log.WithFields(s.logFields()).Warningln("No more available task.")
-			return err
+		if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 {
+			log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass")
+			return ErrAllTaskFailed
 		}
-		s.taskQueues.Todo = s.taskQueues.Done
-		s.taskQueues.Done = nil
-		log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
+		log.WithFields(s.logFields()).Warningln("No more available task.")
+		return ErrNoMoreAvailable
 	}
 
 	t := s.taskQueues.Todo[0]
-	t.Epoch++
+	t.Task.Meta.Epoch++
 	s.taskQueues.Todo = s.taskQueues.Todo[1:]
-	s.taskQueues.Pending[t.Task.ID] = t
+	s.taskQueues.Pending[t.Task.Meta.ID] = t
 	err := s.snapshot()
 	if err != nil {
 		return err
 	}
 
 	*task = t.Task
-	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+	log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)
 
-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	return nil
 }
 
@@ -365,22 +411,24 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 
 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		err := errors.New("pending task not found")
 		log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
-		return err
+		return nil
 	}
 
 	// task finished, reset timeout
-	t.NumTimeout = 0
+	t.NumFailure = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
 
 	log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
-
-	if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
-		log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
-		s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
-		s.taskQueues.Done = nil
+	if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 {
+		// increase master side pass count if all tasks finished
+		s.currPass++
+		s.taskQueues.Todo = s.jobTasks
+		s.taskQueues.Done = []taskEntry{}
+		// TODO(typhoonzero): deal with failed tasks
+		s.taskQueues.Failed = []taskEntry{}
+		log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass)
 	}
 
 	err := s.snapshot()
@@ -389,3 +437,61 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	}
 	return err
 }
+
+// TaskFailed tells the service that a task is failed.
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+	select {
+	case <-s.ready:
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	t, ok := s.taskQueues.Pending[meta.ID]
+	if !ok {
+		log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta)
+		return nil
+	}
+
+	s.processFailedTask(t, meta.Epoch)
+	return nil
+}
+
+// SaveModelRequest is the request for saving model
+type SaveModelRequest struct {
+	TrainerID string
+	BlockDur  time.Duration
+}
+
+// RequestSaveModel requests the master server to approve the caller
+// to save the model.
+func (s *Service) RequestSaveModel(req SaveModelRequest, need *bool) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	if req.TrainerID == "" {
+		return errors.New("trainer id is empty")
+	}
+
+	if s.savingTrainer == "" {
+		*need = true
+	} else {
+		if req.TrainerID == s.savingTrainer {
+			// save trainer asked to save model again
+			*need = true
+		} else {
+			*need = false
+		}
+	}
+
+	if *need {
+		s.savingTrainer = req.TrainerID
+		time.AfterFunc(req.BlockDur, func() {
+			s.mu.Lock()
+			s.savingTrainer = ""
+			s.mu.Unlock()
+		})
+	}
+
+	return nil
+}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
index bc435b505c014ca13ed5fc16b33a21ebb089a3b7..bd1a939a55553b558181d91a757c487d0f97b40b 100644
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master
 
 import "testing"
@@ -30,7 +44,8 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.ID != i {
+		// test auto increament ids
+		if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 {
 			t.Error(ts[i], i)
 		}
 	}
diff --git a/go/master/service_test.go b/go/master/service_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..5f91910ecc8cf32289e71e2e41e8b283acc115e6
--- /dev/null
+++ b/go/master/service_test.go
@@ -0,0 +1,68 @@
+package master_test
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/embed"
+	"github.com/docker/docker/pkg/ioutils"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNewServiceWithEtcd(t *testing.T) {
+	// setup an embed etcd server
+	etcdDir, err := ioutils.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+	select {
+	case <-e.Server.ReadyNotify():
+		t.Log("Server is ready!")
+	case <-time.After(60 * time.Second):
+		e.Server.Stop() // trigger a shutdown
+		t.Fatal("Server took too long to start!")
+	}
+
+	ep := []string{"127.0.0.1:2379"}
+	masterAddr := "127.0.0.1:3306"
+	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = master.NewService(store, 10, 10, 3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   ep,
+		DialTimeout: 3 * time.Second,
+	})
+	if err != nil {
+		t.Fatal(err)
+	}
+	v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if err := cli.Close(); err != nil {
+		t.Fatal(err)
+	}
+	// test master process registry itself into etcd server.
+	assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.")
+}
diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4fe0a8cb021e8dbf443c8f33bfb046e228a2fd8d
--- /dev/null
+++ b/go/pserver/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(pserver_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e295611060043a77bb1f19fc7053beddc9fbc327
--- /dev/null
+++ b/go/pserver/client/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(pserver_client_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4bf05c85386dfcef83453a663dffc5d62efcbcc0
--- /dev/null
+++ b/go/pserver/client/c/.gitignore
@@ -0,0 +1 @@
+libpaddle_go_optimizer.a
diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt
index 93a0a27f858f8654e0a6114abe7e326b086b8bf9..a932791c7cb003dc812de4eed923e7c10b25c363 100644
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
@@ -1,5 +1,27 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
 target_link_libraries(paddle_go_optimizer stdc++ m)
+
+# Copy library to the required place.
+# See: go/pserver/optimizer.go:
+# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+add_custom_command(TARGET paddle_go_optimizer POST_BUILD
+  COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}"
+  )
+
 go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
 if(WITH_TESTING)
   # FIXME: this test requires pserver which is not managed by the test
diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go
index 7ddaceb7ed33db32e19a191402100a0c0efa241a..14ad0774550f6e5a5d8610d6007904cd2820432c 100644
--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main
 
 /*
@@ -34,7 +48,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -42,10 +55,10 @@ var curHandle C.paddle_pserver_client
 func add(c *client.Client) C.paddle_pserver_client {
 	mu.Lock()
 	defer mu.Unlock()
-	client := curHandle
+	cli := curHandle
 	curHandle++
-	handleMap[client] = c
-	return client
+	handleMap[cli] = c
+	return cli
 }
 
 func get(client C.paddle_pserver_client) *client.Client {
@@ -63,7 +76,7 @@ func remove(client C.paddle_pserver_client) *client.Client {
 }
 
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
 
@@ -101,11 +114,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }
 
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
 	// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
-	addr := C.GoString(etcd_endpoints)
-	etcd_client := client.NewEtcd(addr)
-	c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
+	addr := C.GoString(etcdEndpoints)
+	etcdClient := client.NewEtcd(addr)
+	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
 	return add(c)
 }
 
@@ -114,30 +127,36 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) {
 	remove(client)
 }
 
+// paddle_begin_init_params tells trainer if it needs to init the
+// parameters.
+//
+// returns 1 if the trainer needs to init the parameters. 0 if the
+// trainer does not need to init the parameters.
+//
 //export paddle_begin_init_params
 func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	if selected := c.BeginInitParams(); selected {
 		return 1
 	}
-	return C.PSERVER_OK
+	return 0
 }
 
 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
 	pc := pserver.ParameterWithConfig{
 		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
+		Config: cArrayToSlice(paramConfig, int(configLen)),
 	}
 	c := get(client)
 	err := c.InitParam(pc)
 
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -153,7 +172,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}
 
@@ -223,12 +242,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
 
-		if unsafe.Pointer(param) == nullPtr {
+		if unsafe.Pointer(param) == nil {
 			log.Errorln("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}
 
-		if unsafe.Pointer(param.content) != nullPtr {
+		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
 				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
 				return C.PSERVER_ERROR
@@ -243,17 +262,4 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 	return C.PSERVER_OK
 }
 
-//export paddle_save_model
-func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
-	p := C.GoString(path)
-	c := get(client)
-	err := c.Save(p)
-	if err != nil {
-		log.Errorln(err)
-		return C.PSERVER_ERROR
-	}
-
-	return C.PSERVER_OK
-}
-
 func main() {} // Required but ignored
diff --git a/go/pserver/client/c/test/CMakeLists.txt b/go/pserver/client/c/test/CMakeLists.txt
index dce8645ce753f6a14b298726c714be18de3834e4..3724ccb60b72bf446058c26c7ad3ee4d23f1ccce 100644
--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
@@ -1,2 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
 add_style_check_target(test_cclient test_cclient.c)
diff --git a/go/pserver/client/c/test/test_cclient.c b/go/pserver/client/c/test/test_cclient.c
index 8eababbe33914d25f1eb91b991e11eaacd2e4716..89c4d7f00aae2a92ae30ba7b4305550d150dd985 100644
--- a/go/pserver/client/c/test/test_cclient.c
+++ b/go/pserver/client/c/test/test_cclient.c
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <stdio.h>
 #include <stdlib.h>
 
@@ -97,9 +111,5 @@ retry:
     getParams(c);
   }
 
-  if (paddle_save_model(c, "/tmp/")) {
-    fail();
-  }
-
   return 0;
 }
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
index 68e1d9b269209b695e27f91a656dc2d8e527b4cd..572a61e4ccaa9ef3d03a60d916e80eab907c6d88 100644
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -1,5 +1,13 @@
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+from paddle.v2.reader.creator import cloud_reader
+
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoints = "http://" + etcd_ip + ":2379"
+print "etcd endpoints: ", etcd_endpoints
 
 
 def main():
@@ -9,30 +17,33 @@ def main():
     # network config
     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
     y_predict = paddle.layer.fc(input=x,
-                                param_attr=paddle.attr.Param(name='w'),
+                                param_attr=paddle.attr.Param(
+                                    name='w', learning_rate=1e-3),
                                 size=1,
                                 act=paddle.activation.Linear(),
-                                bias_attr=paddle.attr.Param(name='b'))
+                                bias_attr=paddle.attr.Param(
+                                    name='b', learning_rate=1e-3))
     y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
     cost = paddle.layer.mse_cost(input=y_predict, label=y)
 
     # create parameters
     parameters = paddle.parameters.create(cost)
 
-    # create optimizer
-    optimizer = paddle.optimizer.Momentum(momentum=0)
-
-    #TODO(zhihong) : replace optimizer with new OptimizerConfig
+    # create optimizer of new remote updater to pserver
+    optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3)
 
     trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer,
                                  is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoints,
+                                 use_etcd=True)
 
     # event_handler to print training and testing info
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
+            # FIXME: for cloud data reader, pass number is managed by master
+            # should print the server side pass number
             if event.batch_id % 100 == 0:
                 print "Pass %d, Batch %d, Cost %f" % (
                     event.pass_id, event.batch_id, event.cost)
@@ -47,10 +58,14 @@ def main():
                 print "Test %d, %.2f" % (event.pass_id, result.cost)
 
     # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
     trainer.train(
         reader=paddle.batch(
             paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
+                cloud_reader(
+                    ["/pfs/dlnel/public/dataset/uci_housing/uci_housing*"],
+                    etcd_endpoints),
+                buf_size=500),
             batch_size=2),
         feeding={'x': 0,
                  'y': 1},
diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go
index aa8bfe30c26fcc0875ad479ecd562700ccefa5a3..15adda4735b022c16cb22715fb690b3740e58b76 100644
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package client
 
 import (
@@ -205,35 +219,9 @@ func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
 	return ps, nil
 }
 
-// Save indicates parameters to save the parameter to the given path.
-func (c *Client) Save(path string) error {
-	errCh := make(chan error, len(c.pservers))
-
-	for _, p := range c.pservers {
-		err := p.Call("Service.Save", path, nil)
-		errCh <- err
-	}
-
-	recv := 0
-	for err := range errCh {
-		if err != nil {
-			return err
-		}
-
-		recv++
-		if recv == len(c.pservers) {
-			break
-		}
-	}
-
-	// TODO(helin): there will be many files under path, need to
-	// merge them into a single file.
-	return nil
-}
-
 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }
 
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
index 29b400812c9dc3a5f44700eacbf7ba043248f2f2..b630d434dca283df67f5b850b35057870fe27529 100644
--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -1,13 +1,29 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package client_test
 
 import (
 	"context"
 	"io/ioutil"
+	"math/rand"
 	"net"
 	"net/http"
 	"net/rpc"
 	"strconv"
 	"strings"
+	"sync"
 	"testing"
 	"time"
 
@@ -42,7 +58,8 @@ func initClient() [numPserver]int {
 		ports[i] = p
 
 		go func(l net.Listener) {
-			s, err := pserver.NewService(0)
+			var cp pserver.Checkpoint
+			s, err := pserver.NewService(0, 1, "", nil, cp)
 			if err != nil {
 				panic(err)
 			}
@@ -76,15 +93,33 @@ func initEtcdClient() {
 		log.Errorf("err %v", err)
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	client.Delete(ctx, pserver.PsDesired)
-	client.Delete(ctx, pserver.PsPath)
-	client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	_, err = client.Delete(ctx, pserver.PsDesired)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = client.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
+
 	ports := initClient()
 	for i := 0; i < numPserver; i++ {
-		client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
 	}
 	cancel()
-	client.Close()
+	err = client.Close()
+	if err != nil {
+		panic(err)
+	}
 }
 
 type selector bool
@@ -99,27 +134,34 @@ func (l lister) List() []client.Server {
 	return l
 }
 
-func ClientTest(t *testing.T, c *client.Client) {
+func testClient(t *testing.T, c *client.Client) {
 	selected := c.BeginInitParams()
 	if !selected {
 		t.Fatal("should be selected.")
 	}
 
-	const numParameter = 100
+	const numParameter = 1000
 	config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
 	if err != nil {
 		t.Fatalf("read optimizer proto failed")
 	}
+
+	var wg sync.WaitGroup
 	for i := 0; i < numParameter; i++ {
-		var p pserver.Parameter
-		p.Name = "p_" + strconv.Itoa(i)
-		p.ElementType = pserver.Float32
-		p.Content = make([]byte, (i+1)*100)
-		err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
-		if err != nil {
-			t.Fatal(err)
-		}
+		wg.Add(1)
+		go func(i int) {
+			var p pserver.Parameter
+			p.Name = "p_" + strconv.Itoa(i)
+			p.ElementType = pserver.Float32
+			p.Content = make([]byte, (i+1)*100)
+			err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
+			if err != nil {
+				t.Fatal(err)
+			}
+			wg.Done()
+		}(i)
 	}
+	wg.Wait()
 
 	err = c.FinishInitParams()
 	if err != nil {
@@ -127,7 +169,7 @@ func ClientTest(t *testing.T, c *client.Client) {
 	}
 
 	var grads []pserver.Gradient
-	for i := 0; i < numParameter/2; i++ {
+	for i := 0; i < numParameter; i++ {
 		var g pserver.Gradient
 		g.Name = "p_" + strconv.Itoa(i)
 		g.ElementType = pserver.Float32
@@ -135,9 +177,31 @@ func ClientTest(t *testing.T, c *client.Client) {
 		grads = append(grads, g)
 	}
 
-	err = c.SendGrads(grads)
-	if err != nil {
-		t.Fatal(err)
+	const paramPerGroup = 10
+	const numGroups = numParameter / paramPerGroup
+
+	// shuffle send grads order
+	for i := range grads {
+		j := rand.Intn(i + 1)
+		grads[i], grads[j] = grads[j], grads[i]
+	}
+
+	for i := 0; i < numGroups; i++ {
+		var gs []pserver.Gradient
+		if i == numGroups-1 {
+			gs = grads[i*paramPerGroup:]
+		} else {
+			gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
+
+		wg.Add(1)
+		go func(gs []pserver.Gradient) {
+			err := c.SendGrads(gs)
+			if err != nil {
+				t.Fatal(err)
+			}
+			wg.Done()
+		}(gs)
 	}
 
 	names := make([]string, numParameter)
@@ -145,20 +209,35 @@ func ClientTest(t *testing.T, c *client.Client) {
 		names[i] = "p_" + strconv.Itoa(i)
 	}
 
-	params, err := c.GetParams(names)
-	if err != nil {
-		t.Fatal(err)
-	}
+	for i := 0; i < numGroups; i++ {
+		var ns []string
+		if i == numGroups-1 {
+			ns = names[i*paramPerGroup:]
+		} else {
+			ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
 
-	if len(names) != len(params) {
-		t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-	}
+		wg.Add(1)
+		go func(ns []string) {
+			params, err := c.GetParams(ns)
+			if err != nil {
+				t.Fatal(err)
+			}
 
-	for i := range params {
-		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
-		}
+			if len(ns) != len(params) {
+				t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
+			}
+
+			for i := range params {
+				if ns[i] != params[i].Name {
+					t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name)
+				}
+			}
+			wg.Done()
+		}(ns)
 	}
+
+	wg.Wait()
 }
 
 func TestNativeClient(t *testing.T) {
@@ -168,13 +247,14 @@ func TestNativeClient(t *testing.T) {
 		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
 	}
 	c1 := client.NewClient(lister(servers), len(servers), selector(true))
-	ClientTest(t, c1)
+	testClient(t, c1)
 }
 
-// TODO: tmperary disable etcdClient test for dependency of etcd)
+// EtcdClient is a disabled test, since we have not embedded etcd into
+// our test.
 func EtcdClient(t *testing.T) {
 	initEtcdClient()
-	etcd_client := client.NewEtcd(etcdEndpoints)
-	c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true))
-	ClientTest(t, c2)
+	etcdClient := client.NewEtcd(etcdEndpoints)
+	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
+	testClient(t, c2)
 }
diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go
index 1fd3479aa88ccbbe7c5067da1e9886b65352e847..b6ff1fec8a6f37f61f38cb5d004b1d2c886473ed 100644
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package client
 
 import (
@@ -12,7 +26,7 @@ import (
 )
 
 const (
-	DefaultEtcdTimeout time.Duration = 5 * time.Second
+	defaultEtcdTimeout time.Duration = 5 * time.Second
 )
 
 // EtcdClient is used by pserver client that is a part of trainer process.
@@ -47,7 +61,7 @@ func (p *EtcdClient) Desired() int {
 
 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
-			log.Errorf("psDesired %s invalid %v", psDesired, err)
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
 			time.Sleep(p.timeout)
 			continue
 		}
@@ -66,10 +80,10 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
-			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
+			cancel()
 			if err != nil {
 				log.Infof("Get psKey= %s error, %v", psKey, err)
 				time.Sleep(p.timeout)
@@ -106,11 +120,11 @@ func NewEtcd(endpoints string) *EtcdClient {
 	for {
 		cli, err = clientv3.New(clientv3.Config{
 			Endpoints:   ep,
-			DialTimeout: DefaultEtcdTimeout,
+			DialTimeout: defaultEtcdTimeout,
 		})
 		if err != nil {
 			log.Errorf("Init etcd connection failed: %v", err)
-			time.Sleep(DefaultEtcdTimeout)
+			time.Sleep(defaultEtcdTimeout)
 			continue
 		}
 		break
@@ -118,7 +132,7 @@ func NewEtcd(endpoints string) *EtcdClient {
 	log.Infof("Connected to etcd: %s\n", endpoints)
 	client := &EtcdClient{
 		client:    cli,
-		timeout:   DefaultEtcdTimeout,
+		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
index 37b8d522c1bd07acb41b9515a6d9bc15eae9aa32..4fb26307667295ab825d07be6c3d1d4b33f6eb8b 100644
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
 import (
@@ -16,18 +30,23 @@ import (
 const (
 	// PsDesired is etcd path for store desired pserver count
 	PsDesired = "/ps_desired"
-	// PsAddr is the base dir for pserver to store their addr
+	// PsPath is the base dir for pserver to store their addr
 	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path for store checkpoints information
+	PsCheckpoint = "/checkpoints/"
+
+	retryTimeout = 5 * time.Second
 )
 
 // EtcdClient is the etcd client that the pserver uses for fault
 // tolerance, service registry and coordination.
 type EtcdClient struct {
-	numPservers   int
-	etcdEndpoints string
-	etcdClient    *clientv3.Client
-	// etcdTimeout is also used as retry intervals.
-	etcdTimeout time.Duration
+	numPservers int
+	endpoints   string
+	client      *clientv3.Client
+	sess        *concurrency.Session
+	dialTimeout time.Duration
+	ttlSec      int
 	// FIXME: ensure GetExternalIP gets the correct ip for trainers to connect.
 	externalIP string
 	// desired number of pservers in the job.
@@ -36,19 +55,19 @@ type EtcdClient struct {
 }
 
 // NewEtcdClient creates an EtcdClient
-func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient {
+func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient {
 	return &EtcdClient{
-		etcdTimeout:   timeout,
-		numPservers:   numPservers,
-		etcdEndpoints: endpoints,
+		dialTimeout: dialtimeout,
+		ttlSec:      ttlSec,
+		numPservers: numPservers,
+		endpoints:   endpoints,
 	}
 }
 
 // Register registers the pserver on etcd
 //
 // Register returns the index of the current pserver.
-func (e *EtcdClient) Register() (int, error) {
-
+func (e *EtcdClient) Register(port int) (int, error) {
 	var err error
 	e.externalIP, err = networkhelper.GetExternalIP()
 	if err != nil {
@@ -56,19 +75,26 @@ func (e *EtcdClient) Register() (int, error) {
 	}
 
 	// initialize connection to etcd.
-	ep := strings.Split(e.etcdEndpoints, ",")
+	ep := strings.Split(e.endpoints, ",")
 	for {
 		cli, err := clientv3.New(clientv3.Config{
 			Endpoints:   ep,
-			DialTimeout: e.etcdTimeout,
+			DialTimeout: e.dialTimeout,
 		})
 		if err != nil {
 			log.Errorf("connect to etcd error: %v", err)
-			time.Sleep(e.etcdTimeout)
+			time.Sleep(retryTimeout)
 			continue
 		}
-		e.etcdClient = cli
-		log.Debugf("inited client to %s", e.etcdEndpoints)
+		e.client = cli
+		sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec))
+		if err != nil {
+			log.Errorf("create etcd session error: %v", err)
+			time.Sleep(retryTimeout)
+			continue
+		}
+		e.sess = sess
+		log.Debugf("inited client to %s", e.endpoints)
 		break
 	}
 	// init /ps_desired using transaction, for multiple pservers may want to write
@@ -79,7 +105,7 @@ func (e *EtcdClient) Register() (int, error) {
 		cancel()
 		if err != nil {
 			log.Warn(err)
-			time.Sleep(e.etcdTimeout)
+			time.Sleep(retryTimeout)
 			continue
 		}
 		break
@@ -90,18 +116,18 @@ func (e *EtcdClient) Register() (int, error) {
 	// wait and set s.desired init value
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		resp, err := e.etcdClient.Get(ctx, PsDesired)
+		resp, err := e.client.Get(ctx, PsDesired)
 		cancel()
 		if err != nil {
 			log.Errorf("getting %s error: %v", PsDesired, err)
-			time.Sleep(e.etcdTimeout)
+			time.Sleep(retryTimeout)
 			continue
 		}
 		if len(resp.Kvs) != 0 {
 			e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 			if err != nil {
 				log.Errorf("value of %s invalid %v\n", PsDesired, err)
-				time.Sleep(e.etcdTimeout)
+				time.Sleep(retryTimeout)
 				// NOTE: wait util ps_desired value change
 				continue
 			}
@@ -114,11 +140,11 @@ func (e *EtcdClient) Register() (int, error) {
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx)
+		pserverIdx, err = e.registerPserverEtcd(ctx, port)
 		cancel()
 		if err != nil {
 			log.Warn(err)
-			time.Sleep(e.etcdTimeout)
+			time.Sleep(retryTimeout)
 			continue
 		}
 		break
@@ -128,19 +154,19 @@ func (e *EtcdClient) Register() (int, error) {
 }
 
 func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
-	return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
+	return concurrency.NewSTM(e.client, func(c concurrency.STM) error {
 		dsStr := c.Get(PsDesired)
 		if dsStr == "" {
-			c.Put(PsDesired, strconv.Itoa(numPservers))
+			c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease()))
 		}
 		return nil
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
 }
 
 // registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
 	var idx int
-	_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
+	_, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error {
 		registered := false
 		for i := 0; i < e.desired; i++ {
 			psKey := PsPath + strconv.Itoa(i)
@@ -149,35 +175,20 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 			log.Debugf("got value (%s) for key: %s", ps, psKey)
 
 			if ps == "" {
-				resp, err := e.etcdClient.Grant(context.TODO(), 5)
-				if err != nil {
-					log.Fatal(err)
-				}
 				// find the first id and write info
-				c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
-				log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
-				ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
-				if kaerr != nil {
-					log.Errorf("keepalive etcd node error: %v", kaerr)
-					return kaerr
-				}
-
-				// Eat the keep alive message so etcd
-				// will not expire the lease.
-				go func(ch <-chan *clientv3.LeaseKeepAliveResponse) {
-					ka := <-ch
-					log.Debugf("keepalive: %d\n", ka.TTL)
-				}(ch)
+				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
+				c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease()))
+				log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
 				log.Debug("register finished")
 				idx = i
 				registered = true
 				break
 			}
 		}
-		if registered == true {
+		if registered {
 			return nil
 		}
-		return errors.New("not registerd, may due to already have enough pservers")
+		return errors.New("not registered, may due to already have enough pservers")
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
 
 	if err != nil {
@@ -186,3 +197,47 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 
 	return idx, nil
 }
+
+// GetKey gets the value by the specified key
+func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	resp, err := e.client.Get(ctx, key)
+	cancel()
+	if err != nil {
+		return []byte{}, err
+	}
+	kvs := resp.Kvs
+	if len(kvs) == 0 {
+		return []byte{}, nil
+	}
+	v := kvs[0].Value
+	return v, nil
+}
+
+// PutKey put into etcd with value by key specified
+func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	_, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease()))
+	cancel()
+	return err
+}
+
+// Shutdown shuts down the etcd client gracefully.
+func (e *EtcdClient) Shutdown() error {
+	var err error
+	if e.sess != nil {
+		err = e.sess.Close()
+	}
+
+	if e.client != nil {
+		newErr := e.client.Close()
+		if newErr != nil {
+			if err != nil {
+				log.Errorln(newErr)
+			} else {
+				err = newErr
+			}
+		}
+	}
+	return err
+}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index 54d108209402c27e79a9948f60ecbdadeffc7d9b..709160d45d98b6cf6d60f52ceb3fb33e0a0bd17d 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -1,8 +1,21 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
 // #cgo CFLAGS: -I ../../
-// //FIXME: ldflags contain "build" path
-// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
 // #include "paddle/optimizer/optimizer.h"
 // #include <stdlib.h>
 // #include <string.h>
@@ -15,15 +28,14 @@ import (
 	log "github.com/sirupsen/logrus"
 )
 
-var nullPtr = unsafe.Pointer(uintptr(0))
-
 type optimizer struct {
 	opt         *C.struct_paddle_optimizer
 	elementType ElementType
+	contentLen  int
 }
 
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
 
@@ -35,22 +47,31 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 	return (*[1 << 30]byte)(p)[:len:len]
 }
 
-func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
 	o.elementType = paramWithConfigs.Param.ElementType
+	o.contentLen = len(paramWithConfigs.Param.Content)
 	p := paramWithConfigs.Param
 	c := paramWithConfigs.Config
+	s := State
+	paramBufferSize := C.size_t(len(p.Content))
 	log.WithFields(log.Fields{
 		"ElementType": p.ElementType,
-		"ParamSize":   len(p.Content),
+		"ParamSize":   paramBufferSize,
 		"ConfigSize":  len(c),
+		"StateSize":   len(s),
 	}).Info("New Optimizer Created with config:")
 	var cbuffer unsafe.Pointer
-	cbuffer = C.malloc(C.size_t(len(p.Content)))
-	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
+	cbuffer = C.malloc(paramBufferSize)
+
+	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
+
 	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
-		C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float),
-		(*C.char)(nullPtr), 0)
+		C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
 	return o
 }
 
@@ -60,12 +81,22 @@ func (o *optimizer) GetWeights() []byte {
 	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }
 
+func (o *optimizer) GetStates() []byte {
+	var cbuffer *C.char
+	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
+	return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+}
+
 func (o *optimizer) UpdateParameter(g Gradient) error {
 	if o.elementType != g.ElementType {
 		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
 	}
 
-	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float)
+	if o.contentLen != len(g.Content) {
+		return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
+	}
+
+	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
 	if r != 0 {
 		return fmt.Errorf("optimizer update returned error code: %d", r)
 	}
@@ -73,8 +104,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
 }
 
 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
index 0b2f4cfa41a630645c128ac13826de9d8b1d521b..d001e6993e6aed2f5829c1b86928af30f4900c8a 100644
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
 import (
@@ -19,6 +33,6 @@ func TestOptimizerCreateRelease(t *testing.T) {
 		Param:  p,
 		Config: config,
 	}
-	o := newOptimizer(param)
+	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
diff --git a/go/pserver/service.go b/go/pserver/service.go
index ad16a5708d10bdcb5189a1e1e8abf13c54a72265..7d297c46d03bf78d18ca9830a318968397119d3e 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -1,22 +1,53 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver
 
 import (
+	"bufio"
+	"bytes"
+	"crypto/md5"
+	"encoding/gob"
+	"encoding/hex"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
 	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
 )
 
 // ElementType is the type of elements of a Parameter.
 type ElementType int
 
+// ErrCheckpointNotFound indicates that the pserver checkpoint could
+// not be found.
+var ErrCheckpointNotFound = errors.New("checkpoint not found")
+
+// RPC error message.
 const (
-	// AlreadyInitialized is true if pserver is initialized
-	AlreadyInitialized = "pserver already initialized"
-	// Uninitialized is true if pserver not fully initialized
-	Uninitialized = "pserver not fully initialized"
+	AlreadyInitialized  = "pserver already initialized"
+	Uninitialized       = "pserver not fully initialized"
+	CheckpointMD5Failed = "checkpoint file MD5 validation failed"
 )
 
-// Supported element types
+// Supported element types.
 const (
 	Int32 ElementType = iota
 	UInt32
@@ -39,31 +70,101 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }
 
+// checkpointMeta saves checkpoint metadata
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	MD5       string `json:"md5"`
+	Timestamp int64  `json:"timestamp"`
+}
+
+// Checkpoint is the pserver shard persist in file
+type Checkpoint []parameterCheckpoint
+
 // Gradient is the gradient of the parameter.
 type Gradient Parameter
 
 // Service is the RPC service for pserver.
 type Service struct {
-	initialized chan struct{}
-	idx         int
+	initialized        chan struct{}
+	idx                int
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             *EtcdClient
+	mu                 sync.Mutex
+	optMap             map[string]*optimizer
+}
+
+// parameterCheckpoint saves parameter checkpoint
+type parameterCheckpoint struct {
+	ParameterWithConfig
+	State []byte
+}
+
+// NewCheckpointFromFile loads parameters and state from checkpoint file
+func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) {
+	v, err := e.GetKey(PsPath+string(idx), 3*time.Second)
+	if err != nil {
+		return nil, err
+	}
+
+	if len(v) == 0 {
+		return nil, ErrCheckpointNotFound
+	}
+
+	var cpMeta checkpointMeta
+	if err = json.Unmarshal(v, &cpMeta); err != nil {
+		return nil, err
+	}
+
+	fn := filepath.Join(cpPath, cpMeta.UUID)
+	if _, err = os.Stat(fn); os.IsNotExist(err) {
+		return nil, err
+	}
+	content, err := ioutil.ReadFile(fn)
+	if err != nil {
+		return nil, err
+	}
+
+	h := md5.New()
+	md5 := hex.EncodeToString(h.Sum(content))
+	if md5 != cpMeta.MD5 {
+		return nil, errors.New(CheckpointMD5Failed)
+	}
 
-	mu     sync.Mutex
-	optMap map[string]*optimizer
+	dec := gob.NewDecoder(bytes.NewReader(content))
+	cp := Checkpoint{}
+	if err = dec.Decode(cp); err != nil {
+		return nil, err
+	}
+	return cp, nil
 }
 
 // NewService creates a new service, will bypass etcd registration if no
-// endpoints specified.
-func NewService(idx int) (*Service, error) {
+// endpoints specified. It will recovery from checkpoint file if a exists a specified checkpoint.
+func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
 	s := &Service{
-		idx: idx,
+		idx:                idx,
+		checkpointInterval: interval,
+		checkpointPath:     path,
+		client:             client,
 	}
 	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
+
+	if cp != nil {
+		for _, item := range cp {
+			p := ParameterWithConfig{
+				Param:  item.Param,
+				Config: item.Config,
+			}
+			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
+		}
+	}
 	return s, nil
 }
 
 // InitParam initializes a parameter.
-func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
+func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, _ *int) error {
 	select {
 	case <-s.initialized:
 		return errors.New(AlreadyInitialized)
@@ -78,13 +179,13 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs)
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
 	return nil
 }
 
 // FinishInitParams tells the parameter server that the parameter
 // initialization has finished.
-func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
+func (s *Service) FinishInitParams(_ int, _ *int) error {
 	select {
 	case <-s.initialized:
 		return errors.New(AlreadyInitialized)
@@ -97,7 +198,7 @@ func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
 
 // SendGrad sends gradient to parameter servers for parameter
 // optimization.
-func (s *Service) SendGrad(g Gradient, dummy *int) error {
+func (s *Service) SendGrad(g Gradient, _ *int) error {
 	select {
 	case <-s.initialized:
 	default:
@@ -132,17 +233,89 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
+	// parameter content.
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
 	return nil
 }
 
-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
+// pserver save checkpoint
+func (s *Service) doCheckpoint() (err error) {
 	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
 
-	// TODO
-	return nil
+	cp := make([]parameterCheckpoint, len(s.optMap))
+	index := 0
+	for name, opt := range s.optMap {
+		var pc parameterCheckpoint
+		pc.Param.Name = name
+		pc.Param.ElementType = opt.elementType
+		pc.Param.Content = opt.GetWeights()
+		pc.State = opt.GetStates()
+		cp[index] = pc
+		index++
+	}
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err = encoder.Encode(cp)
+	if err != nil {
+		return
+	}
+
+	cpMeta := checkpointMeta{}
+	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
+	cpMeta.Timestamp = time.Now().UnixNano()
+	h := md5.New()
+	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
+
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
+
+	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
+	if err != nil {
+		return
+	}
+	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
+		log.Info("checkpoint does not exists.")
+	} else {
+		err = os.Remove(cpMeta.UUID)
+		if err != nil {
+			log.Infof("Removing checkpoint %s failed", cpMeta.UUID)
+		} else {
+			log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+		}
+	}
+	f, err := os.Create(cpMeta.UUID)
+	if err != nil {
+		return
+	}
+
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Errorln(closeErr)
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
+			}
+		}
+	}()
+
+	writer := bufio.NewWriter(f)
+	_, err = writer.Write(buf.Bytes())
+	if err != nil {
+		return
+	}
+
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+
+	return
 }
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index b6d20d2c8b7ba0ccd7ab46669a597a21dc11c381..988f3b5acb82a95aeb54af2b8b0e4d39a458291a 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver_test
 
 import (
@@ -15,7 +29,8 @@ const (
 )
 
 func TestServiceFull(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -30,7 +45,7 @@ func TestServiceFull(t *testing.T) {
 
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var p1 pserver.Parameter
@@ -39,40 +54,40 @@ func TestServiceFull(t *testing.T) {
 	p1.ElementType = pserver.Float32
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var param pserver.Parameter
 	err = s.GetParam("param_b", &param)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	if !reflect.DeepEqual(param, p1) {
-		t.FailNow()
+		t.Fatal("not equal:", param, p1)
 	}
 
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
 
 	err = s.SendGrad(g1, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.SendGrad(g2, nil)
 
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	var param1 pserver.Parameter
 	err = s.GetParam("param_a", &param1)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	// don't compare content, since it's already changed by
@@ -81,36 +96,39 @@ func TestServiceFull(t *testing.T) {
 	p.Content = nil
 
 	if !reflect.DeepEqual(param1, p) {
-		t.FailNow()
+		t.Fatal("not equal:", param1, p)
 	}
 }
 
 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
-		t.Error(err)
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	err = s.FinishInitParams(0, nil)
 	if err.Error() != pserver.AlreadyInitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }
 
 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -128,16 +146,6 @@ func TestBlockUntilInitialized(t *testing.T) {
 		ch <- struct{}{}
 	}()
 
-	wg.Add(1)
-	go func() {
-		err := s.Save("", nil)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
 	time.Sleep(50 * time.Millisecond)
 
 	select {
@@ -160,13 +168,17 @@ func TestBlockUntilInitialized(t *testing.T) {
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 
 	wg.Wait()
 }
+
+func TestCheckpointSpeed(t *testing.T) {
+	//TODO(zhihong): test speed
+}
diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9233264ff3c5eadaae2c432066281fb721e38773
--- /dev/null
+++ b/go/utils/networkhelper/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+if(WITH_TESTING)
+  go_test(network_helper_test)
+endif()
diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go
index fbeaea8f5e7d93309befbd23063e474a4c6df46e..c3fc747bdaf54c34d6d9841343d4b21f784e9a7b 100644
--- a/go/utils/networkhelper/helper.go
+++ b/go/utils/networkhelper/helper.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package networkhelper
 
 import (
diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go
index 4208f9e358fc4345b73a2b8a9211b8889c1190d8..0bc02ad42a9aad283957fd819b14f882359c25a7 100644
--- a/go/utils/networkhelper/helper_test.go
+++ b/go/utils/networkhelper/helper_test.go
@@ -1,3 +1,17 @@
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package networkhelper
 
 import "testing"
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 307e99bbe3a833f1fe26057ec38d0b96e04bc0fe..4b06966fba2bc9f92756be0cb8110bbcd5272423 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,13 +8,14 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
-add_subdirectory(optimizer)
 add_subdirectory(string)
 
 if(Boost_FOUND)
   add_subdirectory(memory)
   add_subdirectory(platform)
   add_subdirectory(framework)
+  add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()
 
 if(WITH_C_API)
diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp
index 2f45173bfd401ddda26d61ab7fcfe131d079f710..b6ff6ec7890c0b79d52a2f0784743289c7bc213f 100644
--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const {
 
 ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
 
-ParameterConfig::~ParameterConfig() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterConfig::~ParameterConfig() { delete m; }
 
 ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
     void* ptr) {
@@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
 
 OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
 
-OptimizationConfig::~OptimizationConfig() {
-  if (m) {
-    delete m;
-  }
-}
+OptimizationConfig::~OptimizationConfig() { delete m; }
 
 std::string OptimizationConfig::toProtoString() {
   return m->getConfig().SerializeAsString();
diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp
index 681e3a380912339c531c16c88f43255c2f34c32f..fcda6eaf031c02f2314298f88b3af2c08ba6fa11 100644
--- a/paddle/api/Evaluator.cpp
+++ b/paddle/api/Evaluator.cpp
@@ -37,7 +37,7 @@ std::vector<std::string> Evaluator::getNames() const {
 double Evaluator::getValue(const std::string name) const {
   paddle::Error err;
   double v = m->rawPtr->getValue(name, &err);
-  if (err) {
+  if (!err.isOK()) {
     throw std::runtime_error(err.msg());
   }
   return v;
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 5fb3d1c73bc56e921f13aafd27c25224e259b3fe..0b9b83d42974151d49250bdf0e7c397f59bf6a62 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -843,7 +843,8 @@ public:
                                                bool useSparseUpdater);
   static ParameterUpdater* createNewRemoteUpdater(
       OptimizationConfig* config,
-      const std::string pserverSpec) throw(UnsupportError);
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
   ~ParameterUpdater();
 
   /**
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index 21b851dd5e26c4752888067b20d0b1e16a4ab52d..120eea3f70125a57fb5ad685f2a11479bce12d0c 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate {
 
 ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
 
-ParameterOptimizer::~ParameterOptimizer() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterOptimizer::~ParameterOptimizer() { delete m; }
 
 ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
   CHECK(config != nullptr);
@@ -104,11 +100,7 @@ std::vector<int> ParameterOptimizer::getParameterTypes() const {
 ParameterTraverseCallback::ParameterTraverseCallback()
     : m(new ParameterTraverseCallbackPrivate()) {}
 
-ParameterTraverseCallback::~ParameterTraverseCallback() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
 
 void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
                                       const ParameterConfig& conf,
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 1aaefdfb8107a2eaa0432211fd7df4f5f12d537f..5934cb898b5f6adc74c237b1733a7459d8437a28 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 
 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
     OptimizationConfig *config,
-    const std::string pserverSpec) throw(UnsupportError) {
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
 #ifndef PADDLE_WITHOUT_GOLANG
   auto updater = new ParameterUpdater();
   updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec));
+      config->m->getConfig(), pserverSpec, useEtcd));
   return updater;
 #else
   throw UnsupportError();
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index db8f005929d90f718fc1ad42c60b68108ff55005..500bc448c92630f4fc2f4df603c955e572d868ec 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -171,11 +171,7 @@ struct VectorPrivate {
 
 Vector::Vector() : m(new VectorPrivate()) {}
 
-Vector::~Vector() {
-  if (m) {
-    delete m;
-  }
-}
+Vector::~Vector() { delete m; }
 
 Vector* Vector::createZero(size_t sz, bool useGpu) {
   auto retVec = new Vector();
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index f55197c8c9ebb4a0f67ab915abfefd6a45cd13aa..9f84db72da24b0e678520b077f9cba7ffc2d589a 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -17,73 +17,6 @@ limitations under the License. */
 
 #include "hl_base.h"
 
-/**
- * @brief   Shrink column to feature.
- *
- * @param[in]   dataCol     expand data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataIm      output image data.
- * @param[in]   alpha
- * @param[in]   beta
- */
-extern void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha = 1.0f,
-                                  real beta = 0.0f);
-
-/**
- * @brief   Expand feature to column.
- *
- * @param[in]   dataIm      input image data.
- * @param[in]   channels    number of channel.
- * @param[in]   height      image height.
- * @param[in]   width       image width.
- * @param[in]   blockH      filter height.
- * @param[in]   blockW      filter width.
- * @param[in]   strideH     stride height.
- * @param[in]   strideW     stride width.
- * @param[in]   paddingH    padding height.
- * @param[in]   paddingW    padding width.
- * @param[in]   outputH     output height.
- * @param[in]   outputW     output width.
- * @param[out]  dataCol     expand data.
- *
- */
-extern void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol);
-
 /**
  * @brief   Maximum pool forward.
  *
diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h
index 039551c6cc69525e71c8c311f78fb6dec07d7fed..2bbb9fa8dfd5eeac9d55aa67a28ebfbffa2acd46 100644
--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -17,36 +17,6 @@ limitations under the License. */
 
 #include "hl_cnn.h"
 
-inline void hl_shrink_col2feature(const real* dataCol,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataIm,
-                                  real alpha,
-                                  real beta) {}
-
-inline void hl_expand_feature2col(const real* dataIm,
-                                  size_t channels,
-                                  size_t height,
-                                  size_t width,
-                                  size_t blockH,
-                                  size_t blockW,
-                                  size_t strideH,
-                                  size_t strideW,
-                                  size_t paddingH,
-                                  size_t paddingW,
-                                  size_t outputH,
-                                  size_t outputW,
-                                  real* dataCol) {}
-
 inline void hl_maxpool_forward(const int frameCnt,
                                const real* inputData,
                                const int channels,
diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu
index b94f4d8fe4a251750c527d4b686fcc8f452d4606..b6e3e63a4f52261e49467bd82fdabd063e81460e 100644
--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -18,134 +18,6 @@ limitations under the License. */
 #include "hl_cnn.h"
 #include "hl_device_functions.cuh"
 
-__global__ void KeFeature2col(size_t n, size_t height, const real* data_im,
-                              size_t blockH, size_t blockW, size_t width,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_col) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    size_t w_out = index % width_col;
-    index /= width_col;
-    size_t h_out = index % height_col;
-    size_t channel_in = index / height_col;
-    size_t channel_out = channel_in * blockH * blockW;
-    size_t h_in = h_out * strideH;
-    size_t w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (size_t i = 0; i < blockH; ++i) {
-      for (size_t j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in+i);
-        int cIdx = int(w_in+j);
-        if ((rIdx-(int)paddingH) >= (int)height ||
-            (rIdx-(int)paddingH) < 0 ||
-            (cIdx-(int)paddingW) >= (int)width ||
-            (cIdx-(int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in*height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx* width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-void hl_expand_feature2col(const real* dataIm, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataCol) {
-  size_t numKernels = channels * outputH * outputW;
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-  KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, height, dataIm, blockH, blockW, width,
-           strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataCol);
-  CHECK_SYNC("hl_expand_feature2col failed");
-}
-
-__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height,
-                              size_t width, size_t channels,
-                              size_t blockH, size_t blockW,
-                              size_t strideH, size_t strideW,
-                              size_t paddingH, size_t paddingW,
-                              size_t height_col, size_t width_col,
-                              real* data_im, real alpha, real beta) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    real val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width-2 * paddingW) &&
-        (h - (int)paddingH) >= 0 &&
-        (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
-      int w_col_end =
-        min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH* blockW) + \
-            (h - h_col * (int)strideH) * (int)blockW +
-            (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-                          h*(width-2*paddingW) + w];
-      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-              h*(width-2*paddingW) + w] = alpha * val + beta*tD;
-    }
-  }
-}
-
-void hl_shrink_col2feature(const real * dataCol, size_t channels,
-                           size_t height, size_t width,
-                           size_t blockH, size_t blockW,
-                           size_t strideH, size_t strideW,
-                           size_t paddingH, size_t paddingW,
-                           size_t outputH, size_t outputW,
-                           real* dataIm, real alpha, real beta) {
-  size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW);
-
-  size_t blocks = (numKernels + 1024 -1) / 1024;
-  size_t blockX = 512;
-  size_t blockY = (blocks+512-1)/512;
-  dim3 threads(1024, 1);
-  dim3 grid(blockX, blockY);
-
-  // To avoid involving atomic operations, we will launch one kernel per
-  // bottom dimension, and then in the kernel add up the top dimensions.
-  KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>>
-           (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW,
-           channels, blockH, blockW, strideH, strideW, paddingH, paddingW,
-           outputH, outputW, dataIm, alpha, beta);
-  CHECK_SYNC("hl_shrink_col2feature failed");
-}
-
 __global__ void KeMaxPoolForward(const int nthreads, const real* inputData,
                                  const int channels, const int height,
                                  const int width,
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index c53a5636829cab9d575f58cc2326cb3efe383e1c..7ad8a39768a064140a08c912a5a467bc24a12adf 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -1022,6 +1022,15 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
   real alpha = 1.0f;
   real beta = 1.0f;
   cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
+
+  int batch_size = ((cudnn_tensor_descriptor)inputDesc)->batch_size;
+  if (batch_size > 1024 && g_cudnn_lib_version < 6000) {
+    LOG(INFO) << " To process current batch data with size " << batch_size
+              << " (>1024), cudnnBatchNorm requires cuDNN version >= 6000."
+              << " If there is an error complaining CUDNN_STATUS_NOT_SUPPORTED,"
+              << " just recompile PaddlePaddle with cuDNN >= 6000, replacing"
+              << " current version " << g_cudnn_lib_version;
+  }
   CHECK_CUDNN(
       dynload::cudnnBatchNormalizationForwardInference(t_resource.cudnn_handle,
                                                        mode,
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 0fe2877f89f8d0fbc4db40c400037be30bb87ff7..eeee921db54e20ea6a017d2b83f2d7ca9e5e037e 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -269,8 +269,7 @@ void hl_sequence2batch_copy_padding(real* batch,
   int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
   dim3 threads(blockDimX, blockDimY);
 
-  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
-      CUDA_BLOCK_SIZE;
+  int gridDimX = (maxSequenceLength + blockDimY - 1) / blockDimY;
   int gridDimY = numSequences;
   dim3 grid(gridDimX, gridDimY);
 
@@ -330,7 +329,7 @@ __global__ void KeSequenceAvgForward(real* dst,
     }
     sum = mode == 1 ? sum :
         (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
-    dst[gid] = sum;
+    dst[gid] += sum;
   }
 }
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 4409c6feae218222b7c0216760cebe4ae8e235cb..12a3a00bba35d476fca9c9fb47ac20b87e6f53f2 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -1,18 +1,38 @@
 # ddim lib
-cc_library(ddim SRCS ddim.cc)
+cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
+
+cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
+cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+
 cc_test(variable_test SRCS variable_test.cc)
-cc_test(scope_test SRCS scope_test.cc)
-cc_test(enforce_test SRCS enforce_test.cc)
+
+cc_library(scope SRCS scope.cc)
+cc_test(scope_test SRCS scope_test.cc DEPS scope)
+
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
-cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
+
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor scope)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+
+cc_library(grad_op_builder SRCS grad_op_builder.cc DEPS op_proto operator)
+cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_builder)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+cc_test(grad_op_builder_test SRCS grad_op_builder_test.cc DEPS grad_op_builder op_registry add_op)
+
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
+
+cc_library(net SRCS net.cc DEPS op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
+
+cc_library(backward SRCS backward.cc DEPS net)
+cc_test(backward_test SRCS backward_test.cc DEPS backward)
diff --git a/paddle/framework/attr_checker.h b/paddle/framework/attr_checker.h
index c0c33d81149ac2fc2a9a57d90931ef32375fe1d0..ea5614a45f3a77a851358aff80abbc276c9972ba 100644
--- a/paddle/framework/attr_checker.h
+++ b/paddle/framework/attr_checker.h
@@ -4,8 +4,9 @@
 #include <functional>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
-#include "paddle/framework/enforce.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
@@ -41,6 +42,35 @@ class DefaultValueSetter {
   T default_value_;
 };
 
+template <typename T>
+class EnumInContainer {
+ public:
+  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
+  void operator()(T& val) const {
+    PADDLE_ENFORCE(container_.find(val) != container_.end(),
+                   "Value %s is not in enum container %s", val,
+                   ContainerDebugString());
+  }
+
+ private:
+  std::string ContainerDebugString() const {
+    std::ostringstream sout;
+    sout << "[";
+    size_t cnt = 0;
+    for (auto& v : container_) {
+      sout << v;
+      ++cnt;
+      if (cnt != container_.size()) {
+        sout << " ,";
+      }
+    }
+    sout << "]";
+    return sout.str();
+  }
+
+  std::unordered_set<T> container_;
+};
+
 // check whether a certain attribute fit its limits
 // an attribute can have more than one limits
 template <typename T>
@@ -50,6 +80,11 @@ class TypedAttrChecker {
  public:
   TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
 
+  TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
+    value_checkers_.push_back(EnumInContainer<T>(range));
+    return *this;
+  }
+
   TypedAttrChecker& LargerThan(const T& lower_bound) {
     value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
     return *this;
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0da11b91a7fe4a98e0832f70095c3200956ff001
--- /dev/null
+++ b/paddle/framework/backward.cc
@@ -0,0 +1,178 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/backward.h"
+#include <list>
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+static bool AllInSet(const std::vector<std::string>& names,
+                     const std::string& suffix,
+                     const std::unordered_set<std::string>& set) {
+  for (auto& name : names) {
+    if (set.find(name + suffix) == set.end()) {
+      return false;
+    }
+  }
+  return true;
+}
+
+static std::shared_ptr<OperatorBase> NOP() {
+  auto net_op = std::make_shared<NetOp>();
+  net_op->type_ = "@NOP@";
+  net_op->CompleteAddOp();
+  return net_op;
+}
+
+//  Get backward operator from a forward operator, recursively implementation.
+//
+//  no_grad_names the gradient variable names without gradient calculating.
+//
+//  uniq_id is a unique index used inside recursively calling BackwardRecursive.
+//  use `uid = uniq_id++;` to get the unique index, and pass `uniq_id` through
+//  recursive calling.
+//
+//  returns The backward operator. For simple situation, it is a simple
+//  operator. For complex situation, it is a NetOp.
+//
+//  See Backward.h for details
+static std::shared_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id);
+std::shared_ptr<OperatorBase> BackwardRecursive(
+    const OperatorBase& forwardOp,
+    std::unordered_set<std::string>& no_grad_names, size_t& uniq_id) {
+  //  If all input gradients of forwarding operator do not need to calculate,
+  //  just return an NOP. Not return null ptr because NOP does not take
+  //  too much time for calculation, but it is useful for simplifying logic.
+  if (AllInSet(forwardOp.inputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+               no_grad_names)) {
+    return NOP();
+  }
+
+  //  All output gradients of forwarding operator do not need to calculate. Then
+  //  all input gradients cannot be computed at all, and we put them into
+  //  `no_grad_names` set. Return an NOP.
+  if (AllInSet(forwardOp.outputs_, OperatorBase::GRAD_VAR_SUFFIX(),
+               no_grad_names)) {
+    for (auto& name : forwardOp.inputs_) {
+      // Mark all input is not need
+      no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+    }
+    return NOP();
+  }
+
+  // Returned gradient network
+  auto net = std::make_shared<NetOp>();
+
+  if (forwardOp.IsNetOp()) {
+    // Because forwardOp is a net op, it can static_cast.
+    auto& forwardNet = static_cast<const NetOp&>(forwardOp);
+
+    // Map from output gradient variable name to operator's indices in backward
+    // net. That operator generates that variable.
+    std::unordered_map<std::string, std::vector<size_t>> dup_output_ops;
+
+    size_t local_op_id = 0;
+    // reversely travel forwardNet
+    for (auto it = forwardNet.ops_.rbegin(); it != forwardNet.ops_.rend();
+         ++it, ++local_op_id) {
+      auto fwd = *it;
+      auto bwd = BackwardRecursive(*fwd, no_grad_names, uniq_id);
+      net->AddOp(bwd);
+      for (auto& out : bwd->outputs_) {
+        dup_output_ops[out].emplace_back(local_op_id);
+      }
+    }
+    // Get unique ID for this method.
+    auto uid = uniq_id++;
+    // TODO(dzh): more comment
+    using Pos = std::pair<size_t, std::shared_ptr<OperatorBase>>;
+    std::list<Pos> insert_position;
+    for (auto& dup_output_op : dup_output_ops) {
+      const std::string& name = dup_output_op.first;
+      auto& dup_op = dup_output_op.second;
+      if (dup_op.size() == 1) continue;
+      std::vector<std::string> dup_outputs;
+
+      for (size_t i = 0; i < dup_op.size(); ++i) {
+        auto op_offset = dup_op[i];
+        dup_outputs.push_back(name + "@RENAME@" + std::to_string(uid) + "@" +
+                              std::to_string(i));
+        net->ops_[op_offset]->Rename(name, dup_outputs.back());
+      }
+      insert_position.push_back(
+          {dup_op.back(),
+           OpRegistry::CreateOp(
+               "add", {dup_outputs}, {name},
+               {{"input_format",
+                 std::vector<int>{0, static_cast<int>(dup_outputs.size())}}})});
+    }
+
+    insert_position.sort(
+        [](const Pos& l, const Pos& r) { return l.first > r.first; });
+
+    for (auto& pos : insert_position) {
+      net->InsertOp(pos.first + 1, pos.second);
+    }
+
+  } else {
+    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
+    for (std::string& grad_input : grad_op->inputs_) {
+      if (no_grad_names.count(grad_input)) {
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - OperatorBase::GRAD_VAR_SUFFIX().size());
+        grad_input = prefix + OperatorBase::ZERO_VAR_SUFFIX();
+
+        // If part of input gradient of that operator is not calculated, fill
+        // zero variables to that input gradient.
+        net->AddOp(OpRegistry::CreateOp("fill_zeros_like", {prefix},
+                                        {grad_input}, {}));
+      }
+    }
+
+    for (std::string& grad_output : grad_op->outputs_) {
+      if (no_grad_names.count(grad_output)) {
+        grad_output = OperatorBase::EMPTY_VAR_NAME();
+      }
+    }
+
+    if (net->ops_.empty()) {  // Current no aux op is added to network
+      return grad_op;
+    }
+    net->AddOp(grad_op);
+  }
+  net->type_ = "@GENERATED_BACKWARD@";
+  net->CompleteAddOp();
+  return net;
+}
+
+// See header for comments
+std::shared_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars) {
+  std::unordered_set<std::string> no_grad_names;
+  no_grad_names.reserve(no_grad_vars.size());
+
+  for (auto& name : no_grad_vars) {
+    no_grad_names.insert(name + OperatorBase::GRAD_VAR_SUFFIX());
+  }
+  size_t uid = 0;
+  return BackwardRecursive(forwardOp, no_grad_names, uid);
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/backward.h b/paddle/framework/backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..c181919dc165cf0b49362f85e22ceb4131bbd387
--- /dev/null
+++ b/paddle/framework/backward.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <unordered_set>
+#include "operator.h"
+namespace paddle {
+namespace framework {
+
+// Create the backward operator from a forward operator.
+// TODO(yuyang18): Add more API reference comment.
+extern std::shared_ptr<OperatorBase> Backward(
+    const OperatorBase& forwardOp,
+    const std::unordered_set<std::string>& no_grad_vars);
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/backward.md b/paddle/framework/backward.md
new file mode 100644
index 0000000000000000000000000000000000000000..74c001b06a9e7b2279abf998604f2acf1b1168e4
--- /dev/null
+++ b/paddle/framework/backward.md
@@ -0,0 +1,38 @@
+## Operator/expression 's Backward
+
+### Motivation
+
+In Neural Network, the backpropagation algorithm follows the chain rule, so we need to compound the fundmental gradient operators/expressions together with chain rule . Every forward network need a backward network to construct the full computation lineage, the operator/ expression's Backward feature will generate the backward pass respect to forward pass.
+
+### Implement : gradient operator registry
+
+|                        | forward operator | backward operator                |
+| ---------------------- | ---------------- | -------------------------------- |
+| **Operator::inputs_**  | Inputs           | Inputs, Outputs, OutputGradients |
+| **Operator::outputs_** | Outputs          | InputGradients                   |
+
+Inputs/Outputs means the input/output of the operator,  InputGradients/OutputGradients is the gradient respect to forward opeartor. Forward operator and Backward operator are isomorphic, save their corresponding needs into member attribute.
+
+We use a global hash map record the gradient operators available, follow the philosophy  of minimum core, make operator pluggable unit. Each gradient is an operator and it needs to regist itself. 
+
+grad_op_builder(fengjiayi)
+
+### Implement : Backward network
+
+given a forward network, it generates the backward network. We only care about the Gradients—`OutputGradients`,`InputGradients`.
+
+1. bla bla bla (yuyang)
+
+2. NetOp 
+
+   when the input forward network is a NetOp, it need to call the sub NetOp/Operators backward function recursively and ensure them done. During the process, we need to collect the `OutputGradients` name.
+
+   We share variable in the same scope, as a result, duplicate operator `OutputGradients` will overwirte then duplicate variable.  
+
+   ![./images/duplicate_op]()
+
+    Share variable between operators or same input variable used in multiple operators lead to a duplicate gradient variable. As demo show above, we need to rename gradient name recursively, and add a generic add operator instead. 
+
+![./images/duplicate_op2]()
+
+​	Then collect the sub graph OutputGradients/InputGradients as the NetOp's and return it.
diff --git a/paddle/framework/backward_test.cc b/paddle/framework/backward_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b095c2c3d5dbf21b5ea70e17475a4aaad9b1db44
--- /dev/null
+++ b/paddle/framework/backward_test.cc
@@ -0,0 +1,389 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/backward.h"
+
+#include <gtest/gtest.h>
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+class EmptyOp : public OperatorBase {
+ public:
+  void InferShape(const Scope &scope) const override {}
+  void Run(const Scope &scope,
+           const platform::DeviceContext &dev_ctx) const override {}
+};
+
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input X of Add").IgnoreGradient();
+    AddInput("b", "Bias of Add").IgnoreGradient();
+    AddOutput("Out", "Out of Add").IgnoreGradient();
+    AddComment("Add Op");
+  }
+};
+
+class MulOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("A", "A");
+    AddInput("B", "B");
+    AddOutput("Out", "Out");
+    AddComment("Mul");
+  }
+};
+
+class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X");
+    AddOutput("Y", "Y");
+    AddComment("Sigmoid");
+  }
+};
+
+class NoGradOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  NoGradOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "X input");
+    AddOutput("Y", "Y output");
+    AddComment("NoGradOp, same input output. no Grad");
+  }
+};
+
+class FcOp : public NetOp {
+ public:
+  void Init() override {
+    AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
+                               {Output("mul_result")}, {}));
+    auto b_name = Input("b");
+    std::string before_act = "mul_result";
+    if (b_name != EMPTY_VAR_NAME()) {
+      AddOp(OpRegistry::CreateOp("rowwise_add", {Output("mul_result"), b_name},
+                                 {Output("add_result")}, {}));
+      before_act = "add_result";
+    } else {
+      auto out_varname = Output("add_result");
+      if (out_varname != EMPTY_VAR_NAME()) {
+        this->Rename(out_varname, EMPTY_VAR_NAME());
+      }
+    }
+
+    AddOp(OpRegistry::CreateOp("sigmoid", {Output(before_act)}, {Output("Out")},
+                               {}));
+    CompleteAddOp(false);
+  }
+};
+
+class FcOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FcOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x");
+    AddInput("W", "w");
+    AddInput("b", "b");
+    AddOutput("mul_result", "").SetTemporary();
+    AddOutput("add_result", "").SetTemporary();
+    AddOutput("Out", "");
+    AddComment("");
+  }
+};
+
+class ManyOutputOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  ManyOutputOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("y", "y");
+    AddOutput("z", "z");
+    AddComment("");
+  }
+};
+
+class FillZeroOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  FillZeroOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "x");
+    AddOutput("out", "out");
+    AddComment("");
+  }
+};
+
+class AddOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "x").SetMultiple();
+    AddOutput("Y", "y");
+    AddComment("");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+namespace f = paddle::framework;
+using EnforceNotMet = paddle::platform::EnforceNotMet;
+REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
+REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
+REGISTER_OP(mul, f::EmptyOp, f::MulOpMaker);
+REGISTER_GRADIENT_OP(mul, mul_grad, f::EmptyOp);
+REGISTER_OP(sigmoid, f::EmptyOp, f::SigmoidOpMaker);
+REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, f::EmptyOp);
+REGISTER_OP(nograd, f::EmptyOp, f::NoGradOpMaker);
+REGISTER_OP(fill_zeros_like, f::EmptyOp, f::FillZeroOpMaker);
+REGISTER_OP(add, f::EmptyOp, f::AddOpMaker);
+REGISTER_GRADIENT_OP(add, add_grad, f::EmptyOp);
+REGISTER_OP(fc, f::FcOp, f::FcOpMaker);
+REGISTER_OP(many_output_op, f::EmptyOp, f::ManyOutputOpMaker);
+REGISTER_GRADIENT_OP(many_output_op, many_output_op_grad, f::EmptyOp);
+
+TEST(Backward, simple_op_grad) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::OpRegistry::CreateGradOp(*fwd);
+  ASSERT_EQ(1UL, gop->inputs_.size());
+  ASSERT_EQ("Out" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->inputs_[0]);
+  ASSERT_EQ("rowwise_add_grad", gop->type_);
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[0]);
+  ASSERT_EQ("b" + f::OperatorBase::GRAD_VAR_SUFFIX(), gop->outputs_[1]);
+
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            gop->Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, simple_op_not_need_grad) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  auto gop = f::Backward(*fwd, {"X"});
+  ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
+                      "X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            gop->outputs_.end());
+
+  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
+  ASSERT_NE(no_input_gop, nullptr);
+  ASSERT_TRUE(no_input_gop->IsNetOp());
+  ASSERT_EQ(0UL, std::static_pointer_cast<f::NetOp>(no_input_gop)->ops_.size());
+}
+
+TEST(Backward, net_fc_backward_normal) {
+  std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "w", "b"}, {"mul_result", "add_result", "out"}, {});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<f::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(3UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
+
+  f::OperatorBase &d_add = *net->ops_[1];
+  ASSERT_EQ("rowwise_add_grad", d_add.type_);
+
+  f::OperatorBase &d_mul = *net->ops_[2];
+  ASSERT_EQ("mul_grad", d_mul.type_);
+}
+
+TEST(Backward, net_fc_backward_not_have_b) {
+  std::shared_ptr<f::OperatorBase> fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "w", f::OperatorBase::EMPTY_VAR_NAME()},
+      {"mul_result", "add_result", "tmp"}, {});
+  ASSERT_NE(fwd, nullptr);
+  std::shared_ptr<f::OperatorBase> gop = f::Backward(*fwd, {});
+  ASSERT_TRUE(gop->IsNetOp());
+  auto net = static_cast<f::NetOp *>(gop.get());
+
+  ASSERT_NO_THROW(net->DebugString());
+
+  ASSERT_EQ(2UL, net->ops_.size());
+
+  f::OperatorBase &d_sigmoid = *net->ops_[0];
+  ASSERT_EQ("sigmoid_grad", d_sigmoid.type_);
+
+  f::OperatorBase &d_mul = *net->ops_[1];
+  ASSERT_EQ("mul_grad", d_mul.type_);
+}
+
+TEST(Backward, net_input_of_network_not_need_grad) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"X", "W1", "b1"},
+                                    {"mul_tmp_0", "add_tmp_0", "hidden0"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"hidden0", "W2", "b2"},
+                                    {"mul_tmp_1", "add_tmp_1", "hidden1"}, {}));
+  net.CompleteAddOp();
+  auto bwd = Backward(net, {"X"});  // X@GRAD is not need.
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+
+  std::unordered_set<std::string> all_output = std::unordered_set<std::string>(
+      bwd_net->outputs_.begin(), bwd_net->outputs_.end());
+  all_output.erase(f::OperatorBase::EMPTY_VAR_NAME());
+
+  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
+    ASSERT_NE(all_output.find(out + f::OperatorBase::GRAD_VAR_SUFFIX()),
+              all_output.end());
+  }
+
+  // Not Generated X
+  ASSERT_EQ(all_output.find("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            all_output.end());
+
+  ASSERT_EQ(2UL, bwd_net->ops_.size());
+  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
+  auto first_fc_grad = static_cast<f::NetOp *>(bwd_net->ops_[1].get());
+  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
+  ASSERT_EQ(
+      f::OperatorBase::EMPTY_VAR_NAME(),
+      first_fc_grad->ops_[2]->Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, net_shared_weight) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("mul", {"X", "W"}, {"Out"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("mul", {"Out", "W"}, {"FinalOut"}, {}));
+  net.CompleteAddOp();
+
+  auto bwd = f::Backward(net, {});
+  ASSERT_TRUE(bwd->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(bwd.get());
+  ASSERT_EQ(3UL, bwd_net->ops_.size());
+  ASSERT_EQ("add", bwd_net->ops_[2]->type_);
+}
+
+TEST(Backward, op_register_grad_not_for_network) {
+  auto fwd = f::OpRegistry::CreateOp(
+      "fc", {"X", "W", "b"}, {"mul_out", "add_out", "out1"},
+      {{"temporary_index", std::vector<int>{0, 1}}});
+
+  ASSERT_THROW(f::OpRegistry::CreateGradOp(*fwd), EnforceNotMet);
+}
+
+TEST(Backward, op_all_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  auto backward = f::Backward(*fwd, {"X", "b"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_all_output_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("rowwise_add", {"X", "b"}, {"Out"}, {});
+  auto backward = f::Backward(*fwd, {"Out"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_TRUE(net->ops_.empty());
+}
+
+TEST(Backward, op_part_of_output_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("many_output_op", {"X"}, {"Y", "Z"}, {});
+  auto backward = f::Backward(*fwd, {"Z"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_EQ(net->ops_.size(), 2UL);
+
+  auto &fill_zero = *net->ops_[0];
+  ASSERT_EQ("fill_zeros_like", fill_zero.type_);
+  ASSERT_EQ(1UL, fill_zero.inputs_.size());
+  ASSERT_EQ("Z", fill_zero.inputs_[0]);
+  ASSERT_EQ(1UL, fill_zero.outputs_.size());
+  ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(), fill_zero.outputs_[0]);
+
+  auto &d_many_out = *net->ops_[1];
+  ASSERT_EQ("many_output_op_grad", d_many_out.type_);
+  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size());  // I/O/OG
+  ASSERT_EQ("Z" + f::OperatorBase::ZERO_VAR_SUFFIX(),
+            d_many_out.Input("z" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+  ASSERT_EQ("Y" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            d_many_out.Input("y" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+  ASSERT_EQ("X" + f::OperatorBase::GRAD_VAR_SUFFIX(),
+            d_many_out.Output("x" + f::OperatorBase::GRAD_VAR_SUFFIX()));
+}
+
+TEST(Backward, op_part_of_input_are_not_need) {
+  auto fwd = f::OpRegistry::CreateOp("mul", {"a", "b"}, {"out"}, {});
+  auto backward = f::Backward(*fwd, {"a"});
+  auto &grad_mul = *backward;
+  ASSERT_EQ(grad_mul.type_, "mul_grad");
+  ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
+  ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
+  ASSERT_EQ(grad_mul.Output("A" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            f::OperatorBase::EMPTY_VAR_NAME());
+  ASSERT_EQ(grad_mul.Output("B" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "b" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  ASSERT_EQ(grad_mul.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+            "out" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  ASSERT_EQ(grad_mul.Input("A"), "a");
+  ASSERT_EQ(grad_mul.Input("B"), "b");
+  ASSERT_EQ(grad_mul.Input("Out"), "out");
+}
+
+TEST(Backward, linear_net_intermediate_variable_has_no_grad) {
+  f::NetOp net;
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"x1", "w1", "b1"},
+                                    {"mul_out1", "add_out1", "out1"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"out1", "w2", "b2"},
+                                    {"mul_out2", "tmp_out2", "out2"}, {}));
+  net.AddOp(f::OpRegistry::CreateOp("fc", {"out2", "w3", "b3"},
+                                    {"mul_out3", "tmp_out3", "out3"}, {}));
+  net.CompleteAddOp();
+  auto backward = f::Backward(net, {"mul_out2", "tmp_out2", "out2"});
+  ASSERT_TRUE(backward->IsNetOp());
+  auto bwd_net = static_cast<f::NetOp *>(backward.get());
+  ASSERT_EQ(bwd_net->ops_.size(), 3UL);
+  auto &grad_fc = *bwd_net->ops_[0];
+  EXPECT_EQ(grad_fc.inputs_.size(),
+            3UL       /* external input number */
+                + 1UL /* external output number*/
+                + 1UL /* number of gradient of external output*/
+                - 1UL /*ignoreGradient varable number*/
+                + 2U /* internal variable number*/);
+  EXPECT_EQ(grad_fc.outputs_.size(), 2UL       /* input number of mul*/
+                                         + 2UL /* input number of rowwise_add */
+                                         + 1UL /* input number of sigmod */);
+  EXPECT_EQ(bwd_net->ops_[1]->inputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[1]->outputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->inputs_.size(), 0UL);
+  EXPECT_EQ(bwd_net->ops_[2]->outputs_.size(), 0UL);
+
+  /*
+    EXPECT_EQ(grad_fc.Output("X" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+              f::OperatorBase::EMPTY_VAR_NAME());
+  EXPECT_EQ(grad_fc.Output("W" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+    "w3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Output("b" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+    "b3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Output("mul_result" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+  "mul_out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+
+  EXPECT_EQ(grad_fc.Input("Out" + f::OperatorBase::GRAD_VAR_SUFFIX()),
+  "out3" + f::OperatorBase::GRAD_VAR_SUFFIX());
+  EXPECT_EQ(grad_fc.Input("X"), "out2");
+  EXPECT_EQ(grad_fc.Input("W"), "w3");
+  EXPECT_EQ(grad_fc.Input("mul_result"), "mul_out3");
+  EXPECT_EQ(grad_fc.Input("add_result"), "tmp_out3");
+  EXPECT_EQ(grad_fc.Input("Out"), "out3");
+  */
+}
diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc
index 3f949a6595ea326b97ac567daf9b35a68c8cf7f8..545c1dcc2a1682839d90194002fdbb748d85e808 100644
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -1,9 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include "paddle/framework/ddim.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
 
-///@cond HIDDEN
+/// @cond HIDDEN
 
 template <int i>
 Dim<i> make_dim(const int* d) {
@@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
   }
 }
 
-///@endcond
+/// @endcond
 
 DDim make_ddim(std::initializer_list<int> dims) {
   DDim result(make_dim(0));
@@ -64,11 +79,11 @@ DDim make_ddim(const std::vector<int>& dims) {
   return result;
 }
 
-///@cond HIDDEN
+/// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
  public:
-  DynamicMutableIndexer(int idx) : idx_(idx) {}
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
 
   template <int D>
   int& operator()(Dim<D>& dim) const {
@@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
 
 class DynamicConstIndexer : public boost::static_visitor<int> {
  public:
-  DynamicConstIndexer(int idx) : idx_(idx) {}
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
 
   template <int D>
   int operator()(const Dim<D>& dim) const {
@@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
   int idx_;
 };
 
-///@endcond
+/// @endcond
 
 int& DDim::operator[](int idx) {
   return boost::apply_visitor(DynamicMutableIndexer(idx), var);
@@ -102,6 +117,8 @@ int DDim::operator[](int idx) const {
   return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
 
+ssize_t DDim::size() const { return arity(*this); }
+
 bool DDim::operator==(DDim d) const {
   if (var.which() != d.getVar().which()) {
     return false;
@@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; }
 
 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
 
-///@cond HIDDEN
+/// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
   std::vector<int>& vector;
 
-  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
 
   template <typename T>
   void operator()(const T& t) {
@@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
 
   void operator()(const Dim<1>& t) { vector.push_back(t.head); }
 };
-///@endcond
+/// @endcond
 
 std::vector<int> vectorize(const DDim& ddim) {
   std::vector<int> result;
@@ -178,16 +195,59 @@ std::vector<int> vectorize(const DDim& ddim) {
   return result;
 }
 
+struct ProductVisitor : public boost::static_visitor<ssize_t> {
+  template <int D>
+  ssize_t operator()(const Dim<D>& dim) {
+    return product(dim);
+  }
+};
+
 ssize_t product(const DDim& ddim) {
-  ssize_t result = 1;
-  std::vector<int> v = vectorize(ddim);
-  for (auto i : v) {
-    result *= i;
+  ProductVisitor visitor;
+  return boost::apply_visitor(visitor, ddim);
+}
+
+struct SliceVectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int>& vector;
+  int begin;
+  int end;
+
+  SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
+      : vector(v), begin(b), end(e) {
+    PADDLE_ENFORCE(begin < end,
+                   "Begin index must be less than end index in ddim slice.");
+    PADDLE_ENFORCE(begin >= 0,
+                   "Begin index can't be less than zero in ddim slice.");
   }
-  return result;
+
+  template <int S>
+  void operator()(const Dim<S>& dim) {
+    if (begin == 0) {
+      vector.push_back(dim.head);
+    } else {
+      --begin;
+    }
+    --end;
+    if (end > 0) {
+      this->operator()(dim.tail);
+    }
+  }
+
+  void operator()(const Dim<1>& dim) {
+    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
+    vector.push_back(dim.head);
+  }
+};
+
+DDim slice_ddim(const DDim& dim, int begin, int end) {
+  std::vector<int> vec;
+  vec.reserve(end - begin);
+  SliceVectorizeVisitor visitor(vec, begin, end);
+  boost::apply_visitor(visitor, dim);
+  return make_ddim(vec);
 }
 
-///\cond HIDDEN
+/// \cond HIDDEN
 
 struct ArityVisitor : boost::static_visitor<int> {
   template <int D>
@@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor<int> {
   }
 };
 
-///\endcond
+/// \endcond
 
 int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
 
-///\cond HIDDEN
+/// \cond HIDDEN
 
 struct DDimPrinter : boost::static_visitor<void> {
   std::ostream& os;
-  DDimPrinter(std::ostream& os_) : os(os_) {}
+  explicit DDimPrinter(std::ostream& os_) : os(os_) {}
 
   template <typename T>
   void operator()(const T& t) {
@@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor<void> {
   }
 };
 
-///\endcond
+/// \endcond
 
 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
   DDimPrinter printer(os);
@@ -220,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
   return os;
 }
 
+DDim::DDim(std::initializer_list<int> init_list) {
+  *this = make_ddim(init_list);
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 223c4180bee45e21547364441476b27051daca56..9fcc657edcd5459d0a42a64d708603a4bcd53cf0 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -1,11 +1,26 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
 
 #include <boost/variant.hpp>
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
-
 #include "paddle/framework/dim.h"
+#include "paddle/platform/enforce.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
 namespace framework {
@@ -27,7 +42,9 @@ struct DDim {
   DDim() : var(Dim<1>()) {}
 
   template <int D>
-  DDim(const Dim<D>& in) : var(in) {}
+  explicit DDim(const Dim<D>& in) : var(in) {}
+
+  /*implicit*/ DDim(std::initializer_list<int> init_list);
 
   template <int D>
   DDim& operator=(const Dim<D>& in) {
@@ -57,6 +74,8 @@ struct DDim {
   DDim operator+(DDim d) const;
 
   DDim operator*(DDim d) const;
+
+  ssize_t size() const;
 };
 
 /**
@@ -81,6 +100,15 @@ std::vector<int> vectorize(const DDim& ddim);
 
 ssize_t product(const DDim& ddim);
 
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim& dim, int begin, int end);
+
 /**
  * \brief What is the length of this dimension?
  *
diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc
index 36eef02370e0196c2af2c05f49176b70ce69235a..9d18a2972ce62139430b240b4599854b14290a32 100644
--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,9 +49,30 @@ TEST(DDim, Equality) {
 
   // arity of a DDim
   EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
 
   // product of a DDim
   EXPECT_EQ(paddle::framework::product(vddim), 45);
+  EXPECT_EQ(
+      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
+      90);
+
+  // slice a DDim
+  paddle::framework::DDim ddim2 =
+      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
+  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
+  EXPECT_EQ(arity(ss), 3);
+  EXPECT_EQ(ss[0], 3);
+  EXPECT_EQ(ss[1], 4);
+  EXPECT_EQ(ss[2], 5);
+  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
+  EXPECT_EQ(arity(ss2), 6);
+  EXPECT_EQ(ss2[0], 1);
+  EXPECT_EQ(ss2[1], 2);
+  EXPECT_EQ(ss2[2], 3);
+  EXPECT_EQ(ss2[3], 4);
+  EXPECT_EQ(ss2[4], 5);
+  EXPECT_EQ(ss2[5], 6);
 }
 
 TEST(DDim, Print) {
diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h
index bcde291d12d429a3f2cd41fa6d0ee606c7c9c92f..883fdc55eb929ebc51e8ae05938e9d07374406ce 100644
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@@ -266,29 +266,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
   return ((0 <= idx.head) && (idx.head < size.head));
 }
 
-/**
- * \brief Check if a size and a stride create a Fortran order contiguous
- * block of memory.
- */
-template <int i>
-HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) {
-  if (product(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return (get<0>(stride) == contiguous_stride &&
-          contiguous(size.tail, stride.tail, mul * get<0>(size)));
-}
-
-///\cond HIDDEN
-// Base case of contiguous, check the nth stride is the size of
-// the prefix multiply of n-1 dims.
-template <>
-inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
-  if (get<0>(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return get<0>(stride) == contiguous_stride;
-}
-///\endcond
-
 /**
  * \brief Compute exclusive prefix-multiply of a Dim.
  */
@@ -306,31 +283,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
 }
 ///\endcond
 
-/**
- * \brief Calculate strides of a contiguous array of the given size
- *
- * Sets the stride for any dimension with an extent of 1 to 0.
- * \param size Dim object containing the size of the array.
- * \param base The base stride to use.
- * \return Dim object the same size as \p size with the strides.
- */
-template <int i>
-HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<i>(stride, contiguous_strides(size.tail, base * size.head));
-}
-
-///\cond HIDDEN
-
-// Base case of contiguous_strides
-template <>
-HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<1>(stride);
-}
-
-///\endcond
-
 /**
  * Add two dimensions together
  */
diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu
index 809bf04826637195425a32c054c94e00ef940df9..3898d0a447aa502813b3cb5e86c29eebb814ff84 100644
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
@@ -1,128 +1,101 @@
 #include <thrust/device_vector.h>
 #include <sstream>
 
-#include "paddle/framework/dim.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/dim.h"
 
 __global__ void test(paddle::framework::Dim<2>* o) {
-    o[0] = paddle::framework::make_dim(5, 6);
+  o[0] = paddle::framework::make_dim(5, 6);
 }
 
 __global__ void dyn_idx_gpu(int* o) {
-    auto d = paddle::framework::make_dim(5, 6);
-    o[0] = d[1];
+  auto d = paddle::framework::make_dim(5, 6);
+  o[0] = d[1];
 }
 
 TEST(Dim, Equality) {
-    // construct a Dim on the CPU
-    auto a = paddle::framework::make_dim(3, 4);
-    EXPECT_EQ(paddle::framework::get<0>(a), 3);
-    EXPECT_EQ(paddle::framework::get<1>(a), 4);
-
-    // construct a Dim on the GPU
-    thrust::device_vector<paddle::framework::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
-    a = t[0];
-    EXPECT_EQ(paddle::framework::get<0>(a), 5);
-    EXPECT_EQ(paddle::framework::get<1>(a), 6);
-
-    // linearization
-    auto b = paddle::framework::make_dim(7, 8);
-    EXPECT_EQ(paddle::framework::linearize(a, b), 83);
-
-    // product
-    EXPECT_EQ(paddle::framework::product(a), 30);
-
-    // mutate a Dim
-    paddle::framework::get<1>(b) = 10;
-    EXPECT_EQ(paddle::framework::get<0>(b), 7);
-    EXPECT_EQ(paddle::framework::get<1>(b), 10);
-
-    // dynamic access
-    paddle::framework::get(b, 0) = 8;
-    b[1] = 11;
-    EXPECT_EQ(paddle::framework::get<0>(b), 8);
-    EXPECT_EQ(paddle::framework::get<1>(b), 11);
-    EXPECT_EQ(paddle::framework::get(b, 0), 8);
-    EXPECT_EQ(b[1], 11);
-
-    // dynamic access on GPU
-    thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
-    int res = r[0];
-    EXPECT_EQ(res, 6);
-
-    // ex_prefix_mul
-    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 3);
-    EXPECT_EQ(paddle::framework::get<2>(c), 12);
-
-    // contiguous_strides
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 0);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 10);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 0);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 2);
-    EXPECT_EQ(paddle::framework::get<2>(c), 6);
-
-    // generate from an index
-    auto size = paddle::framework::make_dim(4, 5, 2);
-    c = paddle::framework::Dim<3>(14, size);
-    EXPECT_EQ(paddle::framework::get<0>(c), 2);
-    EXPECT_EQ(paddle::framework::get<1>(c), 3);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    c = paddle::framework::Dim<3>(25, size);
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 1);
+  // construct a Dim on the CPU
+  auto a = paddle::framework::make_dim(3, 4);
+  EXPECT_EQ(paddle::framework::get<0>(a), 3);
+  EXPECT_EQ(paddle::framework::get<1>(a), 4);
+
+  // construct a Dim on the GPU
+  thrust::device_vector<paddle::framework::Dim<2>> t(2);
+  test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
+  a = t[0];
+  EXPECT_EQ(paddle::framework::get<0>(a), 5);
+  EXPECT_EQ(paddle::framework::get<1>(a), 6);
+
+  // linearization
+  auto b = paddle::framework::make_dim(7, 8);
+  EXPECT_EQ(paddle::framework::linearize(a, b), 83);
+
+  // product
+  EXPECT_EQ(paddle::framework::product(a), 30);
+
+  // mutate a Dim
+  paddle::framework::get<1>(b) = 10;
+  EXPECT_EQ(paddle::framework::get<0>(b), 7);
+  EXPECT_EQ(paddle::framework::get<1>(b), 10);
+
+  // dynamic access
+  paddle::framework::get(b, 0) = 8;
+  b[1] = 11;
+  EXPECT_EQ(paddle::framework::get<0>(b), 8);
+  EXPECT_EQ(paddle::framework::get<1>(b), 11);
+  EXPECT_EQ(paddle::framework::get(b, 0), 8);
+  EXPECT_EQ(b[1], 11);
+
+  // dynamic access on GPU
+  thrust::device_vector<int> r(1);
+  dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
+  int res = r[0];
+  EXPECT_EQ(res, 6);
+
+  // ex_prefix_mul
+  paddle::framework::Dim<3> c =
+      paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 12);
+
+  // generate from an index
+  auto size = paddle::framework::make_dim(4, 5, 2);
+  c = paddle::framework::Dim<3>(14, size);
+  EXPECT_EQ(paddle::framework::get<0>(c), 2);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 0);
+  c = paddle::framework::Dim<3>(25, size);
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 1);
+  EXPECT_EQ(paddle::framework::get<2>(c), 1);
 }
 
 TEST(Dim, Bool) {
-    auto a = paddle::framework::make_dim(3, 4);
-    auto b = paddle::framework::make_dim(5, 6);
-    auto c = paddle::framework::make_dim(3, 4);
-
-    // in_bounds check
-    EXPECT_TRUE(paddle::framework::contained(a, b));
-    EXPECT_FALSE(paddle::framework::contained(b, a));
-
-    // comparison
-    EXPECT_TRUE(a == a);
-    EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a == c);
-
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    paddle::framework::Dim<3> sizef(x, y, z);
-    paddle::framework::Dim<3> stridea(1, x, x*y);
-    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
-    paddle::framework::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
+  auto a = paddle::framework::make_dim(3, 4);
+  auto b = paddle::framework::make_dim(5, 6);
+  auto c = paddle::framework::make_dim(3, 4);
+
+  // in_bounds check
+  EXPECT_TRUE(paddle::framework::contained(a, b));
+  EXPECT_FALSE(paddle::framework::contained(b, a));
+
+  // comparison
+  EXPECT_TRUE(a == a);
+  EXPECT_FALSE(a == b);
+  EXPECT_TRUE(a == c);
 }
 
 TEST(Dim, Print) {
-    {
-        std::stringstream ss;
-        auto a = paddle::framework::make_dim(2, 3);
-        ss << a;
-        EXPECT_EQ(ss.str(), "2, 3");
-    }
-    {
-        std::stringstream ss;
-        ss << paddle::framework::make_dim(8);
-        EXPECT_EQ(ss.str(), "8");
-    }
+  {
+    std::stringstream ss;
+    auto a = paddle::framework::make_dim(2, 3);
+    ss << a;
+    EXPECT_EQ(ss.str(), "2, 3");
+  }
+  {
+    std::stringstream ss;
+    ss << paddle::framework::make_dim(8);
+    EXPECT_EQ(ss.str(), "8");
+  }
 }
diff --git a/paddle/framework/eigen.h b/paddle/framework/eigen.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4667cc51fadfc020d3211b7a82356db386fced1
--- /dev/null
+++ b/paddle/framework/eigen.h
@@ -0,0 +1,100 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace framework {
+
+// EigenDim converts paddle::platform::DDim into Eigen::DSizes.
+template <int D>
+struct EigenDim {
+  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
+
+  static Type From(const DDim& dims) {
+    PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
+    Type ret;
+    for (int d = 0; d < arity(dims); d++) {
+      ret[d] = dims[d];
+    }
+    return ret;
+  }
+};
+
+// Interpret paddle::platform::Tensor as EigenTensor and EigenConstTensor.
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenTensor {
+  // TODO(qijun) Now, default type in unaligned, and we will make a benchmark on
+  // the speed of aligned and unaligned version in future.
+  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
+
+  using ConstType =
+      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
+
+  static Type From(Tensor& tensor, DDim dims) {
+    return Type(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static Type From(Tensor& tensor) { return From(tensor, tensor.dims_); }
+
+  static ConstType From(const Tensor& tensor, DDim dims) {
+    return ConstType(tensor.data<T>(), EigenDim<D>::From(dims));
+  }
+
+  static ConstType From(const Tensor& tensor) {
+    return From(tensor, tensor.dims_);
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(Tensor& tensor) {
+    return EigenVector::From(
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
+  }
+
+  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
+    return EigenVector::From(
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
+  }
+};
+
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenScalar {
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  using Type = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, MajorType, IndexType>>;
+  using ConstType = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, MajorType, IndexType>>;
+
+  static Type From(Tensor& tensor) { return Type(tensor.data<T>()); }
+
+  static ConstType From(const Tensor& tensor) {
+    return ConstType(tensor.data<T>());
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/eigen_test.cc b/paddle/framework/eigen_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dc1957691b1a202826e10e84c21ac8874df9e378
--- /dev/null
+++ b/paddle/framework/eigen_test.cc
@@ -0,0 +1,112 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/eigen.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+
+TEST(EigenDim, From) {
+  EigenDim<3>::Type ed = EigenDim<3>::From(make_ddim({1, 2, 3}));
+  ASSERT_EQ(1, ed[0]);
+  ASSERT_EQ(2, ed[1]);
+  ASSERT_EQ(3, ed[2]);
+}
+
+TEST(Eigen, Tensor) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+  for (int i = 0; i < 1 * 2 * 3; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
+
+  ASSERT_EQ(1, et.dimension(0));
+  ASSERT_EQ(2, et.dimension(1));
+  ASSERT_EQ(3, et.dimension(2));
+
+  for (int i = 0; i < 1; i++) {
+    for (int j = 0; j < 2; j++) {
+      for (int k = 0; k < 3; k++) {
+        ASSERT_NEAR((i * 2 + j) * 3 + k, et(i, j, k), 1e-6f);
+      }
+    }
+  }
+}
+
+TEST(Eigen, ScalarFrom) {
+  Tensor t;
+  int* p = t.mutable_data<int>(make_ddim({1}), platform::CPUPlace());
+  *p = static_cast<int>(100);
+
+  EigenScalar<int>::Type es = EigenScalar<int>::From(t);
+
+  ASSERT_EQ(0, es.dimension(0));
+  ASSERT_EQ(100, es(0));
+}
+
+TEST(Eigen, VectorFrom) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({6}), platform::CPUPlace());
+  for (int i = 0; i < 6; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenVector<float>::Type ev = EigenVector<float>::From(t);
+
+  ASSERT_EQ(6, ev.dimension(0));
+
+  for (int i = 0; i < 6; i++) {
+    ASSERT_NEAR(i, ev(i), 1e-6f);
+  }
+}
+
+TEST(Eigen, VectorFlatten) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+  for (int i = 0; i < 1 * 2 * 3; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenVector<float>::Type ev = EigenVector<float>::Flatten(t);
+
+  ASSERT_EQ(1 * 2 * 3, ev.dimension(0));
+
+  for (int i = 0; i < 1 * 2 * 3; i++) {
+    ASSERT_NEAR(i, ev(i), 1e-6f);
+  }
+}
+
+TEST(Eigen, Matrix) {
+  Tensor t;
+  float* p = t.mutable_data<float>(make_ddim({2, 3}), platform::CPUPlace());
+  for (int i = 0; i < 2 * 3; i++) {
+    p[i] = static_cast<float>(i);
+  }
+
+  EigenMatrix<float>::Type em = EigenMatrix<float>::From(t);
+
+  ASSERT_EQ(2, em.dimension(0));
+  ASSERT_EQ(3, em.dimension(1));
+
+  for (int i = 0; i < 2; i++) {
+    for (int j = 0; j < 3; j++) {
+      ASSERT_NEAR(i * 3 + j, em(i, j), 1e-6f);
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/enforce.h b/paddle/framework/enforce.h
deleted file mode 100644
index 56cb7f95647e81efef58b156002d0d378ee22820..0000000000000000000000000000000000000000
--- a/paddle/framework/enforce.h
+++ /dev/null
@@ -1,69 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <paddle/string/printf.h>
-#include <exception>
-#include <sstream>
-
-namespace paddle {
-namespace framework {
-
-/**
- * @brief Enforce exception. Inherits std::exception
- *
- * All enforce condition not met, will throw an EnforceNotMet exception.
- */
-class EnforceNotMet : public std::exception {
- public:
-  EnforceNotMet(const std::string& msg, const char* file, int fileline) {
-    std::ostringstream sout;
-    sout << msg << " at [" << file << ":" << fileline << "];";
-    all_msg_ = sout.str();
-  }
-
-  const char* what() const noexcept override { return all_msg_.c_str(); }
-
- private:
-  std::string all_msg_;
-};
-
-// From https://stackoverflow.com/questions/30130930/
-// __buildin_expect is in C++ 11 standard. Since the condition which enforced
-// should be true in most situation, it will make the compiler generate faster
-// code by adding `UNLIKELY` macro.
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-
-/**
- * @brief Throw a EnforceNotMet exception, automatically filled __FILE__ &
- * __LINE__
- *
- * This macro take __VA_ARGS__, user can pass any type if that type can
- * serialize to std::ostream
- */
-#define PADDLE_THROW(...)                                            \
-  do {                                                               \
-    throw ::paddle::framework::EnforceNotMet(                        \
-        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
-  } while (0)
-
-/**
- * @brief Enforce a condition, otherwise throw an EnforceNotMet
- */
-#define PADDLE_ENFORCE(condition, ...) \
-  do {                                 \
-    if (UNLIKELY(!(condition))) {      \
-      PADDLE_THROW(__VA_ARGS__);       \
-    }                                  \
-  } while (0)
-
-}  // namespace framework
-}  // namespace paddle
diff --git a/paddle/framework/grad_op_builder.cc b/paddle/framework/grad_op_builder.cc
new file mode 100644
index 0000000000000000000000000000000000000000..dd686cc78246f06cdc3ec7d013086863d7e8fac0
--- /dev/null
+++ b/paddle/framework/grad_op_builder.cc
@@ -0,0 +1,115 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/grad_op_builder.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+OperatorBase* GradOpBuilder::Build() {
+  BuildOpInOutArgList();
+  std::string grad_op_type = OpRegistry::grad_ops().at(op_.type_);
+  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
+  grad_op->type_ = grad_op_type;
+  CompleteGradOp(grad_op);
+  return grad_op;
+}
+
+OpInOutArg* GradOpBuilder::BuildArg(const VarProto& var,
+                                    const VarIndexMap& var_map,
+                                    const std::vector<int>& format,
+                                    InOutType type) {
+  int idx = var_map.at(var.name());
+  int begin_idx = format.empty() ? idx : format.at(idx);
+  int end_idx = format.empty() ? idx + 1 : format.at(idx + 1);
+  return new OpInOutArg(var.name(), type, !var.ignore_gradient(), begin_idx,
+                        end_idx);
+}
+
+void GradOpBuilder::BuildOpInOutArgList() {
+  const OpProto& op_proto = OpRegistry::protos().at(op_.type_);
+  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_.type_));
+  const std::vector<int>& in_format =
+      op_.attrs_.count("input_format")
+          ? op_.GetAttr<std::vector<int>>("input_format")
+          : std::vector<int>();
+  const std::vector<int>& out_format =
+      op_.attrs_.count("output_format")
+          ? op_.GetAttr<std::vector<int>>("output_format")
+          : std::vector<int>();
+  for (const auto& var : op_proto.inputs()) {
+    arg_list_.emplace_back(
+        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, in_format, IN)));
+  }
+  for (const auto& var : op_proto.outputs()) {
+    arg_list_.emplace_back(
+        std::shared_ptr<OpInOutArg>(BuildArg(var, var_map, out_format, OUT)));
+  }
+}
+
+void GradOpBuilder::AddArgIntoGradOp(const OpInOutArg* arg,
+                                     std::vector<std::string>& in_out,
+                                     std::vector<int>& format,
+                                     VarIndexMap* varmap, int& idx,
+                                     bool is_grad) const {
+  std::string var_name = arg->proto_name_;
+  if (is_grad) {
+    var_name += OperatorBase::GRAD_VAR_SUFFIX();
+  }
+  (*varmap)[var_name] = idx++;
+  size_t pre_sz = in_out.size();
+  auto base_it = arg->type_ == IN ? op_.inputs_.begin() : op_.outputs_.begin();
+  std::copy(base_it + arg->begin_idx_, base_it + arg->end_idx_,
+            std::back_inserter(in_out));
+  if (is_grad) {
+    for (size_t i = pre_sz; i < in_out.size(); ++i) {
+      in_out[i] += OperatorBase::GRAD_VAR_SUFFIX();
+    }
+  }
+  format.push_back(in_out.size());
+}
+
+void GradOpBuilder::CompleteGradOp(OperatorBase* grad_op) const {
+  grad_op->attrs_ = op_.attrs_;
+  grad_op->attrs_.erase("input_format");
+  grad_op->attrs_.erase("output_format");
+  VarIndexMap* grad_varmap = new VarIndexMap();
+  int in_idx = 0;
+  int out_idx = 0;
+  std::vector<int> in_format({0});
+  std::vector<int> out_format({0});
+  for (const auto& arg : arg_list_) {
+    // op_'s inputs_ and outputs_
+    if (arg->needed_in_grad_) {
+      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
+                       in_idx, false);
+    }
+    if (arg->type_ == IN) {
+      // gradients of op_'s inputs_
+      AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format, grad_varmap,
+                       out_idx, true);
+    } else {
+      // gradients of op_'s outputs_
+      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format, grad_varmap,
+                       in_idx, true);
+    }
+  }
+  grad_op->attrs_["input_format"] = in_format;
+  grad_op->attrs_["output_format"] = out_format;
+  grad_op->in_out_idxs_.reset(grad_varmap);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/grad_op_builder.h b/paddle/framework/grad_op_builder.h
new file mode 100644
index 0000000000000000000000000000000000000000..cc7a76f3726e00a08fbe06bca4c9b9f5bad466b4
--- /dev/null
+++ b/paddle/framework/grad_op_builder.h
@@ -0,0 +1,48 @@
+#pragma once
+
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+class OpRegistry;
+
+enum InOutType { IN, OUT };
+
+struct OpInOutArg {
+  OpInOutArg(const std::string& proto_name, const InOutType& type,
+             bool needed_in_grad, size_t begin_idx, size_t end_idx)
+      : proto_name_(proto_name),
+        type_(type),
+        needed_in_grad_(needed_in_grad),
+        begin_idx_(begin_idx),
+        end_idx_(end_idx) {}
+
+  std::string proto_name_;
+  InOutType type_;
+  bool needed_in_grad_;
+  size_t begin_idx_;
+  size_t end_idx_;
+};
+
+class GradOpBuilder {
+  using VarIndexMap = std::unordered_map<std::string, int>;
+
+ public:
+  GradOpBuilder(const OperatorBase& op) : op_(op) {}
+  OperatorBase* Build();
+
+ private:
+  OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map,
+                       const std::vector<int>& format, InOutType type);
+  void BuildOpInOutArgList();
+  void AddArgIntoGradOp(const OpInOutArg* arg, std::vector<std::string>& in_out,
+                        std::vector<int>& format, VarIndexMap* varmap, int& idx,
+                        bool is_grad) const;
+  void CompleteGradOp(OperatorBase* grad_op) const;
+  const OperatorBase& op_;
+  std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/grad_op_builder_test.cc b/paddle/framework/grad_op_builder_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e9cf3b9798db2cbfb8d26259ae9a6741fbae8278
--- /dev/null
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -0,0 +1,26 @@
+#include "paddle/framework/grad_op_builder.h"
+#include <gtest/gtest.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+USE_OP(add_two);
+
+namespace paddle {
+namespace framework {
+
+TEST(GradOpBuilder, AddTwo) {
+  std::shared_ptr<OperatorBase> add_op(
+      OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
+  std::shared_ptr<OperatorBase> grad_add_op = OpRegistry::CreateGradOp(*add_op);
+  EXPECT_EQ(static_cast<int>(grad_add_op->inputs_.size()), 4);
+  EXPECT_EQ(static_cast<int>(grad_add_op->outputs_.size()), 2);
+  EXPECT_EQ(grad_add_op->Input("X"), "x");
+  EXPECT_EQ(grad_add_op->Input("Y"), "y");
+  EXPECT_EQ(grad_add_op->Input("Out"), "out");
+  EXPECT_EQ(grad_add_op->Input("Out@GRAD"), "out@GRAD");
+  EXPECT_EQ(grad_add_op->Output("X@GRAD"), "x@GRAD");
+  EXPECT_EQ(grad_add_op->Output("Y@GRAD"), "y@GRAD");
+}
+
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/images/duplicate_op.graffle b/paddle/framework/images/duplicate_op.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5979f792e252f028a615729215529c2be42d9165
Binary files /dev/null and b/paddle/framework/images/duplicate_op.graffle differ
diff --git a/paddle/framework/images/duplicate_op.png b/paddle/framework/images/duplicate_op.png
new file mode 100644
index 0000000000000000000000000000000000000000..f299c5d37f260a1bb0daec886f0a4ee1c1f31c92
Binary files /dev/null and b/paddle/framework/images/duplicate_op.png differ
diff --git a/paddle/framework/images/duplicate_op2.graffle b/paddle/framework/images/duplicate_op2.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..2b658085d6a55d368c320051ba7f94ec2900f13c
Binary files /dev/null and b/paddle/framework/images/duplicate_op2.graffle differ
diff --git a/paddle/framework/images/duplicate_op2.png b/paddle/framework/images/duplicate_op2.png
new file mode 100644
index 0000000000000000000000000000000000000000..c5588015d1450fd8c1bda3580680d884494868bb
Binary files /dev/null and b/paddle/framework/images/duplicate_op2.png differ
diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2cd378c6b21303d1a24206ba3010b0d035aaa766
--- /dev/null
+++ b/paddle/framework/net.cc
@@ -0,0 +1,78 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+void NetOp::CompleteAddOp(bool calc) {
+  add_op_done_ = true;
+  if (!calc) return;
+  std::unordered_set<std::string> input_set;
+  std::unordered_set<std::string> output_set;
+  std::unordered_set<std::string> temp_output;
+  for (auto& op : ops_) {
+    for (auto& ipt : op->inputs_) {
+      if (!Contains(output_set, ipt)) {  // Not other op's output
+        input_set.insert(ipt);
+      } else {
+        temp_output.insert(ipt);
+      }
+    }
+
+    for (auto& opt : op->outputs_) {
+      output_set.insert(opt);
+    }
+  }
+
+  inputs_.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
+  std::sort(inputs_.begin(), inputs_.end());
+
+  outputs_.reserve(output_set.size());
+  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs_));
+  std::sort(outputs_.begin(), outputs_.end());
+
+  std::vector<int> tmp_index;
+  tmp_index.reserve(temp_output.size());
+  int output_len = static_cast<int>(outputs_.size());
+  for (int i = 0; i < output_len; ++i) {
+    if (Contains(temp_output, outputs_[i])) {
+      tmp_index.push_back(i);
+    }
+  }
+
+  attrs_["temporary_index"] = tmp_index;
+}
+
+std::string NetOp::DebugString() const {
+  std::ostringstream os;
+  os << OperatorBase::DebugString() << std::endl;
+  for (auto& op : ops_) {
+    std::istringstream is(op->DebugString());
+    for (std::string line; std::getline(is, line);) {
+      os << "    " << line << std::endl;
+    }
+  }
+  return os.str();
+}
+
+bool NetOp::IsNetOp() const { return true; }
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
new file mode 100644
index 0000000000000000000000000000000000000000..acf1a69da9fd8adce1bd89367c882eade052e725
--- /dev/null
+++ b/paddle/framework/net.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/operator.h>
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/scope.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+/**
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
+ *
+ * Network is the container and controller of a set of operators.
+
+ * A network object knows all Operators belonging to this network. Variables,
+ * which are inputs and outputs of these operators, are created and managed by a
+ * hierarchy of Scope objects.
+ *
+ * This is the base class of network, all the networks should implement the APIs
+ * it defines.
+ */
+class NetOp : public OperatorBase {
+ public:
+  /**
+   * Infer all the operators' input and output variables' shapes, will be called
+   * before every mini-batch
+   */
+  void InferShape(const Scope& scope) const override {
+    for (auto& op : ops_) {
+      op->InferShape(scope);
+    }
+  }
+
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with the `scope`, if no scope is provided, default
+   * scope will be used instead. If no OpContext is provicded, default context
+   * will be used.
+   */
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, dev_ctx);
+    }
+  }
+
+  /**
+   * @brief Add an operator by ptr
+   */
+  void AddOp(const std::shared_ptr<OperatorBase>& op) {
+    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
+    ops_.push_back(op);
+  }
+
+  void InsertOp(size_t pos, const std::shared_ptr<OperatorBase>& op) {
+    PADDLE_ENFORCE(!add_op_done_,
+                   "Cannot InsertOp when this network is sealed");
+    PADDLE_ENFORCE(op != nullptr, "Cannot Insert Null op");
+    PADDLE_ENFORCE(pos <= ops_.size(), "Out of range");
+    ops_.insert(ops_.begin() + pos, op);
+  }
+
+  void CompleteAddOp(bool calculate = true);
+
+  std::string DebugString() const override;
+
+  bool IsNetOp() const override;
+
+  std::vector<std::shared_ptr<OperatorBase>> ops_;
+
+ private:
+  bool add_op_done_{false};
+
+  template <typename T, typename KeyType>
+  static bool Contains(T container, KeyType key) {
+    return container.find(key) != container.end();
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f32e456e5d142bf8203f9ec03e8059772c4f5c99
--- /dev/null
+++ b/paddle/framework/net_op_test.cc
@@ -0,0 +1,89 @@
+#include <gtest/gtest.h>
+#include <paddle/framework/net.h>
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace framework {
+
+static int infer_shape_cnt = 0;
+static int run_cnt = 0;
+
+class TestOp : public OperatorBase {
+ public:
+  void InferShape(const framework::Scope& scope) const override {
+    ++infer_shape_cnt;
+  }
+  void Run(const framework::Scope& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {
+    ++run_cnt;
+  }
+};
+
+class EmptyOp : public OperatorBase {
+ public:
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+};
+
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+
+TEST(OpKernel, all) {
+  auto net = std::make_shared<NetOp>();
+  ASSERT_NE(net, nullptr);
+
+  auto op1 = std::make_shared<TestOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net->AddOp(op1);
+
+  auto op2 = std::make_shared<TestOp>();
+  op2->inputs_ = {"y", "w2", "b2"};
+  op2->outputs_ = {"z"};
+  net->AddOp(op2);
+
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_);
+  AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_);
+  auto tmp_idx_iter = net->attrs_.find("temporary_index");
+  ASSERT_NE(net->attrs_.end(), tmp_idx_iter);
+  auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second);
+  ASSERT_EQ(1UL, tmp_idx.size());
+  ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
+
+  Scope scope;
+  platform::CPUDeviceContext dev_ctx;
+
+  net->InferShape(scope);
+  net->Run(scope, dev_ctx);
+  ASSERT_EQ(2, infer_shape_cnt);
+  ASSERT_EQ(2, run_cnt);
+  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
+}
+
+TEST(Net, insert_op) {
+  NetOp net;
+  auto op1 = std::make_shared<EmptyOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net.AddOp(op1);
+  net.InsertOp(0, op1);
+  ASSERT_EQ(2UL, net.ops_.size());
+  net.InsertOp(2, op1);
+  ASSERT_EQ(3UL, net.ops_.size());
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto
index 22df6f9c6b70277ddbf31e0432401889e3aa7483..366c84e53dc29e41eefbaef0a6452e01c4fe37bd 100644
--- a/paddle/framework/op_proto.proto
+++ b/paddle/framework/op_proto.proto
@@ -34,6 +34,11 @@ message AttrProto {
 
     // Supported attribute comments. It helps 3rd-party language generate doc-string.
     required string comment = 3;
+
+    // If that attribute is generated, it means the Paddle third language
+    // binding has responsibility to fill that attribute. End-User should
+    // not set that attribute.
+    optional bool generated = 4 [default=false];
 }
 
 // Input or output message for 3rd-party language binding.
@@ -45,6 +50,45 @@ message VarProto {
 
     // The comment for that input. It helps 3rd-party language generate doc-string.
     required string comment = 2;
+
+    // Is that input/output could be a list or not.
+    // If so, that Op should write a attributed named `input_format` or
+    // `output_format`.
+    //
+    // e.g.
+    //   If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
+    //   could be multiple, so the multiple of `X` and `W` is True, and OpDesc
+    //   will hold a attribute of them.
+    //
+    //   The Op desc of same fc could be
+    //   {
+    //      "type": "fc",
+    //      "input": ["X1", "X2", "W1", "W2", "b"],
+    //      "output": "fc.out",
+    //      "attrs" : {
+    //        "input_format": [0, 2, 4, 5]
+    //      }
+    //   }
+    //
+    optional bool multiple = 3 [default=false];
+
+    // It marks that output is a temporary output. That output is not used by
+    // user, but used by other op internally as input. If other op is not use
+    // that output, it could be optimized early.
+    //
+    // Attribute temporary_index will be set in OpDesc if there is some
+    // outputs are temporary.
+    //
+    // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
+    // attrs = {
+    //   "temporary_index": [1]
+    // }
+    optional bool temporary = 4 [default=false];
+
+    // The gradient of operator can be ignored immediately
+    // e.g. operator AddOp, y = x1 + x2, the gradient of dy/dx1, dy/dx2
+    // can be ignored for the future optimized on graph.
+    optional bool ignore_gradient = 6;
 }
 
 // Op protocol message for 3rd-party language binding.
@@ -66,4 +110,5 @@ message OpProto {
 
     // The type of that Op.
     required string type = 5;
+
 }
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1d14535c50b542733663a6900a8b5f2033290ea6
--- /dev/null
+++ b/paddle/framework/op_registry.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/framework/op_registry.h>
+
+namespace paddle {
+namespace framework {
+
+template <>
+void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::INT);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::STRING);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::INTS);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::FLOATS);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::STRINGS);
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 81241b5342d8900c205dd62f2a62dc2496010560..3e72e391266066de9e4114e68b43b066c15254db 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -1,26 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once
 
+#include <algorithm>
+#include <atomic>
+#include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
 #include "paddle/framework/attr_checker.h"
-
-//#include "paddle/framework/op_base.h"
+#include "paddle/framework/grad_op_builder.h"
 #include "paddle/framework/op_desc.pb.h"
-#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
 
 namespace paddle {
 namespace framework {
 
-//==================For test================//
-class OpBase {
- public:
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
-  AttributeMap attr_map_;
-
-  virtual std::string Run() const = 0;
-  virtual ~OpBase() {}
-};
-//=========================================//
-
 // helper class to set attribute type
 struct AttrTypeHelper {
   template <typename T>
@@ -64,190 +70,404 @@ struct AttrTypeHelper {
   }
 };
 
-template <>
-void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INT);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOAT);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRING);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INTS);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOATS);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRINGS);
-}
-
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
  public:
   OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : proto_(proto), op_checker_(op_checker) {}
 
+  ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+
+  void Validate() {
+    validated_ = true;
+    CheckNoDuplicatedInOutAttrs();
+  }
+
  protected:
-  void AddInput(const std::string& name, const std::string& comment) {
+  struct VariableBuilder {
+    VarProto* var_;
+    std::function<void()> on_multiple_;
+    std::function<void()> on_temporary_;
+
+    VariableBuilder& SetMultiple() {
+      var_->set_multiple(true);
+      on_multiple_();
+      return *this;
+    }
+
+    VariableBuilder& SetTemporary() {
+      PADDLE_ENFORCE(bool(on_temporary_), "Cannot set temporary");
+      var_->set_temporary(true);
+      on_temporary_();
+      return *this;
+    }
+
+    VariableBuilder& IgnoreGradient() {
+      var_->set_ignore_gradient(true);
+      return *this;
+    }
+  };
+
+  VariableBuilder AddInput(const std::string& name,
+                           const std::string& comment) {
     auto input = proto_->mutable_inputs()->Add();
-    *(input->mutable_name()) = name;
-    *(input->mutable_comment()) = comment;
+    *input->mutable_name() = name;
+    *input->mutable_comment() = comment;
+    return VariableBuilder{input, [=] { this->SetHasMultipleInput(); },
+                           nullptr};
   }
 
-  void AddOutput(const std::string& name, const std::string& comment) {
+  VariableBuilder AddOutput(const std::string& name,
+                            const std::string& comment) {
     auto output = proto_->mutable_outputs()->Add();
-    *(output->mutable_name()) = name;
-    *(output->mutable_comment()) = comment;
+    *output->mutable_name() = name;
+    *output->mutable_comment() = comment;
+    return VariableBuilder{output, [=] { this->SetHasMultipleOutput(); },
+                           [=] { this->SetHasTemporaryOutput(); }};
   }
 
   template <typename T>
   TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment) {
+                               const std::string& comment,
+                               bool generated = false) {
     auto attr = proto_->mutable_attrs()->Add();
-    *(attr->mutable_name()) = name;
-    *(attr->mutable_comment()) = comment;
+    *attr->mutable_name() = name;
+    *attr->mutable_comment() = comment;
+    attr->set_generated(generated);
     AttrTypeHelper::SetAttrType<T>(attr);
     return op_checker_->AddAttrChecker<T>(name);
   }
 
-  void AddType(const std::string& op_type) { proto_->set_type(op_type); }
-
   void AddComment(const std::string& comment) {
     *(proto_->mutable_comment()) = comment;
   }
 
+ private:
+  void SetHasMultiple(const std::string& in_out, bool* flag) {
+    if (!*flag) {
+      AddAttr<std::vector<int>>(in_out + "_format",
+                                "The multiple index of " + in_out +
+                                    "\n"
+                                    R"DOC(
+This attribute is used by Paddle core framework. Paddle's Op support each input
+or output could be a list of variable. This attribute is used to show how that
+list organized.
+
+e.g.
+  input = ["a", "b", "c", "d", "e", "f"]
+  input_format = [0, 4, 5, 6]
+
+means
+  The number of all input variables this op is six, and they are segmented into
+  three inputs.
+
+  The first input is input[0:4], second is input[4:5], third is input[5:6].
+)DOC",
+                                /*generated*/ true);
+      *flag = true;
+    }
+  }
+
+  void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); }
+  void SetHasMultipleOutput() {
+    SetHasMultiple("output", &has_multiple_output_);
+  }
+
+  void SetHasTemporaryOutput() {
+    if (!has_temporary_output_) {
+      AddAttr<std::vector<int>>("temporary_index",
+                                R"DOC(The temporary index of output.
+
+Not all output of Paddle Op is used by user. For faster computation, each op
+could output some its internal state to other op, other op could take that
+output to make compute faster.
+
+Add a mark to which output is temporary is helpful for future optimization.
+)DOC",
+                                /*generated*/ true)
+          .SetDefault(std::vector<int>());
+      has_temporary_output_ = true;
+    }
+  }
+
+  void CheckNoDuplicatedInOutAttrs() {
+    std::unordered_set<std::string> names;
+    auto checker = [&](const std::string& name) {
+      PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+      names.insert(name);
+    };
+    for (auto& attr : proto_->attrs()) {
+      checker(attr.name());
+    }
+    for (auto& input : proto_->inputs()) {
+      checker(input.name());
+    }
+    for (auto& output : proto_->outputs()) {
+      checker(output.name());
+    }
+  }
+
   OpProto* proto_;
   OpAttrChecker* op_checker_;
+  bool validated_{false};
+  bool has_multiple_input_{false};
+  bool has_multiple_output_{false};
+  bool has_temporary_output_{false};
 };
 
 class OpRegistry {
-  typedef std::function<OpBase*()> OpCreator;
+  using OpCreator = std::function<OperatorBase*()>;
+  using VarIndexMap = std::unordered_map<std::string, int>;
+  using VarNameList = std::vector<std::string>;
 
  public:
   template <typename OpType, typename ProtoMakerType>
   static void RegisterOp(const std::string& op_type) {
-    creators_[op_type] = []() { return new OpType; };
-    OpProto& op_proto = protos_[op_type];
-    OpAttrChecker& op_checker = op_checkers_[op_type];
-    ProtoMakerType(&op_proto, &op_checker);
-    PADDLE_ENFORCE(op_proto.IsInitialized() == true,
-                   "Fail to initialize %s's OpProto !", op_type);
-  }
-
-  static OpBase* CreateOp(const OpDesc& op_desc) {
-    std::string op_type = op_desc.type();
-    OpBase* op = (creators_.at(op_type))();
-    (op->inputs_).resize(op_desc.inputs_size());
-    for (int i = 0; i < op_desc.inputs_size(); ++i) {
-      (op->inputs_)[i] = op_desc.inputs(i);
+    op_creators()[op_type] = [] { return new OpType; };
+    OpAttrChecker& op_checker = op_checkers()[op_type];
+    OpProto& op_proto = protos()[op_type];
+    auto maker = ProtoMakerType(&op_proto, &op_checker);
+    maker.Validate();
+    *op_proto.mutable_type() = op_type;
+    PADDLE_ENFORCE(
+        op_proto.IsInitialized(),
+        "Fail to initialize %s's OpProto, because %s is not initialized",
+        op_type, op_proto.InitializationErrorString());
+
+    VarIndexMaps()[op_type].reset(new VarIndexMap());
+    auto& varmap = *VarIndexMaps()[op_type];
+    int idx = 0;
+    for (auto& var : op_proto.inputs()) {
+      varmap[var.name()] = idx++;
     }
-    (op->outputs_).resize(op_desc.outputs_size());
-    for (int i = 0; i < op_desc.outputs_size(); ++i) {
-      (op->outputs_)[i] = op_desc.outputs(i);
+    idx = 0;
+    for (auto& var : op_proto.outputs()) {
+      varmap[var.name()] = idx++;
     }
-    for (int i = 0; i < op_desc.attrs_size(); ++i) {
-      const AttrDesc& ith_attr = op_desc.attrs(i);
-      std::string name = ith_attr.name();
-      (op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr);
+  }
+
+  template <typename GradOpType>
+  static void RegisterGradOp(const std::string& op_type,
+                             const std::string& grad_op_type) {
+    op_creators()[grad_op_type] = [] { return new GradOpType; };
+    grad_ops()[op_type] = grad_op_type;
+  }
+
+  static std::shared_ptr<OperatorBase> CreateOp(const std::string& type,
+                                                const VarNameList& inputs,
+                                                const VarNameList& outputs,
+                                                const AttributeMap& attrs) {
+    auto op_create_it = op_creators().find(type);
+    PADDLE_ENFORCE(op_create_it != op_creators().end(),
+                   "Operator %s cannot be found.", type);
+
+    auto op = op_create_it->second();
+    op->type_ = type;
+    op->inputs_ = inputs;
+    op->outputs_ = outputs;
+
+    op->attrs_ = attrs;
+    op_checkers().at(type).Check(op->attrs_);
+
+    GenerateTempVariableName(op);
+
+    {
+      auto var_index_it = VarIndexMaps().find(type);
+      if (var_index_it != VarIndexMaps().end()) {
+        op->in_out_idxs_ = var_index_it->second;
+      }
     }
-    const OpAttrChecker& op_checker = op_checkers_.at(op_type);
-    op_checker.Check(op->attr_map_);
-    return op;
+
+    op->Init();
+    return std::shared_ptr<OperatorBase>(op);
   }
 
- private:
-  static std::unordered_map<std::string, OpCreator> creators_;
-  static std::unordered_map<std::string, OpProto> protos_;
-  static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
-};
+  static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc) {
+    std::vector<std::string> inputs;
+    inputs.reserve((size_t)op_desc.inputs_size());
+    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
+              std::back_inserter(inputs));
 
-std::unordered_map<std::string, std::function<OpBase*()>> OpRegistry::creators_;
-std::unordered_map<std::string, OpProto> OpRegistry::protos_;
-std::unordered_map<std::string, OpAttrChecker> OpRegistry::op_checkers_;
+    std::vector<std::string> outputs;
+    outputs.reserve((size_t)op_desc.outputs_size());
+    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
+              std::back_inserter(outputs));
 
-template <typename OpType, typename ProtoMakerType>
-class OpRegisterHelper {
- public:
-  OpRegisterHelper(std::string op_type) {
-    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
+    AttributeMap attrs;
+    for (auto& attr : op_desc.attrs()) {
+      attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
+    }
+
+    return CreateOp(op_desc.type(), inputs, outputs, attrs);
   }
-};
 
-#define REGISTER_OP(__op_class, __op_maker_class, __op_type)         \
-  class __op_class##Register {                                       \
-   private:                                                          \
-    const static OpRegisterHelper<__op_class, __op_maker_class> reg; \
-  };                                                                 \
-  const OpRegisterHelper<__op_class, __op_maker_class>               \
-      __op_class##Register::reg(#__op_type);
+  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
+    PADDLE_ENFORCE(!op.IsNetOp(),
+                   "Use framework::Backward to get backward ops");
+    GradOpBuilder builder(op);
+    std::shared_ptr<OperatorBase> grad_op(builder.Build());
+    grad_op->Init();
+    return grad_op;
+  }
 
-// Demos
+  static std::unordered_map<std::string, OpProto>& protos() {
+    static std::unordered_map<std::string, OpProto> protos_;
+    return protos_;
+  };
 
-class CosineOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg = "CosineOp runs! scale = " +
-                      std::to_string(boost::get<float>(attr_map_.at("scale")));
-    return msg;
+  static std::unordered_map<std::string, std::string>& grad_ops() {
+    static std::unordered_map<std::string, std::string> grad_ops_;
+    return grad_ops_;
   }
-};
 
-class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
-    AddType("cos");
-    AddComment("This is cos op");
+  static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
+  VarIndexMaps() {
+    static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
+    return maps_;
   }
-};
 
-REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
+  static std::unordered_map<std::string, OpCreator>& op_creators() {
+    static std::unordered_map<std::string, OpCreator> op_creators_;
+    return op_creators_;
+  }
+
+ private:
+  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
+    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
+    return op_checkers_;
+  };
+
+  static void GenerateTempVariableName(OperatorBase* op) {
+    static std::atomic<size_t> gUniqId(0UL);
+    for (auto& outname : op->outputs_) {
+      if (outname == OperatorBase::TMP_VAR_NAME()) {
+        outname += op->type_;
+        outname += "@";
+        outname += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
+};
 
-class MyTestOp : public OpBase {
+template <typename OpType, typename ProtoMakerType>
+class OpRegisterHelper {
  public:
-  virtual std::string Run() const {
-    std::string msg =
-        "MyTestOp runs! test_attr = " +
-        std::to_string(boost::get<int>(attr_map_.at("test_attr")));
-    return msg;
+  OpRegisterHelper(const char* op_type) {
+    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
   }
 };
 
-class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+template <typename GradOpType>
+class GradOpRegisterHelper {
  public:
-  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
-    };
-    AddAttr<int>("test_attr", "a simple test attribute")
-        .AddCustomChecker(my_checker);
-    AddType("my_test_op");
-    AddComment("This is my_test op");
+  GradOpRegisterHelper(const char* op_type, const char* grad_op_type) {
+    OpRegistry::RegisterGradOp<GradOpType>(op_type, grad_op_type);
   }
 };
 
-REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+/**
+ * check if MACRO is used in GLOBAL NAMESPACE.
+ */
+#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+/**
+ * Macro to Register Operator.
+ */
+#define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
+                                 "REGISTER_OP must be in global namespace"); \
+  static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \
+      __op_register_##__op_type##__(#__op_type);                             \
+  int __op_register_##__op_type##_handle__() { return 0; }
+
+/**
+ * Macro to Register Gradient Operator.
+ */
+#define REGISTER_GRADIENT_OP(__op_type, __grad_op_type, __grad_op_class)       \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
+      __reg_gradient_op__##__op_type##__grad_op_type,                          \
+      "REGISTER_GRADIENT_OP must be in global namespace");                     \
+  static ::paddle::framework::GradOpRegisterHelper<__grad_op_class>            \
+      __op_gradient_register_##__op_type##__grad_op_type##__(#__op_type,       \
+                                                             #__grad_op_type); \
+  int __op_gradient_register_##__op_type##__grad_op_type##_handle__() {        \
+    return 0;                                                                  \
+  }
+
+/**
+ * Macro to Forbid user register Gradient Operator.
+ */
+#define NO_GRADIENT(__op_type)                          \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                       \
+      __reg_gradient_op__##__op_type##__op_type##_grad, \
+      "NO_GRADIENT must be in global namespace")
+
+/**
+ * Macro to Register OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
+      "REGISTER_OP_KERNEL must be in global namespace");                  \
+  struct __op_kernel_register__##type##__##DEVICE_TYPE##__ {              \
+    __op_kernel_register__##type##__##DEVICE_TYPE##__() {                 \
+      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
+      key.place_ = PlaceType();                                           \
+      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
+          .reset(new __VA_ARGS__());                                      \
+    }                                                                     \
+  };                                                                      \
+  static __op_kernel_register__##type##__##DEVICE_TYPE##__                \
+      __reg_kernel_##type##__##DEVICE_TYPE##__;                           \
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
+
+// (type, KernelType)
+#define REGISTER_OP_GPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+
+// (type, KernelType)
+#define REGISTER_OP_CPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+
+/**
+ * Macro to mark what Operator and Kernel we will use and tell the compiler to
+ * link them into target.
+ */
+#define USE_OP_WITHOUT_KERNEL(op_type)                      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
+      __use_op_without_kernel_##op_type,                    \
+      "USE_OP_WITHOUT_KERNEL must be in global namespace"); \
+  extern int __op_register_##op_type##_handle__();          \
+  static int __use_op_ptr_##op_type##_without_kernel__      \
+      __attribute__((unused)) = __op_register_##op_type##_handle__()
+
+#define USE_OP_KERNEL(op_type, DEVICE_TYPE)                               \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __use_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
+      "USE_OP_KERNEL must be in global namespace");                       \
+  extern int __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__(); \
+  static int __use_op_ptr_##op_type##_##DEVICE_TYPE##_kernel__            \
+      __attribute__((unused)) =                                           \
+          __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
+
+// use Operator with only cpu kernel.
+#define USE_OP_CPU(op_type)       \
+  USE_OP_WITHOUT_KERNEL(op_type); \
+  USE_OP_KERNEL(op_type, CPU)
+
+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
+#else
+#define USE_OP(op_type) \
+  USE_OP_CPU(op_type);  \
+  USE_OP_KERNEL(op_type, GPU)
+#endif
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 17849ca0191db644884e766342b30461abf50298..9894928a7aa19bc6c7ad8b230562fb9a681cfebd 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -1,25 +1,78 @@
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
 
+namespace pd = paddle::framework;
+
+namespace paddle {
+namespace framework {
+class CosineOp : public OperatorBase {
+ public:
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+  void InferShape(const Scope& scope) const override {}
+};
+
+class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is cos op");
+  }
+};
+
+class MyTestOp : public OperatorBase {
+ public:
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+};
+
+class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op").SetMultiple();
+    AddOutput("output", "output of cosine op").SetTemporary();
+    auto my_checker = [](int i) {
+      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
+    };
+    AddAttr<int>("test_attr", "a simple test attribute")
+        .AddCustomChecker(my_checker);
+    AddComment("This is my_test op");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP(cos_sim, paddle::framework::CosineOp,
+            paddle::framework::CosineOpProtoAndCheckerMaker);
+REGISTER_OP(my_test_op, paddle::framework::MyTestOp,
+            paddle::framework::MyTestOpProtoAndCheckerMaker);
+
 TEST(OpRegistry, CreateOp) {
   paddle::framework::OpDesc op_desc;
   op_desc.set_type("cos_sim");
   op_desc.add_inputs("aa");
   op_desc.add_outputs("bb");
 
+  float scale = 3.3;
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
   attr->set_type(paddle::framework::AttrType::FLOAT);
-  attr->set_f(3.3);
+  attr->set_f(scale);
 
-  paddle::framework::OpBase* op =
+  std::shared_ptr<paddle::framework::OperatorBase> op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
-  std::string str = "CosineOp runs! scale = " + std::to_string(3.3);
-  ASSERT_EQ(str.size(), debug_str.size());
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
-  }
+  paddle::framework::Scope scope;
+  paddle::platform::CPUDeviceContext dev_ctx;
+  op->Run(scope, dev_ctx);
+  float scale_get = op->GetAttr<float>("scale");
+  ASSERT_EQ(scale_get, scale);
 }
 
 TEST(OpRegistry, IllegalAttr) {
@@ -35,9 +88,8 @@ TEST(OpRegistry, IllegalAttr) {
 
   bool caught = false;
   try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
-        paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::framework::EnforceNotMet err) {
+    paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "larger_than check fail";
     const char* err_msg = err.what();
@@ -54,15 +106,22 @@ TEST(OpRegistry, DefaultValue) {
   op_desc.add_inputs("aa");
   op_desc.add_outputs("bb");
 
-  paddle::framework::OpBase* op =
+  ASSERT_TRUE(op_desc.IsInitialized());
+
+  std::shared_ptr<paddle::framework::OperatorBase> op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
-  float default_value = 1.0;
-  std::string str = "CosineOp runs! scale = " + std::to_string(default_value);
-  ASSERT_EQ(str.size(), debug_str.size());
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
-  }
+  paddle::framework::Scope scope;
+  paddle::platform::CPUDeviceContext dev_ctx;
+  op->Run(scope, dev_ctx);
+  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
+}
+
+static void SetInputFormat(paddle::framework::OpDesc* desc) {
+  auto attr = desc->add_attrs();
+  attr->set_name("input_format");
+  attr->set_type(paddle::framework::INTS);
+  attr->mutable_ints()->Add(0);
+  attr->mutable_ints()->Add(1);
 }
 
 TEST(OpRegistry, CustomChecker) {
@@ -70,13 +129,13 @@ TEST(OpRegistry, CustomChecker) {
   op_desc.set_type("my_test_op");
   op_desc.add_inputs("ii");
   op_desc.add_outputs("oo");
+  SetInputFormat(&op_desc);
 
   // attr 'test_attr' is not set
   bool caught = false;
   try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
-        paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::framework::EnforceNotMet err) {
+    paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "Attribute 'test_attr' is required!";
     const char* err_msg = err.what();
@@ -93,9 +152,8 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_i(3);
   caught = false;
   try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
-        paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::framework::EnforceNotMet err) {
+    paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "'test_attr' must be even!";
     const char* err_msg = err.what();
@@ -111,12 +169,43 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_name("test_attr");
   attr->set_type(paddle::framework::AttrType::INT);
   attr->set_i(4);
-  paddle::framework::OpBase* op =
-      paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
-  std::string str = "MyTestOp runs! test_attr = " + std::to_string(4);
-  ASSERT_EQ(str.size(), debug_str.size());
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
+  SetInputFormat(&op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  paddle::platform::CPUDeviceContext dev_ctx;
+  paddle::framework::Scope scope;
+  op->Run(scope, dev_ctx);
+  int test_attr = op->GetAttr<int>("test_attr");
+  ASSERT_EQ(test_attr, 4);
+}
+
+class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker {
+ public:
+  TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<float>("scale", "scale of test op");
+    AddAttr<float>("scale", "scale of test op");
   }
-}
\ No newline at end of file
+};
+
+TEST(ProtoMaker, DuplicatedAttr) {
+  pd::OpProto op_proto;
+  pd::OpAttrChecker op_checker;
+  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+}
+
+class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
+ public:
+  TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddInput("input", "input of test op");
+  }
+};
+
+TEST(ProtoMaker, DuplicatedInOut) {
+  pd::OpProto op_proto;
+  pd::OpAttrChecker op_checker;
+  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cb86e6be2be3624bf54ee28193ca5d4c7bafa0eb
--- /dev/null
+++ b/paddle/framework/operator.cc
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <algorithm>
+
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+template <>
+Eigen::DefaultDevice& ExecutionContext::GetEigenDevice<
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
+  return *device_context_.get_eigen_device<Eigen::DefaultDevice>();
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice&
+ExecutionContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return *device_context_.get_eigen_device<Eigen::GpuDevice>();
+}
+#endif
+
+const std::string& OperatorBase::Input(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "Input Output Indices could not be nullptr");
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("input_format") == 0) {
+    return inputs_.at((size_t)it->second);
+  } else {
+    const auto& input_format = GetAttr<std::vector<int>>("input_format");
+    int idx = input_format[it->second];
+    return inputs_.at((size_t)idx);
+  }
+}
+
+std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
+  auto input_format = GetAttr<std::vector<int>>("input_format");
+  auto offset = in_out_idxs_->at(name);
+  PADDLE_ENFORCE(input_format.at(static_cast<size_t>(offset) + 1) <=
+                     static_cast<int>(inputs_.size()),
+                 "Input Out Of Range");
+
+  return std::vector<std::string>{
+      inputs_.begin() + input_format.at(offset),
+      inputs_.begin() + input_format.at(offset + 1)};
+}
+
+const std::string& OperatorBase::Output(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("output_format") == 0) {
+    return outputs_.at((size_t)it->second);
+  } else {
+    const auto& output_format = GetAttr<std::vector<int>>("output_format");
+    int idx = output_format[it->second];
+    return outputs_.at((size_t)idx);
+  }
+}
+
+std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
+  auto output_format = GetAttr<std::vector<int>>("output_format");
+  auto offset = in_out_idxs_->at(name);
+  PADDLE_ENFORCE(output_format.at(static_cast<size_t>(offset) + 1) <=
+                     static_cast<int>(outputs_.size()),
+                 "Output Out of Range");
+  return std::vector<std::string>{
+      outputs_.begin() + output_format.at(offset),
+      outputs_.begin() + output_format.at(offset + 1)};
+}
+
+std::string OperatorBase::DebugString() const {
+  std::stringstream ss;
+  ss << "Op(" << type_ << "), inputs:(";
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    ss << inputs_[i];
+    if (i != inputs_.size() - 1) {
+      ss << ", ";
+    }
+  }
+  ss << "), outputs:(";
+  for (size_t i = 0; i < outputs_.size(); ++i) {
+    ss << outputs_[i];
+    if (i != outputs_.size() - 1) {
+      ss << ", ";
+    }
+  }
+  ss << ").";
+  return ss.str();
+}
+
+void OperatorBase::Rename(const std::string& old_name,
+                          const std::string& new_name) {
+  std::replace(inputs_.begin(), inputs_.end(), old_name, new_name);
+  std::replace(outputs_.begin(), outputs_.end(), old_name, new_name);
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..55435103489ace11868eed61c38018d8ba357e65
--- /dev/null
+++ b/paddle/framework/operator.h
@@ -0,0 +1,323 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <boost/variant.hpp>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/utils/Error.h"
+
+namespace paddle {
+namespace framework {
+
+class OperatorBase;
+class InferShapeContext;
+class ExecutionContext;
+/**
+ * OperatorBase has the basic element that Net will call to do computation.
+ * Only CreateOperator from OpRegistry will new Operator directly. User
+ * should always construct a proto message OpDesc and call
+ * OpRegistry::CreateOp(op_desc) to get an Operator instance.
+ */
+class OperatorBase {
+ public:
+  /// If a variable is a empty variable, that name will be used.
+  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
+
+  /// If a variable is a temporary variable, that name will be set in Python,
+  /// but it will be convert to a unique name in scope after OpCreator.
+  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
+
+  /// If a variable's name has a certain suffix, it means that the
+  /// variable is the gradient of another varibale.
+  /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
+  static std::string GRAD_VAR_SUFFIX() { return "@GRAD"; }
+
+  /// Variables with this suffix are supposed to be filled up with zeros.
+  static std::string ZERO_VAR_SUFFIX() { return "@ZERO"; }
+
+  virtual ~OperatorBase() {}
+
+  template <typename T>
+  inline const T& GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+  virtual std::string DebugString() const;
+
+  /// Init will be called after CreateOperator, you can put some initialization
+  /// logic here.
+  virtual void Init() {}
+
+  /// InferShape infer the size of Variables used by this Operator with
+  /// information inside scope
+  virtual void InferShape(const Scope& scope) const = 0;
+
+  /// Net will call this function to Run an op.
+  virtual void Run(const Scope& scope,
+                   const platform::DeviceContext& dev_ctx) const = 0;
+
+  virtual bool IsNetOp() const { return false; }
+
+  /// rename inputs outputs name
+  void Rename(const std::string& old_name, const std::string& new_name);
+
+  //! Get a input with argument's name described in `op_proto`
+  const std::string& Input(const std::string& name) const;
+
+  //! Get a input which has multiple variables.
+  //! TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Inputs(const std::string& name) const;
+  //! Get a output with argument's name described in `op_proto`
+  const std::string& Output(const std::string& name) const;
+  //! Get an output which has multiple variables.
+  //! TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Outputs(const std::string& name) const;
+
+ public:
+  std::string type_;
+  // NOTE: in case of OpGrad, inputs_ contains:
+  // I (Inputs)
+  // O (Outputs)
+  // OG (Output Gradients)
+  std::vector<std::string> inputs_;
+  // NOTE: in case of OpGrad, outputs_ contains
+  // IG (Inputs Gradients)
+  std::vector<std::string> outputs_;
+  AttributeMap attrs_;
+  // store the arguments' offset described in op_desc.
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
+};
+
+class OperatorContext {
+ public:
+  OperatorContext(const OperatorBase* op, const Scope& scope)
+      : op_(*op), scope_(scope) {}
+
+  size_t InputSize() const { return op_.inputs_.size(); }
+
+  size_t OutputSize() const { return op_.outputs_.size(); }
+
+  const Variable* InputVar(const size_t index) const {
+    return scope_.FindVar(op_.inputs_.at(index));
+  }
+
+  Variable* OutputVar(const size_t index) const {
+    return scope_.FindVar(op_.outputs_.at(index));
+  }
+
+  const Variable* InputVar(const std::string& name) const {
+    return scope_.FindVar(op_.Input(name));
+  }
+
+  Variable* OutputVar(const std::string& name) const {
+    return scope_.FindVar(op_.Output(name));
+  }
+
+  const std::vector<const Variable*> MultiInputVar(
+      const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const Variable*> res;
+    res.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_.FindVar(name); });
+    return res;
+  }
+
+  std::vector<const Variable*> MultiOutputVar(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const Variable*> res;
+    res.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_.FindVar(name); });
+    return res;
+  }
+
+  template <typename T>
+  const T* Input(const size_t index) const {
+    auto var = InputVar(index);
+    PADDLE_ENFORCE(var != nullptr, "Input(%d) should not be nullptr", index);
+    return &var->Get<T>();
+  }
+
+  template <typename T>
+  T* Output(const size_t index) const {
+    auto var = OutputVar(index);
+    PADDLE_ENFORCE(var != nullptr, "Output(%d) should not be nullptr", index);
+    return var->GetMutable<T>();
+  }
+
+  template <typename T>
+  const T* Input(const std::string& name) const {
+    auto var = InputVar(name);
+    PADDLE_ENFORCE(var != nullptr, "Input(%s) should not be nullptr", name);
+    return &var->Get<T>();
+  }
+
+  template <typename T>
+  T* Output(const std::string& name) const {
+    auto var = OutputVar(name);
+    PADDLE_ENFORCE(var != nullptr, "Output(%s) should not be nullptr", name);
+    return var->GetMutable<T>();
+  }
+
+  template <typename T>
+  const std::vector<const T*> MultiInput(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     PADDLE_ENFORCE(var != nullptr,
+                                    "MultiInput(%s:%s) should not be nullptr",
+                                    name, sub_name);
+                     return &var->Get<T>();
+                   });
+    return res;
+  }
+
+  template <typename T>
+  std::vector<const T*> MultiOutput(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const T*> res;
+    res.reserve(names.size());
+    std::transform(names.begin(), names.end(), std::back_inserter(res),
+                   [&](const std::string& sub_name) {
+                     auto var = scope_.FindVar(sub_name);
+                     PADDLE_ENFORCE(var != nullptr,
+                                    "MultiOutput(%s:%s) should not be nullptr",
+                                    name, sub_name);
+                     return var->GetMutable<T>();
+                   });
+    return res;
+  }
+
+  const OperatorBase& op_;
+  const Scope& scope_;
+};
+
+class InferShapeContext : public OperatorContext {
+ public:
+  InferShapeContext(const OperatorBase* op, const Scope& scope)
+      : OperatorContext(op, scope) {}
+};
+
+template <typename T>
+struct EigenDeviceConverter;
+
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+#endif
+
+class ExecutionContext : public OperatorContext {
+ public:
+  ExecutionContext(const OperatorBase* op, const Scope& scope,
+                   const platform::DeviceContext& device_context)
+      : OperatorContext(op, scope), device_context_(device_context) {}
+
+  template <typename PlaceType,
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType& GetEigenDevice() const;
+
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+
+  const platform::DeviceContext& device_context_;
+};
+
+class OpKernel {
+ public:
+  /**
+   * ExecutionContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * ExecutionContext. User should construct it before run the Operator.
+   */
+
+  virtual void Compute(const ExecutionContext& context) const = 0;
+
+  virtual ~OpKernel() {}
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
+  struct OpKernelKey {
+    platform::Place place_;
+
+    OpKernelKey() = default;
+    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+      place_ = dev_ctx.GetPlace();
+    }
+
+    bool operator==(const OpKernelKey& o) const {
+      return platform::places_are_same_class(place_, o.place_);
+    }
+  };
+
+  struct OpKernelHash {
+    std::hash<bool> hash_;
+    size_t operator()(const OpKernelKey& key) const {
+      return hash_(platform::is_gpu_place(key.place_));
+    }
+  };
+
+  using OpKernelMap =
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+
+  void InferShape(const Scope& scope) const {
+    InferShape(InferShapeContext(this, scope));
+  }
+
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const final {
+    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
+    opKernel->Compute(ExecutionContext(this, scope, dev_ctx));
+  }
+
+  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
+  AllOpKernels() {
+    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
+    return g_all_op_kernels;
+  }
+
+ protected:
+  virtual void InferShape(const InferShapeContext& ctx) const = 0;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6a6a802b7da05c37a317540030836baa28a89cd7
--- /dev/null
+++ b/paddle/framework/operator_test.cc
@@ -0,0 +1,263 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/operator.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+static int op_run_num = 0;
+
+class OpWithoutKernelTest : public OperatorBase {
+ public:
+  void Init() override { x = 1; }
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    op_run_num++;
+    ASSERT_EQ((int)inputs_.size(), 1);
+    ASSERT_EQ((int)outputs_.size(), 1);
+    ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope.FindVar(outputs_[0]), nullptr);
+  }
+
+ public:
+  float x = 0;
+};
+
+class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+                                           OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddOutput("output", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op");
+    AddComment("This is test op");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
+            paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
+
+TEST(OperatorBase, all) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("test_operator");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUDeviceContext device_context;
+  paddle::framework::Scope scope;
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  scope.NewVar("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
+  op->InferShape(scope);
+  op->Run(scope, device_context);
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
+}
+
+namespace paddle {
+namespace framework {
+
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "input of test op");
+    AddOutput("y", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+
+static int cpu_kernel_run_num = 0;
+
+class OpWithKernelTest : public OperatorWithKernel {
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {}
+};
+
+template <typename T1, typename T2>
+class CPUKernelTest : public OpKernel {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    std::cout << "this is cpu kernel" << std::endl;
+    std::cout << ctx.op_.DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ(ctx.op_.Input("x"), "IN1");
+    ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
+  }
+};
+
+// multiple inputs test
+class OperatorMultiInputsTest : public OperatorBase {
+ public:
+  void Init() override { x = 1; }
+  void InferShape(const Scope& scope) const override {}
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    ASSERT_EQ(scope.FindVar(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope.FindVar(outputs_[0]), nullptr);
+    ASSERT_EQ(Input("x"), "IN1");
+    ASSERT_EQ(Input("y"), "OUT1");
+  }
+
+ public:
+  float x = 0;
+};
+
+class OpKernelTestMultiInputsProtoAndCheckerMaker
+    : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
+                                              OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("xs", "inputs of test op").SetMultiple();
+    AddInput("k", "input of test op");
+    AddOutput("ys", "outputs of test op").SetMultiple();
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+
+class CPUKernalMultiInputsTest : public OpKernel {
+ public:
+  void Compute(const ExecutionContext& ctx) const {
+    auto xs = ctx.op_.Inputs("xs");
+    ASSERT_EQ(xs.size(), 3UL);
+    ASSERT_EQ(xs[0], "x0");
+    ASSERT_EQ(xs[1], "x1");
+    ASSERT_EQ(xs[2], "x2");
+
+    auto inVar0 = ctx.MultiInputVar("xs");
+    ASSERT_EQ(inVar0.size(), 3);
+
+    auto intVar1 = ctx.InputVar("k");
+    ASSERT_NE(intVar1, nullptr);
+
+    auto outVar0 = ctx.MultiOutputVar("ys");
+    ASSERT_EQ(outVar0.size(), 2);
+
+    auto inTensor0 = ctx.MultiInput<Tensor>("xs");
+    ASSERT_EQ(inTensor0.size(), 3);
+
+    auto intTensor1 = ctx.Input<Tensor>("k");
+    ASSERT_NE(intTensor1, nullptr);
+
+    auto outTensor0 = ctx.MultiOutput<Tensor>("ys");
+    ASSERT_EQ(outTensor0.size(), 2);
+
+    auto k = ctx.op_.Input("k");
+    ASSERT_EQ(k, "k0");
+
+    auto ys = ctx.op_.Outputs("ys");
+    ASSERT_EQ(ys.size(), 2UL);
+    ASSERT_EQ(ys[0], "y0");
+    ASSERT_EQ(ys[1], "y1");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);
+
+// test with single input
+TEST(OpKernel, all) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("op_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::framework::Scope scope;
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
+  op->Run(scope, cpu_device_context);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+}
+
+REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
+                       paddle::framework::CPUKernalMultiInputsTest);
+
+// test with multi inputs
+TEST(OpKernel, multi_inputs) {
+  using namespace paddle::framework;
+
+  OpDesc op_desc;
+  op_desc.set_type("op_multi_inputs_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "x0";
+  *op_desc.mutable_inputs()->Add() = "x1";
+  *op_desc.mutable_inputs()->Add() = "x2";
+  *op_desc.mutable_inputs()->Add() = "k0";
+  *op_desc.mutable_outputs()->Add() = "y0";
+  *op_desc.mutable_outputs()->Add() = "y1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  auto attr0 = op_desc.mutable_attrs()->Add();
+  attr0->set_name("input_format");
+  attr0->set_type(paddle::framework::AttrType::INTS);
+  auto input_format = attr0->mutable_ints();
+  input_format->Add(0);  // x0
+  input_format->Add(3);  // k
+  input_format->Add(4);  // end
+
+  auto attr1 = op_desc.mutable_attrs()->Add();
+  attr1->set_name("output_format");
+  attr1->set_type(paddle::framework::AttrType::INTS);
+  auto output_format = attr1->mutable_ints();
+  output_format->Add(0);  // y0
+  output_format->Add(2);  // y1
+
+  paddle::platform::CPUDeviceContext cpu_device_context;
+  paddle::framework::Scope scope;
+  scope.NewVar("x0")->GetMutable<Tensor>();
+  scope.NewVar("x1")->GetMutable<Tensor>();
+  scope.NewVar("x2")->GetMutable<Tensor>();
+  scope.NewVar("k0")->GetMutable<Tensor>();
+  scope.NewVar("y0")->GetMutable<Tensor>();
+  scope.NewVar("y1")->GetMutable<Tensor>();
+
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  op->Run(scope, cpu_device_context);
+}
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
new file mode 100644
index 0000000000000000000000000000000000000000..080b4ac621c1b8c0d4b4e7b26f394cf2be263894
--- /dev/null
+++ b/paddle/framework/scope.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/scope.h"
+#include "paddle/string/printf.h"
+
+namespace paddle {
+namespace framework {
+
+Scope::~Scope() {
+  DropKids();
+  for (auto& kv : vars_) delete kv.second;
+}
+
+Scope& Scope::NewScope() const {
+  kids_.push_back(new Scope(this));
+  return *kids_.back();
+}
+
+Variable* Scope::NewVar(const std::string& name) {
+  auto iter = vars_.find(name);
+  if (iter != vars_.end()) {
+    return iter->second;
+  }
+  Variable* v = new Variable();
+  vars_[name] = v;
+  v->name_ = &(vars_.find(name)->first);
+  return v;
+}
+
+Variable* Scope::NewVar() {
+  return NewVar(string::Sprintf("%p.%d", this, vars_.size()));
+}
+
+Variable* Scope::FindVar(const std::string& name) const {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second;
+  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
+}
+
+const Scope* Scope::FindScope(const Variable* var) const {
+  for (auto& kv : vars_) {
+    if (kv.second == var) {
+      return this;
+    }
+  }
+  return (parent_ == nullptr) ? nullptr : parent_->FindScope(var);
+}
+void Scope::DropKids() {
+  for (Scope* s : kids_) delete s;
+  kids_.clear();
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index a4470f726fb0d59a82db29b3239c111ea1569c55..2ba3f8ed355b48800cfa4180e4e8a94f2c9958a9 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -14,15 +14,17 @@ limitations under the License. */
 
 #pragma once
 
+#include <list>
 #include <string>
 #include <unordered_map>
-#include <vector>
 
 #include "paddle/framework/variable.h"
 
 namespace paddle {
 namespace framework {
 
+class Scope;
+
 /**
  * @brief Scope that manage all variables.
  *
@@ -33,62 +35,42 @@ namespace framework {
  */
 class Scope {
  public:
-  /**
-   * @brief Initialize s Scope without parent.
-   */
   Scope() {}
+  ~Scope();
+
+  // Disable Copy, Assign, Move.
+  Scope(const Scope& other) = delete;
+  Scope& operator=(const Scope& other) = delete;
+  Scope(Scope&& other) = delete;
+
+  /// Create a sub-scope. Returns a reference other than a pointer so
+  /// to prevent from manual deletion.
+  /// Mark it to const because that new kid scope cannot change parent scope.
+  Scope& NewScope() const;
+
+  /// Create a variable with given name if it doesn't exist.
+  Variable* NewVar(const std::string& name);
 
-  /**
-   * @brief Initialize a Scope with parent.
-   */
-  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
-
-  /**
-   * @brief Create Variable
-   *
-   * Create Variable in this Scope. Return the exist one if Variable already
-   * been created.
-   */
-  Variable* CreateVariable(const std::string& name) {
-    auto var = GetVariable(name);
-    if (var) {
-      return var;
-    } else {
-      vars_[name] = std::unique_ptr<Variable>(new Variable());
-      return GetVariable(name);
-    }
-  }
-
-  /**
-   * @brief Get Variable.
-   *
-   * Get Variable from this Scope, this function will recursive find Variable
-   * from it's parent scope. Return nullptr if not found.
-   */
-  Variable* GetVariable(const std::string& name) const {
-    auto it = vars_.find(name);
-    if (it != vars_.end()) {
-      return it->second.get();
-    } else if (parent_ != nullptr) {
-      return parent_->GetVariable(name);
-    } else {
-      return nullptr;
-    }
-  }
-
-  /**
-   * @brief If this scope has a Var named name.
-   *
-   * Find if there is a Variable in this scope and it's parent scope
-   */
-  bool HasVariable(const std::string& name) const {
-    return (vars_.find(name) != vars_.end() ||
-            (parent_ && parent_->HasVariable(name)));
-  }
+  /// Create a variable with a scope-unique name.
+  Variable* NewVar();
+
+  /// Find a variable in the scope or any of its ancestors.  Returns
+  /// nullptr if cannot find.
+  Variable* FindVar(const std::string& name) const;
+
+  /// Find the scope or an ancestor scope that contains the given variable.
+  const Scope* FindScope(const Variable* var) const;
+
+  /// Drop all kids scopes belonged to this scope.
+  void DropKids();
 
  private:
-  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-  std::shared_ptr<Scope> parent_{nullptr};
+  // Call Scope::NewScope for a sub-scope.
+  explicit Scope(Scope const* parent) : parent_(parent) {}
+
+  std::unordered_map<std::string, Variable*> vars_;
+  mutable std::list<Scope*> kids_;
+  Scope const* parent_{nullptr};
 };
 
 }  // namespace framework
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index df1afb200ce9e75c5b1e40f2da56667487ae3576..9d51e355b0f6336d2f875ff2d77266b261baf5ac 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -15,44 +15,42 @@ limitations under the License. */
 #include "paddle/framework/scope.h"
 #include "gtest/gtest.h"
 
-TEST(Scope, Create) {
-  using paddle::framework::Scope;
-  using paddle::framework::Variable;
+using paddle::framework::Scope;
+using paddle::framework::Variable;
 
-  auto scope = std::make_shared<Scope>();
+TEST(Scope, VarsShadowing) {
+  Scope s;
+  Scope& ss1 = s.NewScope();
+  Scope& ss2 = s.NewScope();
 
-  Variable* var0 = scope->CreateVariable("");
-  EXPECT_NE(var0, nullptr);
+  Variable* v0 = s.NewVar("a");
+  Variable* v1 = ss1.NewVar("a");
 
-  /// GetVariable will return nullptr if not exist.
-  Variable* var1 = scope->GetVariable("a");
-  EXPECT_EQ(var1, nullptr);
+  EXPECT_NE(v0, v1);
 
-  /// CreateVariable will return one.
-  Variable* var2 = scope->CreateVariable("a");
-  EXPECT_NE(var2, nullptr);
+  EXPECT_EQ(v0, s.FindVar("a"));
+  EXPECT_EQ(v1, ss1.FindVar("a"));
+  EXPECT_EQ(v0, ss2.FindVar("a"));
+}
 
-  /// Get the created variable.
-  Variable* var3 = scope->GetVariable("a");
-  EXPECT_EQ(var2, var3);
+TEST(Scope, FindVar) {
+  Scope s;
+  Scope& ss = s.NewScope();
 
-  /// CreateVariable will just return the variable if it's
-  /// already exist.
-  Variable* var4 = scope->CreateVariable("a");
-  EXPECT_EQ(var4, var2);
-}
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_EQ(nullptr, ss.FindVar("a"));
 
-TEST(Scope, Parent) {
-  using paddle::framework::Scope;
-  using paddle::framework::Variable;
+  ss.NewVar("a");
 
-  auto parent_scope = std::make_shared<Scope>();
-  auto scope = std::make_shared<Scope>(parent_scope);
+  EXPECT_EQ(nullptr, s.FindVar("a"));
+  EXPECT_NE(nullptr, ss.FindVar("a"));
+}
 
-  Variable* var0 = parent_scope->CreateVariable("a");
-  EXPECT_NE(var0, nullptr);
+TEST(Scope, FindScope) {
+  Scope s;
+  Scope& ss = s.NewScope();
+  Variable* v = s.NewVar("a");
 
-  /// GetVariable will get Variable from parent scope if exist.
-  Variable* var1 = scope->GetVariable("a");
-  EXPECT_EQ(var0, var1);
+  EXPECT_EQ(&s, s.FindScope(v));
+  EXPECT_EQ(&s, ss.FindScope(v));
 }
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ea7b2a1f7b17d9abc2c2e14de5ecd1cf4a1a5027
--- /dev/null
+++ b/paddle/framework/tensor.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace framework {}
+}  // namespace paddle
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index ce5d98b04e6b53fcedc4fc4610d9390e64846b2a..85af0e20a4174344716452bc03dcb1d5e596fe8d 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -14,85 +14,157 @@ limitations under the License. */
 
 #pragma once
 
+#include <cstdint>
+#include <cstring>
 #include <memory>
-#include <type_traits>
+#include <typeindex>
 #include "paddle/framework/ddim.h"
-#include "paddle/framework/enforce.h"
 #include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
+
 namespace framework {
 
 class Tensor {
  public:
+  template <bool less, size_t i, typename... args>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
+
+  template <typename T, size_t D, int MajorType, typename IndexType>
+  friend struct EigenTensor;
+
+  template <typename T, int MajorType, typename IndexType>
+  friend struct EigenVector;
+
+ public:
+  Tensor() : offset_(0) {}
+
+  /*! Return a pointer to mutable memory block. */
   template <typename T>
-  const T* data() const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tensor::data must be called after Tensor::mutable_data.");
-    return static_cast<const T*>(holder_->Ptr());
-  }
-
-  template <typename T,  // must be POD types
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims, paddle::platform::Place place) {
-    if (holder_ == nullptr ||
-        !(holder_->Place() ==
-          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T)) {
-      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
-    }
-    return static_cast<T*>(holder_->Ptr());
-  }
+  inline T* data();
 
-  template <typename T,  // must be POD types
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims) {
-    return mutable_data<T>(dims, paddle::platform::get_place());
-  }
+  /*! Return a pointer to constant memory block. */
+  template <typename T>
+  inline const T* data() const;
+
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+  /*! Resize the dimensions of the memory block. */
+  inline void Resize(const DDim& dims);
+
+  /*! The internal of two tensors share the same memory block. */
+  template <typename T>
+  inline void ShareDataWith(const Tensor& src);
+
+  /**
+   * @brief   Copy the content of external tensor to a new place.
+   *
+   * @param[in] src   The external tensor.
+   * @param[in] ctx   The device context contains place where to store.
+   *
+   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+   */
+  template <typename T>
+  inline void CopyFrom(const Tensor& src, const platform::Place& dst_place);
+
+  /**
+   * @brief   Return the slice of the tensor.
+   *
+   * @param[in] begin_idx   The begin index of the slice.
+   * @param[in] end_idx     The end index of the slice.
+   */
+  template <typename T>
+  inline Tensor Slice(const int& begin_idx, const int& end_idx) const;
 
  private:
-  // Placeholder hides type T, so it doesn't appear as a template
-  // parameter of Variable.
+  template <typename T>
+  inline void check_memory_size() const;
+
+ private:
+  /**
+   * @note    Placeholder hides type T, so it doesn't appear as a template
+   *          parameter of Variable.
+   */
   struct Placeholder {
     virtual ~Placeholder() {}
-    virtual void* Ptr() const = 0;
-    virtual paddle::platform::Place Place() const = 0;
-    virtual size_t Size() const = 0;
+    virtual void* ptr() const = 0;
+    virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
+    virtual platform::Place place() const = 0;
   };
 
-  template <typename T>
+  template <typename T, typename Place>
   struct PlaceholderImpl : public Placeholder {
-   private:
-    class Deleter {
-     public:
-      Deleter(platform::Place place) : place_(place) {}
-      void operator()(T* ptr) {
-        paddle::memory::Free(place_, static_cast<void*>(ptr));
-      }
-
-     private:
-      paddle::platform::Place place_;
-    };
-
-   public:
-    PlaceholderImpl(paddle::platform::Place place, size_t size)
-        : ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
-               Deleter(place)),
+    PlaceholderImpl(Place place, size_t size)
+        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
+               memory::PODDeleter<T, Place>(place)),
           place_(place),
-          size_(size) {}
+          size_(size) {
+      PADDLE_ENFORCE(ptr_ != nullptr, "Insufficient %s memory to allocation.",
+                     is_cpu_place(place_) ? "CPU" : "GPU");
+    }
+
+    virtual size_t size() const { return size_; }
+    virtual platform::Place place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
+    virtual std::type_index type() const { return std::type_index(typeid(T)); }
+
+    /*! the pointer of memory block. */
+    std::unique_ptr<T, memory::PODDeleter<T, Place>> ptr_;
 
-    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual size_t Size() const { return size_; }
-    virtual paddle::platform::Place Place() const { return place_; }
+    /*! the place of memory block. */
+    platform::Place place_;
 
-    std::unique_ptr<T, Deleter> ptr_;
-    paddle::platform::Place place_;  // record the place of ptr_.
-    size_t size_;                    // size of the memory block.
+    /*! the size of memory block. */
+    size_t size_;
   };
 
-  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
+  /*! holds the memory block if allocated. */
+  std::shared_ptr<Placeholder> holder_;
+
+  /*! points to dimensions of memory block. */
+  DDim dims_;
+
+  /**
+   * @brief   A PlaceHolder may be shared by more than one tensor.
+   *
+   * @note    Some of them may be slices of the others. So the offset_
+   *          is introduced here to indicate the byte offset between
+   *          PlaceHolder::ptr_ and where the tensor data really begins.
+   */
+  size_t offset_;
 };
 
 }  // namespace framework
 }  // namespace paddle
+
+#include "paddle/framework/tensor_impl.h"
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..92621f8c18ec0d03160a23c462830d14272c7f64
--- /dev/null
+++ b/paddle/framework/tensor_impl.h
@@ -0,0 +1,143 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/memory/memcpy.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+inline void Tensor::check_memory_size() const {
+  PADDLE_ENFORCE(holder_ != nullptr,
+                 "Tenosr holds no memory. Call Tensor::mutable_data first.");
+  PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
+                 "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                 "first to re-allocate memory.");
+}
+
+template <typename T>
+inline const T* Tensor::data() const {
+  check_memory_size<T>();
+  return reinterpret_cast<const T*>(
+      reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+}
+
+template <typename T>
+inline T* Tensor::data() {
+  check_memory_size<T>();
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  Resize(dims);
+  return mutable_data<T>(place);
+}
+
+template <typename T>
+inline T* Tensor::mutable_data(platform::Place place) {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+  PADDLE_ENFORCE(product(dims_) > 0,
+                 "Tensor's numel must be larger than zero to call "
+                 "Tensor::mutable_data. Call Tensor::set_dim first.");
+  /* some versions of boost::variant don't have operator!= */
+  size_t size = product(dims_) * sizeof(T);
+  if (holder_ == nullptr || !(holder_->place() == place) ||
+      holder_->size() < size + offset_) {
+    if (platform::is_cpu_place(place)) {
+      holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+          boost::get<platform::CPUPlace>(place), size));
+    } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+      PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+    }
+#else
+      holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+          boost::get<platform::GPUPlace>(place), size));
+    }
+#endif
+    offset_ = 0;
+  }
+  return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                              offset_);
+}
+
+template <typename T>
+inline void Tensor::ShareDataWith(const Tensor& src) {
+  src.check_memory_size<T>();
+  *this = src;
+}
+
+template <typename T>
+inline void Tensor::CopyFrom(const Tensor& src,
+                             const platform::Place& dst_place) {
+  src.check_memory_size<T>();
+  Resize(src.dims());
+
+  auto src_place = src.holder_->place();
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+
+  auto dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+
+  auto size = product(src.dims_) * sizeof(T);
+
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifndef PADDLE_ONLY_CPU
+  else if (platform::is_gpu_place(src_place) &&
+           platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size, 0);
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::GPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::GPUPlace>(src_place), src_ptr, size, 0);
+  }
+
+#endif
+}
+
+template <typename T>
+inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
+  check_memory_size<T>();
+  PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
+  PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
+  PADDLE_ENFORCE(begin_idx < end_idx,
+                 "Begin index must be less than end index.");
+  PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+  int base = product(dims_) / dims_[0];
+  Tensor dst;
+  dst.holder_ = holder_;
+  DDim dst_dims = dims_;
+  dst_dims[0] = end_idx - begin_idx;
+  dst.Resize(dst_dims);
+  dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+  return dst;
+}
+
+inline void Tensor::Resize(const DDim& dims) { dims_ = dims; }
+
+inline const DDim& Tensor::dims() const { return dims_; }
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 727d81f8d72e39ec564c42a48bf7ff64204adfff..ef1cc10b840896d9ab97f963fc12a4971cd74e1f 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -15,15 +15,28 @@
 #include <gtest/gtest.h>
 #include <string>
 
-TEST(Tensor, ASSERT) {
-  paddle::framework::Tensor cpu_tensor;
+TEST(Tensor, Dims) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor tt;
+  tt.Resize(make_ddim({2, 3, 4}));
+  DDim dims = tt.dims();
+  ASSERT_EQ(arity(dims), 3);
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(i + 2, dims[i]);
+  }
+}
+
+TEST(Tensor, DataAssert) {
+  paddle::framework::Tensor src_tensor;
 
   bool caught = false;
   try {
-    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
-  } catch (paddle::framework::EnforceNotMet err) {
+    src_tensor.data<double>();
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
-    std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
+    std::string msg =
+        "Tenosr holds no memory. Call Tensor::mutable_data first.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
@@ -32,54 +45,218 @@ TEST(Tensor, ASSERT) {
   ASSERT_TRUE(caught);
 }
 
-/* mutable_data() is not tested at present
+/* following tests are not available at present
    because Memory::Alloc() and Memory::Free() have not been ready.
-
+*/
 TEST(Tensor, MutableData) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   {
-    Tensor cpu_tensor;
+    Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
     EXPECT_NE(p1, nullptr);
-    // set cpu_tensor a new dim with large size
+    // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
-    // set cpu_tensor a new dim with same size
+    // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
     EXPECT_EQ(p1, p2);
-    // set cpu_tensor a new dim with smaller size
+    // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
 
+#ifndef PADDLE_ONLY_CPU
   {
-    Tensor gpu_tensor;
+    Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
     EXPECT_NE(p1, nullptr);
-    // set gpu_tensor a new dim with large size
+    // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
-    // set gpu_tensor a new dim with same size
+    // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
     EXPECT_EQ(p1, p2);
-    // set gpu_tensor a new dim with smaller size
+    // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
     EXPECT_EQ(p1, p2);
   }
+#endif
+}
+
+TEST(Tensor, ShareDataWith) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    // Try to share data form uninitialized tensor
+    bool caught = false;
+    try {
+      dst_tensor.ShareDataWith<float>(src_tensor);
+    } catch (paddle::platform::EnforceNotMet err) {
+      caught = true;
+      std::string msg =
+          "Tenosr holds no memory. Call Tensor::mutable_data first.";
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
+      }
+    }
+    ASSERT_TRUE(caught);
+
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    dst_tensor.ShareDataWith<int>(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+
+#ifndef PADDLE_ONLY_CPU
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+    dst_tensor.ShareDataWith<int>(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+#endif
+}
+
+TEST(Tensor, Slice) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 3);
+    EXPECT_EQ(slice_dims[0], 2);
+    EXPECT_EQ(slice_dims[1], 3);
+    EXPECT_EQ(slice_dims[2], 4);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
+  }
+
+#ifndef PADDLE_ONLY_CPU
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 2);
+    EXPECT_EQ(slice_dims[0], 4);
+    EXPECT_EQ(slice_dims[1], 9);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
+  }
+#endif
+}
+
+TEST(Tensor, CopyFrom) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+
+    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    auto cpu_place = new paddle::platform::CPUPlace();
+    dst_tensor.CopyFrom<int>(src_tensor, *cpu_place);
+
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+    dst_tensor.CopyFrom<int>(slice_tensor, *cpu_place);
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+  }
+#ifndef PADDLE_ONLY_CPU
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    // CPU Tensor to GPU Tensor
+    auto gpu_place = new paddle::platform::GPUPlace(0);
+    gpu_tensor.CopyFrom<int>(src_tensor, *gpu_place);
+
+    // GPU Tensor to CPU Tensor
+    auto cpu_place = new paddle::platform::CPUPlace();
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+
+    // Compare Tensors
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+
+    // CPU Slice Tensor to GPU Tensor
+    gpu_tensor.CopyFrom<int>(slice_tensor, *gpu_place);
+
+    // GPU Tensor to CPU Tensor
+    dst_tensor.CopyFrom<int>(gpu_tensor, *cpu_place);
+
+    // Compare Slice Tensors
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+  }
+#endif
 }
-*/
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index 72c4a7a2a1d1cf93a784f24e687727ee8481484c..38fc2720a3023039aa113b32a394bda9c5def4c0 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -16,7 +16,7 @@
 #include <typeindex>
 #include <typeinfo>
 
-#include "paddle/platform/assert.h"
+#include "paddle/platform/enforce.h"
 
 namespace paddle {
 namespace framework {
@@ -25,7 +25,7 @@ class Variable {
  public:
   template <typename T>
   const T& Get() const {
-    PADDLE_ASSERT(IsType<T>());
+    PADDLE_ENFORCE(IsType<T>(), "Variable must be type %s", typeid(T).name());
     return *static_cast<const T*>(holder_->Ptr());
   }
 
@@ -65,6 +65,17 @@ class Variable {
 
   std::unique_ptr<Placeholder>
       holder_;  // pointers to a PlaceholderImpl object indeed.
+
+  // name_ is only meaningful with a Scope and accessible by it.
+  //
+  // NOTE: Please don't expose name_ by adding methods like
+  // Variable::Name or Scope::VarName!  A variable could have a human
+  // readable name or an auto-generated scope-unique name.  In the
+  // former case, the caller knows the name and doesn't need to access
+  // the name; in the latter case, the variable should be identified
+  // by its address but not the unreadable name.
+  friend class Scope;
+  const std::string* name_;
 };
 
 }  // namespace framework
diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a89b6bba45843d81264819cad6ba053f28314f6b
--- /dev/null
+++ b/paddle/function/BlockExpandOp.cpp
@@ -0,0 +1,202 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * \brief Converts the image data of four dimensions(NCHW) into
+ *        a sequence data of three dimensions(NST) in the forward calculation,
+ *        which is reversed in the backward calculation.
+ *        Where N is batch size, S is the length of the sequence after each
+ *        image is expanded, T is the size of each time step in the sequence.
+ *
+ * Arguments in forward function:
+ * \param inputs[0]  Image data of NCHW format.
+ * \param outputs[0] Sequence data of NST format.
+ *
+ * Arguments in backward function:
+ * \param inputs[0]  Sequence data of NST format.
+ * \param outputs[0] Image data of NCHW format.
+ */
+class BlockExpandFunction : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // function arguments
+    strides_ = config.get<std::vector<size_t>>("strides");
+    paddings_ = config.get<std::vector<size_t>>("paddings");
+    blocks_ = config.get<std::vector<size_t>>("blocks");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 1;
+  }
+
+  void checkShape(const TensorShape& image, const TensorShape& sequence) const {
+    // image shape should be 4-dimensional.
+    CHECK_EQ(image.ndims(), (size_t)4);
+    // sequence shape should be 3-dimensional.
+    CHECK_EQ(sequence.ndims(), (size_t)3);
+    // The batchSize of the image needs to be equal to
+    // the batchSize of the sequence.
+    CHECK_EQ(image[0], sequence[0]);
+  }
+
+  // Calculate the shape of colData based on the shape of the image
+  // and the shape of the sequence.
+  TensorShape getColShape(const TensorShape& image,
+                          const TensorShape& sequence) const {
+    size_t inputChannels = image[1];
+    size_t inputHeight = image[2];
+    size_t inputWidth = image[3];
+    size_t seqLength = sequence[1];
+    size_t stepSize = sequence[2];
+    size_t outputHeight =
+        1 +
+        (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH();
+    size_t outputWidth =
+        1 +
+        (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW();
+    CHECK_EQ(seqLength, outputHeight * outputWidth);
+    CHECK_EQ(stepSize, inputChannels * blockH() * blockW());
+
+    // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+    return TensorShape({outputHeight,
+                        outputWidth,
+                        inputChannels,
+                        (size_t)blockH(),
+                        (size_t)blockW()});
+  }
+
+protected:
+  std::vector<size_t> strides_;
+  std::vector<size_t> paddings_;
+  std::vector<size_t> blocks_;
+
+  inline int strideH() const { return strides_[0]; }
+
+  inline int strideW() const { return strides_[1]; }
+
+  inline int paddingH() const { return paddings_[0]; }
+
+  inline int paddingW() const { return paddings_[1]; }
+
+  inline int blockH() const { return blocks_[0]; }
+
+  inline int blockW() const { return blocks_[1]; }
+};
+
+template <DeviceType Device>
+class BlockExpandForward : public BlockExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    BlockExpandFunction::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+    checkShape(image, sequence);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    const TensorShape& image = inputs[0].shape();
+    const TensorShape& sequence = outputs[0].shape();
+
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+
+    real* imageData = inputs[0].data<real>();
+    real* seqData = outputs[0].data<real>();
+    Im2ColFunctor<kOCF, Device, real> im2col;
+    for (size_t i = 0; i < batchSize; i++) {
+      // The result of im2col is [outputHeight, outputWidth,
+      // inputChannels, filterHeight, filterWidth], and it is easy to
+      // reshape into [seqLength, stepSize], where seqLength is equal
+      // output_height * output_width, stepSize is equal
+      // input_channels * filter_height * filter_width
+      im2col(imageData,
+             imShape,
+             seqData,
+             colShape,
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
+    }
+  }
+};
+
+template <DeviceType Device>
+class BlockExpandBackward : public BlockExpandFunction {
+public:
+  void init(const FuncConfig& config) override {
+    BlockExpandFunction::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+    checkShape(image, sequence);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+    // Since the implementation of Col2ImFunctor is ADD_TO,
+    // this function only supports ADD_TO mode.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& image = outputs[0].shape();
+    const TensorShape& sequence = inputs[0].shape();
+
+    TensorShape imShape = TensorShape({image[1], image[2], image[3]});
+    TensorShape colShape = getColShape(image, sequence);
+    size_t batchSize = image[0];
+
+    real* imageData = outputs[0].data<real>();
+    real* seqData = inputs[0].data<real>();
+    Col2ImFunctor<kOCF, Device, real> col2im;
+    for (size_t i = 0; i < batchSize; i++) {
+      col2im(imageData,
+             imShape,
+             seqData,
+             colShape,
+             strideH(),
+             strideW(),
+             paddingH(),
+             paddingW());
+      imageData += imShape.getElements();
+      seqData += colShape.getElements();
+    }
+  }
+};
+
+REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward);
+REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5e4897e72ba9fab2dd9e25d90313dc1b4d38e2d4
--- /dev/null
+++ b/paddle/function/BlockExpandOpTest.cpp
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(BlockExpandForward, real) {
+  for (size_t batchSize : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t inputHeight : {5, 33, 100}) {
+        for (size_t inputWidth : {5, 32, 96}) {
+          for (size_t block : {1, 3, 5}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // init Test object
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> blocks = {block, block};
+                CpuGpuFuncCompare test("BlockExpand",
+                                       FuncConfig()
+                                           .set("strides", strides)
+                                           .set("paddings", paddings)
+                                           .set("blocks", blocks));
+
+                size_t outputHeight =
+                    1 +
+                    (inputHeight + 2 * padding - block + stride - 1) / stride;
+                size_t outputWidth =
+                    1 +
+                    (inputWidth + 2 * padding - block + stride - 1) / stride;
+                TensorShape inputShape =
+                    TensorShape({batchSize, channels, inputHeight, inputWidth});
+                TensorShape outputShape =
+                    TensorShape({batchSize,
+                                 outputHeight * outputWidth,
+                                 channels * block * block});
+                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape));
+                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
+                // run Function
+                test.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(BlockExpandBackward, real) {
+  for (size_t batchSize : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t inputHeight : {5, 33, 100}) {
+        for (size_t inputWidth : {5, 32, 96}) {
+          for (size_t block : {1, 3, 5}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // init Test object
+                std::vector<size_t> strides = {stride, stride};
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> blocks = {block, block};
+                CpuGpuFuncCompare test("BlockExpandGrad",
+                                       FuncConfig()
+                                           .set("strides", strides)
+                                           .set("paddings", paddings)
+                                           .set("blocks", blocks));
+
+                size_t outputHeight =
+                    1 +
+                    (inputHeight + 2 * padding - block + stride - 1) / stride;
+                size_t outputWidth =
+                    1 +
+                    (inputWidth + 2 * padding - block + stride - 1) / stride;
+                TensorShape inputShape =
+                    TensorShape({batchSize, channels, inputHeight, inputWidth});
+                TensorShape outputShape =
+                    TensorShape({batchSize,
+                                 outputHeight * outputWidth,
+                                 channels * block * block});
+                test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape));
+                test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape),
+                                ADD_TO);
+                // run Function
+                test.run();
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1518a8a654cfb54376a49760dc5873733c916937..93304f73037690b5cf3ac8189aabc28f51316a77 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,7 +11,6 @@ if(WITH_GPU)
 endif()
 
 if(USE_NNPACK)
-  include(nnpack/nnpack.cmake)
   list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
   if(WITH_TESTING)
     add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
@@ -37,9 +36,12 @@ if(WITH_GPU)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
     add_simple_unittest(RowConvOpTest)
+    add_simple_unittest(BlockExpandOpTest)
+    add_simple_unittest(CropOpTest)
 endif()
 
 add_simple_unittest(ConvOpTest)
+add_simple_unittest(Im2ColTest)
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ConvOp.h b/paddle/function/ConvOp.h
index bb4f48364b9b454af7d37fe4d3c340666e53285c..baf78bc6c88d0d294f4457b81c52b22e425d9fdb 100644
--- a/paddle/function/ConvOp.h
+++ b/paddle/function/ConvOp.h
@@ -109,6 +109,13 @@ protected:
     return filter[filter.ndims() - 1];
   }
 
+  // determine whether im2col needs to be performed
+  inline bool isNeedIm2col(const TensorShape& filter) const {
+    return !(getFilterHeight(filter) == 1 && getFilterWidth(filter) == 1 &&
+             strideH() == 1 && strideW() == 1 && paddingH() == 0 &&
+             paddingW() == 0);
+  }
+
   std::vector<size_t> strides_;
   std::vector<size_t> paddings_;
 
diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp
index dfa2f784610b0dd60340e0ebc6a066437f3715eb..7f32c734791853a8cd0287a80a7955dbd1bd7571 100644
--- a/paddle/function/ConvOpTest.cpp
+++ b/paddle/function/ConvOpTest.cpp
@@ -31,13 +31,22 @@ public:
   ConvolutionTest(const std::string& conv1,
                   const std::string& conv2,
                   TestType type,
+                  bool useGroups = true,
                   std::string algo = "auto") {
     for (size_t batchSize : {1, 32}) {
       for (size_t inputSize : {7, 14, 54}) {
         for (size_t filterSize : {1, 3, 5}) {
           for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64, 128}) {
-              if (inputChannels < outputChannels) break;
+            for (size_t outputChannels : {3, 64}) {
+              if (inputChannels > outputChannels) break;
+              size_t groups;
+              if (!useGroups) {
+                groups = 1;
+              } else {
+                if (outputChannels % inputChannels != 0) continue;
+                groups = inputChannels;
+              }
+
               for (size_t stride : {1, 2}) {
                 for (size_t padding : {0, 1}) {
                   if (padding >= filterSize) break;
@@ -62,13 +71,24 @@ public:
                       FuncConfig()
                           .set("paddings", paddings)
                           .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                           .set("algo", algo));
 
                   TensorShape input{
                       batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterSize, filterSize};
+
+                  TensorShape filter;
+                  if (groups > 1)
+                    filter = TensorShape({groups,
+                                          outputChannels / groups,
+                                          inputChannels / groups,
+                                          filterSize,
+                                          filterSize});
+                  else
+                    filter = TensorShape({outputChannels,
+                                          inputChannels,
+                                          filterSize,
+                                          filterSize});
                   TensorShape output{
                       batchSize, outputChannels, outputSize, outputSize};
 
@@ -85,7 +105,8 @@ public:
                   } else if (type == kBackwardFilterTest) {
                     test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
                     test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
+                                    ADD_TO);
                     test.run();
                   }
                 }
@@ -106,6 +127,7 @@ public:
   ConvolutionTest2(const std::string& conv1,
                    const std::string& conv2,
                    TestType type,
+                   bool useGroups = true,
                    std::string algo = "auto") {
     for (size_t batchSize : {16}) {
       for (size_t inputHeight : {7, 31}) {
@@ -113,7 +135,15 @@ public:
           for (size_t filterHeight : {1, 5}) {
             for (size_t filterWidth : {3, 7}) {
               for (size_t inputChannels : {7}) {
-                for (size_t outputChannels : {32}) {
+                for (size_t outputChannels : {7}) {
+                  size_t groups;
+                  if (!useGroups) {
+                    groups = 1;
+                  } else {
+                    if (outputChannels % inputChannels != 0) continue;
+                    groups = inputChannels;
+                  }
+
                   size_t stride = 1;
                   size_t padding = 0;
                   size_t outputHeight =
@@ -141,13 +171,24 @@ public:
                       FuncConfig()
                           .set("paddings", paddings)
                           .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                           .set("algo", algo));
 
                   TensorShape input{
                       batchSize, inputChannels, inputHeight, inputWidth};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterHeight, filterWidth};
+
+                  TensorShape filter;
+                  if (groups > 1)
+                    filter = TensorShape({groups,
+                                          outputChannels / groups,
+                                          inputChannels / groups,
+                                          filterHeight,
+                                          filterWidth});
+                  else
+                    filter = TensorShape({outputChannels,
+                                          inputChannels,
+                                          filterHeight,
+                                          filterWidth});
                   TensorShape output{
                       batchSize, outputChannels, outputHeight, outputWidth};
 
@@ -164,7 +205,8 @@ public:
                   } else if (type == kBackwardFilterTest) {
                     test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
                     test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
+                                    ADD_TO);
                     test.run();
                   }
                 }
@@ -177,34 +219,88 @@ public:
   }
 };
 
+// ======Start Convolution TEST======
+
 TEST(Forward, GEMM) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
 }
 
 #ifndef PADDLE_ONLY_CPU
 TEST(Forward, GEMM2) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
 }
 
 TEST(BackwardInput, GEMM) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
+      "GemmConvGradInput-GPU",
+      kBackwardInputTest,
+      false);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
+      "GemmConvGradInput-GPU",
+      kBackwardInputTest,
+      false);
 }
 
 TEST(BackwardFilter, GEMM) {
   ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
+      "GemmConvGradFilter-GPU",
+      kBackwardFilterTest,
+      false);
   ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
+      "GemmConvGradFilter-GPU",
+      kBackwardFilterTest,
+      false);
 }
 #endif
+// ======End Convolution TEST======
+
+// ======Start DepthwiseConvolution TEST======
+
+// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
+// version of depthwiseConv is implemented.
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(DepthwiseConvForward, GEMM2) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
+}
+
+TEST(DepthwiseConvBackwardInput, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradInput-CPU",
+      "DepthwiseConvGradInput-GPU",
+      kBackwardInputTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradInput-CPU",
+      "DepthwiseConvGradInput-GPU",
+      kBackwardInputTest);
+}
+
+TEST(DepthwiseConvBackwardFilter, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradFilter-CPU",
+      "DepthwiseConvGradFilter-GPU",
+      kBackwardFilterTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradFilter-CPU",
+      "DepthwiseConvGradFilter-GPU",
+      kBackwardFilterTest);
+}
+
+#endif
+// ======End DepthwiseConvolution TEST======
 
 }  // namespace paddle
diff --git a/paddle/function/CropOp.cpp b/paddle/function/CropOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f12ee43e3d72f9ac776eaff93914228850694dd2
--- /dev/null
+++ b/paddle/function/CropOp.cpp
@@ -0,0 +1,177 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropOp.h"
+#include "paddle/function/TensorShape.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Crop<DEVICE_TYPE_CPU>(real* outputs,
+                           const real* inputs,
+                           const TensorShape inShape,
+                           const TensorShape outShape,
+                           const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = inShape[0];
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < outC; c++) {
+      for (int h = 0; h < outH; h++) {
+        int outoff = ((n * outC + c) * outH + h) * outW;
+        int inoff = ((n * inC + c + cCrop) * inH + h + hCrop) * inW + wCrop;
+        memcpy(outputs + outoff, inputs + inoff, outW * sizeof(real));
+      }
+    }
+  }
+}
+
+template <>
+void CropGrad<DEVICE_TYPE_CPU>(const real* inGrad,
+                               real* outGrad,
+                               const TensorShape inShape,
+                               const TensorShape outShape,
+                               const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+      conf.get<std::vector<uint32_t>>("crop_corner");
+  int cCrop = crop_corner[1];
+  int hCrop = crop_corner[2];
+  int wCrop = crop_corner[3];
+
+  int num = outShape[0];
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  for (int n = 0; n < num; n++) {
+    for (int c = 0; c < inC; c++) {
+      for (int h = 0; h < inH; h++) {
+        int outoff = ((n * outC + c + cCrop) * outH + h + hCrop) * outW + wCrop;
+        int inoff = ((n * inC + c) * inH + h) * inW;
+        CpuVector inG = CpuVector(inW, const_cast<real*>(inGrad + inoff));
+        CpuVector outG = CpuVector(inW, outGrad + outoff);
+        outG += inG;
+      }
+    }
+  }
+}
+
+/**
+ * \brief Crop input according to the specify corner and shape.
+ *        The input and output is a 4D tensor. In CropFunc, we only
+ *        crop the 2nd to 4th dimension.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the cropping corner and shape.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after cropping.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the input shape is (2,2,2,3)
+ *
+ * pad_: if corner = (0,1,1) and crop_shape = (2,1,2)
+ * Output(2,2,1,2) = [
+ *                    [ [[4,5]],
+ *                      [[6,7]] ],
+ *                    [ [[8,7]],
+ *                      [[3,5]] ]
+ *                  ] # the input shape is (2,2,2,3)
+ */
+template <DeviceType Device>
+class CropFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    TensorShape inShape = inputs[0].shape();
+    TensorShape outShape = outputs[0].shape();
+
+    Crop<Device>(outputs[0].data<real>(),
+                 inputs[0].data<real>(),
+                 inShape,
+                 outShape,
+                 conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+/**
+ * \brief The backward propagation of cropping Function.
+ *
+ * Argument in this Function:
+ * \param crop_    The same meaning as it in CropFunc.
+ * \param inputs  The gradient with respect to the output value of CropFunc.
+ * \param outputs The gradient with respect to the input value of CropFunc.
+ */
+
+template <DeviceType Device>
+class CropGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { conf_ = config; }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    TensorShape outShape = outputs[0].shape();
+    TensorShape inShape = inputs[0].shape();
+
+    CropGrad<Device>(inputs[0].data<real>(),
+                     outputs[0].data<real>(),
+                     inShape,
+                     outShape,
+                     conf_);
+  }
+
+private:
+  FuncConfig conf_;
+};
+
+REGISTER_TYPED_FUNC(Crop, CPU, CropFunc);
+REGISTER_TYPED_FUNC(CropGrad, CPU, CropGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(Crop, GPU, CropFunc);
+REGISTER_TYPED_FUNC(CropGrad, GPU, CropGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/CropOp.h b/paddle/function/CropOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..87986fbdc7e33aeb24d947e82a5d67ba23f532de
--- /dev/null
+++ b/paddle/function/CropOp.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief  This funtion crops inputs according to the specify start point and
+ *shape.
+ *
+ * \param[out] outputs	save results.
+ * \param[in]  inputs	input data.
+ * \param[in]  inShape  the shape of input tensor.
+ * \param[in]  conf     the cropping config
+ */
+template <DeviceType Device>
+void Crop(real* outputs,
+          const real* inputs,
+          const TensorShape inShape,
+          const TensorShape outShape,
+          const FuncConfig& conf);
+
+/**
+ * \brief   Cropping operation backward.
+ *
+ * \param[out] inGrad	gradients of previous layer
+ * \param[in]  outGrad  output gradient
+ * \param[in]  inShape  the shape of input tensor.
+ * \param[in]  conf     the cropping config
+ */
+template <DeviceType Device>
+void CropGrad(const real* inGrad,
+              real* outGrad,
+              const TensorShape inShape,
+              const TensorShape outShape,
+              const FuncConfig& conf);
+}  // namespace paddle
diff --git a/paddle/function/CropOpGpu.cu b/paddle/function/CropOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..786eb268d45aadee0c1f6fcbbafc23173cf0bc77
--- /dev/null
+++ b/paddle/function/CropOpGpu.cu
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "CropOp.h"
+
+namespace paddle {
+
+__global__ void KeCrop(real* outputs, const real* inputs,
+                      int inC, int inH, int inW,
+                      int cropC, int cropH, int cropW,
+                      int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % outW;
+    const int h = (idx / outW) % outH;
+    const int c = (idx / outW / outH) % outC;
+    const int n = idx / outW / outH / outC;
+
+    const int off = ((n * inC + c + cropC) * inH + h + cropH) * inW + cropW + w;
+    outputs[idx] = inputs[off];
+  }
+}
+
+template <>
+void Crop<DEVICE_TYPE_GPU>(real* outputs,
+                          const real* inputs,
+                          const TensorShape inShape,
+                          const TensorShape outShape,
+                          const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+        conf.get<std::vector<uint32_t>>("crop_corner");
+  int cropC = crop_corner[1];
+  int cropH = crop_corner[2];
+  int cropW = crop_corner[3];
+
+  int num = inShape[0];
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  size_t nth = num * outC * outH * outW;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeCrop<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+    (outputs, inputs, inC, inH, inW, cropC, cropH, cropW,
+     outC, outH, outW, nth);
+  CHECK_SYNC("Crop");
+}
+
+__global__ void KeCropDiff(const real* inGrad, real* outGrad,
+                          int inC, int inH, int inW,
+                          int cropC, int cropH, int cropW,
+                          int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx < nthreads) {
+    const int w = idx % inW;
+    const int h = (idx / inW) % inH;
+    const int c = (idx / inW / inH) % inC;
+    const int n = idx / inW / inH / inC;
+
+    const int off =
+        ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
+
+    outGrad[off] += inGrad[idx];
+  }
+}
+
+template <>
+void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
+                              real* outGrad,
+                              const TensorShape inShape,
+                              const TensorShape outShape,
+                              const FuncConfig& conf) {
+  std::vector<uint32_t> crop_corner =
+        conf.get<std::vector<uint32_t>>("crop_corner");
+  int cropC = crop_corner[1];
+  int cropH = crop_corner[2];
+  int cropW = crop_corner[3];
+
+  int num = outShape[0];
+  int outC = outShape[1];
+  int outH = outShape[2];
+  int outW = outShape[3];
+
+  int inC = inShape[1];
+  int inH = inShape[2];
+  int inW = inShape[3];
+
+  size_t nth = num * inC * inH * inW;
+  int blockSize = 1024;
+  int gridSize = (nth + blockSize - 1) / blockSize;
+
+  KeCropDiff <<<gridSize, blockSize, 0, STREAM_DEFAULT>>>
+    (inGrad, outGrad, inC, inH, inW, cropC, cropH, cropW,
+     outC, outH, outW, nth);
+  CHECK_SYNC("CropGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CropOpTest.cpp b/paddle/function/CropOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6f11abfdf6f752857e0a75c62fb2b5c089c206d9
--- /dev/null
+++ b/paddle/function/CropOpTest.cpp
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Crop, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {5, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          for (bool test_grad : {false, true}) {
+            CpuGpuFuncCompare compare(
+                test_grad ? "CropGrad" : "Crop",
+                FuncConfig()
+                    .set<std::vector<uint32_t>>("crop_corner", {0, 1, 1, 1})
+                    .set<std::vector<uint32_t>>("crop_shape", {0, 2, 3, 3}));
+            TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+            TensorShape outDims{numSamples, 2, 3, 3};
+            compare.addInputs(
+                BufferArg(VALUE_TYPE_FLOAT, test_grad ? outDims : inDims));
+            compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT,
+                                         test_grad ? inDims : outDims,
+                                         test_grad ? ADD_TO : ASSIGN_TO),
+                               test_grad ? ADD_TO : ASSIGN_TO);
+            compare.run();
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOp.cpp b/paddle/function/DepthwiseConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..490e8d546cbd460217abe95f6291b13fa207faa9
--- /dev/null
+++ b/paddle/function/DepthwiseConvOp.cpp
@@ -0,0 +1,306 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "ConvOp.h"
+#include "GemmFunctor.h"
+
+namespace paddle {
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData) {
+    // TODO(zhaolong) : cpu implementation of depthwise convolution
+  }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+/*
+ * \brief Forward calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+
+    DepthwiseConvFunctor<Device, real> depthwiseConv;
+    depthwiseConv(inputData,
+                  filterData,
+                  batchSize,
+                  outputChannels,
+                  outputHeight,
+                  outputWidth,
+                  inputChannels,
+                  inputHeight,
+                  inputWidth,
+                  filterMultiplier,
+                  filterHeight,
+                  filterWidth,
+                  strideH(),
+                  strideW(),
+                  paddingH(),
+                  paddingW(),
+                  outputData);
+  }
+};
+
+/*
+ * \brief Backward input calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradInputFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+
+    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
+    depthwiseConvGradInput(outputGrad,
+                           filterData,
+                           batchSize,
+                           outputChannels,
+                           outputHeight,
+                           outputWidth,
+                           inputChannels,
+                           inputHeight,
+                           inputWidth,
+                           filterMultiplier,
+                           filterHeight,
+                           filterWidth,
+                           strideH(),
+                           strideW(),
+                           paddingH(),
+                           paddingW(),
+                           inputGrad);
+  }
+};
+
+/*
+ * \brief Backward filter calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+
+    int size = outputChannels * filterHeight * filterWidth * outputHeight *
+               outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
+
+    depthwiseConvGradFilter(outputGrad,
+                            inputData,
+                            batchSize,
+                            outputChannels,
+                            outputHeight,
+                            outputWidth,
+                            inputChannels,
+                            inputHeight,
+                            inputWidth,
+                            filterMultiplier,
+                            filterHeight,
+                            filterWidth,
+                            strideH(),
+                            strideW(),
+                            paddingH(),
+                            paddingW(),
+                            colData,
+                            filterGrad);
+  }
+};
+
+REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    CPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    CPU,
+                    DepthwiseConvGradFilterFunction);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    GPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    GPU,
+                    DepthwiseConvGradFilterFunction);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOp.h b/paddle/function/DepthwiseConvOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bf70e52f34626405b49571e023ac60926713eef
--- /dev/null
+++ b/paddle/function/DepthwiseConvOp.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorType.h"
+
+namespace paddle {
+
+/**
+ *\brief   Depthwise convolution forward. The outputData
+ *         of depthwise convolution is same with ExpandConvLayer
+ *         when groups equals inputChannels in ExpandConvLayer.
+ *
+ * \param[in]   inputData         input data.
+ * \param[in]   filterData        the Paramters of the depthwise conv layer..
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of inputData.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData..
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  outputData        outputData.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvFunctor {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData);
+};
+
+/**
+ *\brief  Functor tot compute the depthwise convolution backprop w.r.t input.
+ *
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   filterData        the Paramters of the depthwise conv layer..
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  inputGrad         the grad data of input.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradInputFunctor {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad);
+};
+
+/**
+ *\brief  Functor tot compute the depthwise convolution backprop w.r.t filter.
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   inputData         inputData.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[in]   colData           Auxiliary data when calculating filterGrad.
+ * \param[in]   multiplierData    Auxiliary data when calculating filterGrad.
+ * \param[out]  filterGrad        the grad data of filter.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradFilterFunctor {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad);
+};
+
+}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOpGpu.cu b/paddle/function/DepthwiseConvOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ede0d27aa82e7d71ff5bc33df110fec260e06463
--- /dev/null
+++ b/paddle/function/DepthwiseConvOpGpu.cu
@@ -0,0 +1,342 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "GemmFunctor.h"
+#include "paddle/math/BaseMatrix.h"
+
+namespace paddle {
+
+// CUDA kernel to compute the depthwise convolution forward pass
+template <class T>
+__global__
+void ConvolutionDepthwiseForward(const int nthreads,
+    const T* const inputData, const T* const filterData,
+    const int batchSize, const int outputChannels, const int outputHeight,
+    const int outputWidth, const int inputChannels, const int inputHeight,
+    const int inputWidth, const int filterMultiplier, const int filterHeight,
+    const int filterWidth, const int strideH, const int strideW,
+    const int paddingH, const int paddingW, T* const outputData) {
+
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if (index < nthreads) {
+    const int batch = index / outputChannels / outputHeight / outputWidth;
+    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+
+    const int c_in = c_out / filterMultiplier;
+    const T* weight = filterData + c_out * filterHeight * filterWidth;
+    T value = 0;
+    const int h_in_start = -paddingH + h_out * strideH;
+    const int w_in_start = -paddingW + w_out * strideW;
+    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
+    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
+    if ((h_in_start >= 0) && (h_in_end < inputHeight)
+       && (w_in_start >= 0) && (w_in_end < inputWidth)) {
+        for (int kh = 0; kh < filterHeight; ++kh) {
+            for (int kw = 0; kw < filterWidth; ++kw) {
+                const int h_in = -paddingH + h_out * strideH + kh;
+                const int w_in = -paddingW + w_out * strideW + kw;
+                const int offset = ((batch * inputChannels + c_in)
+                    * inputHeight + h_in) * inputWidth + w_in;
+                value += (*weight) * inputData[offset];
+                ++weight;
+            }
+        }
+    } else {
+        for (int kh = 0; kh < filterHeight; ++kh) {
+            for (int kw = 0; kw < filterWidth; ++kw) {
+                const int h_in = -paddingH + h_out * strideH + kh;
+                const int w_in = -paddingW + w_out * strideW + kw;
+                if ((h_in >= 0) && (h_in < inputHeight)
+                   && (w_in >= 0) && (w_in < inputWidth)) {
+                    const int offset = ((batch * inputChannels + c_in)
+                        * inputHeight + h_in) * inputWidth + w_in;
+                    value += (*weight) * inputData[offset];
+                }
+                ++weight;
+            }
+       }
+    }
+    outputData[index] = value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
+template <class T>
+__global__
+void ConvolutionDepthwiseInputBackward(const int nthreads,
+    const T* const top_diff, const T* const weight_data,
+    const int num, const int outputChannels, const int outputHeight,
+    const int outputWidth, const int inputChannels, const int inputHeight,
+    const int inputWidth, const int filterMultiplier, const int filterHeight,
+    const int filterWidth, const int strideH, const int strideW,
+    const int paddingH, const int paddingW, T* const bottom_diff) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / inputChannels / inputHeight / inputWidth;
+    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
+    const int h_in = (index / inputWidth) % inputHeight;
+    const int w_in = index % inputWidth;
+
+    const int c_out_start = c_in * filterMultiplier;
+
+    int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+    int h_out_end = (h_in + paddingH)/strideH;
+    h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+    int w_out_end = (w_in + paddingW)/strideW;
+    w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end;
+
+    T value = 0;
+
+    for (int c_out = c_out_start;
+         c_out < c_out_start + filterMultiplier; c_out ++) {
+        for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+            const int filter_h = h_in + paddingH - h_out * strideH;
+            for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+                const int filter_w = w_in + paddingW - w_out * strideW;
+                const int filter_offset = c_out * filterHeight * filterWidth
+                    + filter_h * filterWidth + filter_w;
+                const int top_diff_offset = ((batch * outputChannels + c_out) *
+                    outputHeight + h_out)* outputWidth + w_out;
+                value += top_diff[top_diff_offset] * weight_data[filter_offset];
+            }
+        }
+    }
+    bottom_diff[index] += value;
+   }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
+template <class T>
+__global__
+void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
+    const T* const top_diff, const T* const inputData,
+    const int num, const int outputChannels, const int outputHeight,
+    const int outputWidth, const int inputChannels, const int inputHeight,
+    const int inputWidth, const int filterMultiplier, const int filterHeight,
+    const int filterWidth, const int strideH, const int strideW,
+    const int paddingH, const int paddingW, T* const buffer_data) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+    const int kh = (index / filterWidth / outputHeight / outputWidth)
+          % filterHeight;
+    const int kw = (index / outputHeight / outputWidth) % filterWidth;
+    const int h_in = -paddingH + h_out * strideH + kh;
+    const int w_in = -paddingW + w_out * strideW + kw;
+    if ((h_in >= 0) && (h_in < inputHeight)
+          && (w_in >= 0) && (w_in < inputWidth)) {
+      const int c_out = index /
+            (filterHeight * filterWidth * outputHeight * outputWidth);
+      const int c_in = c_out / filterMultiplier;
+      const int batch = num_i;
+      const int top_offset = ((batch * outputChannels + c_out) *
+            outputHeight + h_out) * outputWidth + w_out;
+      const int bottom_offset = ((batch * inputChannels + c_in)
+            * inputHeight + h_in) * inputWidth + w_in;
+      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
+    } else {
+      buffer_data[index] = 0;
+    }
+  }
+}
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
+public:
+  void operator()(const T* inputData,
+            const T* filterData,
+            int batchSize,
+            int outputChannels,
+            int outputHeight,
+            int outputWidth,
+            int inputChannels,
+            int inputHeight,
+            int inputWidth,
+            int filterMultiplier,
+            int filterHeight,
+            int filterWidth,
+            int strideH,
+            int strideW,
+            int paddingH,
+            int paddingW,
+            T* outputData){
+    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
+
+    size_t blocks = (outputSize + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    ConvolutionDepthwiseForward<T>
+        <<< grid, threads, 0, STREAM_DEFAULT >>>(
+            outputSize,
+            inputData,
+            filterData,
+            batchSize,
+            outputChannels,
+            outputHeight,
+            outputWidth,
+            inputChannels,
+            inputHeight,
+            inputWidth,
+            filterMultiplier,
+            filterHeight,
+            filterWidth,
+            strideH,
+            strideW,
+            paddingH,
+            paddingW,
+            outputData);
+    }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
+public:
+  void operator()(const T* outputGrad,
+            const T* filterData,
+            int batchSize,
+            int outputChannels,
+            int outputHeight,
+            int outputWidth,
+            int inputChannels,
+            int inputHeight,
+            int inputWidth,
+            int filterMultiplier,
+            int filterHeight,
+            int filterWidth,
+            int strideH,
+            int strideW,
+            int paddingH,
+            int paddingW,
+            T* inputGrad){
+    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
+
+    size_t blocks = (inputSize + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+
+    ConvolutionDepthwiseInputBackward<T>
+          // NOLINT_NEXT_LINE(whitespace/operators)
+        <<< grid, threads, 0, STREAM_DEFAULT >>>(
+            inputSize,
+            outputGrad,
+            filterData,
+            batchSize,
+            outputChannels,
+            outputHeight,
+            outputWidth,
+            inputChannels,
+            inputHeight,
+            inputWidth,
+            filterMultiplier,
+            filterHeight,
+            filterWidth,
+            strideH,
+            strideW,
+            paddingH,
+            paddingW,
+            inputGrad);
+    }
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                const T* inputData,
+                int batchSize,
+                int outputChannels,
+                int outputHeight,
+                int outputWidth,
+                int inputChannels,
+                int inputHeight,
+                int inputWidth,
+                int filterMultiplier,
+                int filterHeight,
+                int filterWidth,
+                int strideH,
+                int strideW,
+                int paddingH,
+                int paddingW,
+                T* colData,
+                T* filterGrad){
+        int colDataSize = outputChannels * filterHeight * filterWidth
+            * outputHeight * outputWidth;
+
+        size_t blocks = (colDataSize + 1024 -1) / 1024;
+        size_t blockX = 512;
+        size_t blockY = (blocks+512-1)/512;
+        dim3 threads(1024, 1);
+        dim3 grid(blockX, blockY);
+        BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
+            1, filterGrad, false, true);
+
+        for (int i = 0; i < batchSize; i++) {
+            ConvolutionDepthwiseFilterBackward<T>
+                <<< grid, threads, 0, STREAM_DEFAULT >>>(
+                    i,
+                    colDataSize,
+                    outputGrad,
+                    inputData,
+                    batchSize,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    inputChannels,
+                    inputHeight,
+                    inputWidth,
+                    filterMultiplier,
+                    filterHeight,
+                    filterWidth,
+                    strideH,
+                    strideW,
+                    paddingH,
+                    paddingW,
+                    colData);
+            int K = outputHeight * outputWidth;
+            int M = colDataSize / K;
+
+            BaseMatrix colMatrix(M, K, colData, false, true);
+            filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
+        }
+    }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
+#else
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp
index a40e5d9d2e76605525f0956445fc43c693933cf8..0ada4d70a0c7d13f9b5fb1a42eac07fc4c775a87 100644
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -12,101 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "GemmConvOp.h"
+#include "ConvOp.h"
 #include "GemmFunctor.h"
+#include "Im2Col.h"
 #include "paddle/math/MemoryHandle.h"
 
 namespace paddle {
 
-/*
- * imData = [input_channels, input_height, input_width]
- * colData = [input_channels, filter_height, filter_width,
- *            output_height, output_width]
- */
-template <class T>
-class Im2ColFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData) {
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
-          if ((imRowIdx - paddingHeight) < 0 ||
-              (imRowIdx - paddingHeight) >= inputHeight ||
-              (imColIdx - paddingWidth) < 0 ||
-              (imColIdx - paddingWidth) >= inputWidth) {
-            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
-          } else {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            colData[(c * outputHeight + h) * outputWidth + w] =
-                imData[imRowIdx * inputWidth + imColIdx];
-          }
-        }
-      }
-    }
-  }
-};
-
-template <class T>
-class Col2ImFunctor<DEVICE_TYPE_CPU, T> {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData) {
-    int channelsCol = inputChannels * filterHeight * filterWidth;
-
-    for (int c = 0; c < channelsCol; ++c) {
-      int wOffset = c % filterWidth;
-      int hOffset = (c / filterWidth) % filterHeight;
-      int c_im = c / filterWidth / filterHeight;
-      for (int h = 0; h < outputHeight; ++h) {
-        for (int w = 0; w < outputWidth; ++w) {
-          int imRowIdx = h * strideHeight + hOffset;
-          int imColIdx = w * strideWidth + wOffset;
-          if ((imRowIdx - paddingHeight) >= 0 &&
-              (imRowIdx - paddingHeight) < inputHeight &&
-              (imColIdx - paddingWidth) >= 0 &&
-              (imColIdx - paddingWidth) < inputWidth) {
-            imRowIdx += c_im * inputHeight - paddingHeight;
-            imColIdx -= paddingWidth;
-            imData[imRowIdx * inputWidth + imColIdx] +=
-                colData[(c * outputHeight + h) * outputWidth + w];
-          }
-        }
-      }
-    }
-  }
-};
-
 /*
  * \brief Forward calculation of convolution.
  */
@@ -117,8 +29,7 @@ public:
     ConvFunctionBase::init(config);
   }
 
-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
     const TensorShape& input = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
     const TensorShape& output = outputs[0].shape();
@@ -155,35 +66,45 @@ public:
     real* inputData = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* outputData = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
 
-    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
-                  outputHeight * outputWidth;
-    resizeBuffer<Device>(size);
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
-
-    Im2ColFunctor<Device, real> im2col;
+    Im2ColFunctor<kCFO, Device, real> im2col;
     GemmFunctor<Device, real> gemm;
-    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
 
     for (size_t i = 0; i < batchSize; i++) {
       for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
-               inputChannels / groups_,
-               inputHeight,
-               inputWidth,
-               filterHeight,
-               filterWidth,
-               strideH(),
-               strideW(),
-               paddingH(),
-               paddingW(),
-               outputHeight,
-               outputWidth,
-               colData);
-
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
         int M = outputChannels / groups_;
         int N = outputHeight * outputWidth;
         int K = inputChannels / groups_ * filterHeight * filterWidth;
@@ -217,8 +138,7 @@ public:
     ConvFunctionBase::init(config);
   }
 
-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
     const TensorShape& output = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
     const TensorShape& input = outputs[0].shape();
@@ -249,15 +169,28 @@ public:
     real* outputGrad = inputs[0].data<real>();
     real* filterData = inputs[1].data<real>();
     real* inputGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
 
-    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
-                  outputHeight * outputWidth;
-    resizeBuffer<Device>(size);
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
-
-    Col2ImFunctor<Device, real> col2im;
+    Col2ImFunctor<kCFO, Device, real> col2im;
     GemmFunctor<Device, real> gemm;
-    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+
+    size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
@@ -267,6 +200,11 @@ public:
         int K = outputChannels / groups_;
         int N = outputHeight * outputWidth;
         int M = inputChannels / groups_ * filterHeight * filterWidth;
+        real scale = 0.0f;
+        if (!needIm2col) {
+          colData = inputGrad + g * inputOffset;
+          scale = 1.0f;
+        }
         gemm(CblasTrans,
              CblasNoTrans,
              M,
@@ -277,23 +215,19 @@ public:
              M,
              outputGrad + g * outputOffset,
              N,
-             0.0f,
+             scale,
              colData,
              N);
-
-        col2im(colData,
-               inputChannels / groups_,
-               inputHeight,
-               inputWidth,
-               filterHeight,
-               filterWidth,
-               strideH(),
-               strideW(),
-               paddingH(),
-               paddingW(),
-               outputHeight,
-               outputWidth,
-               inputGrad + g * inputOffset);
+        if (needIm2col) {
+          col2im(inputGrad + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+        }
       }
       inputGrad += inputChannels * inputHeight * inputWidth;
       outputGrad += outputChannels * outputHeight * outputWidth;
@@ -311,8 +245,7 @@ public:
     ConvFunctionBase::init(config);
   }
 
-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
     const TensorShape& output = inputs[0].shape();
     const TensorShape& input = inputs[1].shape();
     const TensorShape& filter = outputs[0].shape();
@@ -347,34 +280,44 @@ public:
     real* outputGrad = inputs[0].data<real>();
     real* inputData = inputs[1].data<real>();
     real* filterGrad = outputs[0].data<real>();
+    bool needIm2col = isNeedIm2col(filter);
+
+    TensorShape imShape =
+        TensorShape({inputChannels / groups_, inputHeight, inputWidth});
+
+    TensorShape colShape;
+    real* colData = NULL;
+
+    if (needIm2col) {
+      colShape = TensorShape({inputChannels / groups_,
+                              filterHeight,
+                              filterWidth,
+                              outputHeight,
+                              outputWidth});
+      resizeBuffer<Device>(colShape.getElements());
+      colData = reinterpret_cast<real*>(memory_->getBuf());
+    }
 
-    size_t size = inputChannels / groups_ * filterHeight * filterWidth *
-                  outputHeight * outputWidth;
-    resizeBuffer<Device>(size);
-    real* colData = reinterpret_cast<real*>(memory_->getBuf());
-
-    Im2ColFunctor<Device, real> im2col;
+    Im2ColFunctor<kCFO, Device, real> im2col;
     GemmFunctor<Device, real> gemm;
-    size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
+    size_t inputOffset = imShape.getElements();
     size_t outputOffset =
         (outputChannels / groups_) * outputHeight * outputWidth;
     size_t filterOffset = filter.getElements() / groups_;
     for (size_t i = 0; i < batchSize; i++) {
       for (size_t g = 0; g < groups_; g++) {
-        im2col(inputData + g * inputOffset,
-               inputChannels / groups_,
-               inputHeight,
-               inputWidth,
-               filterHeight,
-               filterWidth,
-               strideH(),
-               strideW(),
-               paddingH(),
-               paddingW(),
-               outputHeight,
-               outputWidth,
-               colData);
-
+        if (needIm2col) {
+          im2col(inputData + g * inputOffset,
+                 imShape,
+                 colData,
+                 colShape,
+                 strideH(),
+                 strideW(),
+                 paddingH(),
+                 paddingW());
+        } else {
+          colData = inputData + g * inputOffset;
+        }
         int M = outputChannels / groups_;
         int K = outputHeight * outputWidth;
         int N = inputChannels / groups_ * filterHeight * filterWidth;
diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h
deleted file mode 100644
index 9f11cce597a07ce2a54f518be30b657c26ab7516..0000000000000000000000000000000000000000
--- a/paddle/function/GemmConvOp.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "ConvOp.h"
-
-namespace paddle {
-
-/*
- * imData = [input_channels, input_height, input_width]
- * colData = [input_channels, filter_height, filter_width,
- *            output_height, output_width]
- */
-template <DeviceType Device, class T>
-class Im2ColFunctor {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData);
-};
-
-template <DeviceType Device, class T>
-class Col2ImFunctor {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData);
-};
-
-}  // namespace paddle
diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu
deleted file mode 100644
index 2a1795ff0fb5643ea436c94fe893fe866056fccb..0000000000000000000000000000000000000000
--- a/paddle/function/GemmConvOpGpu.cu
+++ /dev/null
@@ -1,186 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "ConvOp.h"
-#include "GemmConvOp.h"
-
-namespace paddle {
-
-template<class T>
-__global__
-void im2col(const T* data_im, int numOuts, int height, int width,
-            int blockH, int blockW,
-            int strideH, int strideW,
-            int paddingH, int paddingW,
-            int height_col, int width_col,
-            T* data_col) {
-  int index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < numOuts) {
-    int w_out = index % width_col;
-    index /= width_col;
-    int h_out = index % height_col;
-    int channel_in = index / height_col;
-    int channel_out = channel_in * blockH * blockW;
-    int h_in = h_out * strideH;
-    int w_in = w_out * strideW;
-
-    data_col += (channel_out * height_col + h_out) * width_col + w_out;
-    for (int i = 0; i < blockH; ++i) {
-      for (int j = 0; j < blockW; ++j) {
-        int rIdx = int(h_in+i);
-        int cIdx = int(w_in+j);
-        if ((rIdx-(int)paddingH) >= (int)height ||
-            (rIdx-(int)paddingH) < 0 ||
-            (cIdx-(int)paddingW) >= (int)width ||
-            (cIdx-(int)paddingW) < 0) {
-          *data_col = 0;
-        } else {
-          rIdx = rIdx + channel_in*height - paddingH;
-          cIdx = cIdx - paddingW;
-          *data_col = data_im[rIdx* width + cIdx];
-        }
-        data_col += height_col * width_col;
-      }
-    }
-  }
-}
-
-template <class T>
-class Im2ColFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* imData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* colData) {
-    int numKernels = inputChannels * outputHeight * outputWidth;
-    int blocks = (numKernels + 1024 -1) / 1024;
-    int blockX = 512;
-    int blockY = (blocks + 512 - 1) / 512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-    im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
-        (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
-         strideHeight, strideWidth, paddingHeight, paddingWidth,
-         outputHeight, outputWidth, colData);
-    CHECK_SYNC("Im2ColFunctor GPU failed");
-  }
-};
-
-template<class T>
-__global__
-void col2im(size_t n, const T* data_col, size_t height,
-            size_t width, size_t channels,
-            size_t blockH, size_t blockW,
-            size_t strideH, size_t strideW,
-            size_t paddingH, size_t paddingW,
-            size_t height_col, size_t width_col,
-            T* data_im) {
-  size_t index =
-    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
-  if (index < n) {
-    T val = 0;
-    int w = int(index % width);
-    int h = int((index / width) % height);
-    int c = int(index / (width * height));
-    if ((w - (int)paddingW) >= 0 &&
-        (w - (int)paddingW) < (width-2 * paddingW) &&
-        (h - (int)paddingH) >= 0 &&
-        (h - paddingH) < (height - 2 * paddingH)) {
-      // compute the start and end of the output
-      int w_col_start =
-        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
-      int w_col_end =
-        min((int)(w / (int)strideW + 1), (int)(width_col));
-      int h_col_start =
-        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
-      int h_col_end = min(int(h / strideH + 1), int(height_col));
-      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
-        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
-          // the col location: [c * width * height + h_out, w_out]
-          int c_col = int(c * blockH* blockW) + \
-            (h - h_col * (int)strideH) * (int)blockW +
-            (w - w_col * (int)strideW);
-          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
-        }
-      }
-      h -= paddingH;
-      w -= paddingW;
-      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
-              h*(width-2*paddingW) + w] += val;
-    }
-  }
-}
-
-template <class T>
-class Col2ImFunctor<DEVICE_TYPE_GPU, T> {
-public:
-  void operator()(const T* colData,
-                  int inputChannels,
-                  int inputHeight,
-                  int inputWidth,
-                  int filterHeight,
-                  int filterWidth,
-                  int strideHeight,
-                  int strideWidth,
-                  int paddingHeight,
-                  int paddingWidth,
-                  int outputHeight,
-                  int outputWidth,
-                  T* imData) {
-    size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
-        * (inputWidth + 2*paddingWidth);
-
-    size_t blocks = (numKernels + 1024 -1) / 1024;
-    size_t blockX = 512;
-    size_t blockY = (blocks+512-1)/512;
-    dim3 threads(1024, 1);
-    dim3 grid(blockX, blockY);
-
-    // To avoid involving atomic operations, we will launch one kernel per
-    // bottom dimension, and then in the kernel add up the top dimensions.
-    col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
-             (numKernels,
-              colData,
-              inputHeight + 2*paddingHeight,
-              inputWidth + 2*paddingWidth,
-              inputChannels,
-              filterHeight,
-              filterWidth,
-              strideHeight,
-              strideWidth,
-              paddingHeight,
-              paddingWidth,
-              outputHeight,
-              outputWidth,
-              imData);
-    CHECK_SYNC("Col2ImFunctor GPU failed");
-  }
-};
-
-template class Im2ColFunctor<DEVICE_TYPE_GPU, float>;
-template class Im2ColFunctor<DEVICE_TYPE_GPU, double>;
-template class Col2ImFunctor<DEVICE_TYPE_GPU, float>;
-template class Col2ImFunctor<DEVICE_TYPE_GPU, double>;
-
-}  // namespace paddle
diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h
new file mode 100644
index 0000000000000000000000000000000000000000..48e2e32f9256fb49c67ba25e9b5a47d72499758b
--- /dev/null
+++ b/paddle/function/Im2Col.h
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorShape.h"
+#include "TensorType.h"
+
+namespace paddle {
+
+/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */
+enum ColFormat { kCFO = 0, kOCF = 1 };
+
+/*
+ * \brief Converts the image data of three dimensions(CHW) into a colData of
+ *        five dimensions in the Im2ColFunctor calculation,
+ *        And in the Col2ImFunctor calculation, it is reversed.
+ *
+ * \param imData   Image data.
+ * \param imShape  The shape of imData,
+ *                 [inputChannels, inputHeight, inputWidth].
+ * \param colData  Column data.
+ * \param colShape The shape of colData.
+ *
+ * If the template argument Format is kCFO, the shape of colData is:
+ * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ * So, it is easy to reshape into a convolution matrix for convolution
+ * calculation based on matrix multiplication.
+ * The shape of convolution matrix is [height, width], where the height is equal
+ * inputChannels * filterHeight * filterWidth, and the width is equal
+ * outputHeight * outputWidth.
+ *
+ * Reshape:
+ *     shape of colData           shape of convolution matrix
+ *     [inputChannels,
+ *      filterHeight,
+ *      filterWidth,      ======>      [height, width]
+ *      outputHeight,
+ *      outputWidth]
+ *
+ * If the template argument Format is kOCF, the shape of colData is:
+ * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ * So, it is easy to reshape into a sequence matrix for rnn calculation.
+ * The shape of sequence matrix is [seqLength, stepSize], where the seqLength
+ * is equal outputHeight * outputWidth, and the stepSize is equal
+ * inputChannels * filterHeight * filterWidth.
+ *
+ * Reshape:
+ *     shape of colData             shape of sequence matrix
+ *     [outputHeight,
+ *      outputWidth,
+ *      inputChannels,    ======>    [seqLength, stepSize]
+ *      filterHeight,
+ *      filterWidth]
+ *
+ * \note The caller needs to ensure that imShape.inputChannels is equal to
+ *       colShape.inputChannels.
+ */
+template <ColFormat Format, DeviceType Device, class T>
+class Im2ColFunctor {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth);
+};
+
+template <ColFormat Format, DeviceType Device, class T>
+class Col2ImFunctor {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth);
+};
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b7d1eb1eded7a7471fd5833a649916d3ee3e598e
--- /dev/null
+++ b/paddle/function/Im2ColOp.cpp
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = float(0);
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..15ba854009636d027447d104071163100d5e3f4b
--- /dev/null
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -0,0 +1,381 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include "hl_device_functions.cuh"
+
+namespace paddle {
+
+template<class T>
+__global__
+void im2col(const T* data_im, int numOuts, int height, int width,
+            int blockH, int blockW,
+            int strideH, int strideW,
+            int paddingH, int paddingW,
+            int height_col, int width_col,
+            T* data_col) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;
+    int w_in = w_out * strideW;
+
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in+i);
+        int cIdx = int(w_in+j);
+        if ((rIdx-(int)paddingH) >= (int)height ||
+            (rIdx-(int)paddingH) < 0 ||
+            (cIdx-(int)paddingW) >= (int)width ||
+            (cIdx-(int)paddingW) < 0) {
+          *data_col = 0;
+        } else {
+          rIdx = rIdx + channel_in*height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx* width + cIdx];
+        }
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 -1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
+         strideHeight, strideWidth, paddingHeight, paddingWidth,
+         outputHeight, outputWidth, colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template<class T>
+__global__
+void col2im(size_t n, const T* data_col, size_t height,
+            size_t width, size_t channels,
+            size_t blockH, size_t blockW,
+            size_t strideH, size_t strideW,
+            size_t paddingH, size_t paddingW,
+            size_t height_col, size_t width_col,
+            T* data_im) {
+  size_t index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    if ((w - (int)paddingW) >= 0 &&
+        (w - (int)paddingW) < (width-2 * paddingW) &&
+        (h - (int)paddingH) >= 0 &&
+        (h - paddingH) < (height - 2 * paddingH)) {
+      // compute the start and end of the output
+      int w_col_start =
+        (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+      int w_col_end =
+        min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+        (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // the col location: [c * width * height + h_out, w_out]
+          int c_col = int(c * blockH* blockW) + \
+            (h - h_col * (int)strideH) * (int)blockW +
+            (w - w_col * (int)strideW);
+          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+        }
+      }
+      h -= paddingH;
+      w -= paddingW;
+      data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
+              h*(width-2*paddingW) + w] += val;
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
+        * (inputWidth + 2*paddingWidth);
+
+    size_t blocks = (numKernels + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid involving atomic operations, we will launch one kernel per
+    // bottom dimension, and then in the kernel add up the top dimensions.
+    col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+             (numKernels,
+              colData,
+              inputHeight + 2*paddingHeight,
+              inputWidth + 2*paddingWidth,
+              inputChannels,
+              filterHeight,
+              filterWidth,
+              strideHeight,
+              strideWidth,
+              paddingHeight,
+              paddingWidth,
+              outputHeight,
+              outputWidth,
+              imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+
+template<class T>
+__global__
+void im2colOCF(const T* imData, T* colData,
+               int inputChannels,
+               int inputHeight, int inputWidth,
+               int filterHeight, int filterWidth,
+               int strideHeight, int strideWidth,
+               int paddingHeight, int paddingWidth,
+               int outputHeight, int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth
+           + channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth
+          + channelId * filterHeight * filterWidth
+          + (shId * outputWidth + swId)
+          * (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= inputHeight || heightOffset < 0 ||
+            widthOffset >= inputWidth || widthOffset < 0) {
+          colData[colOffset] = T(0);
+        } else {
+          colData[colOffset] = imData[imOffset];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template<class T>
+__global__
+void col2imOCF(T* imData, const T* colData,
+               int inputChannels,
+               int inputHeight, int inputWidth,
+               int filterHeight, int filterWidth,
+               int strideHeight, int strideWidth,
+               int paddingHeight, int paddingWidth,
+               int outputHeight, int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth
+           + channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth
+          + channelId * filterHeight * filterWidth
+          + (shId * outputWidth + swId)
+          * (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= 0 && heightOffset < inputHeight &&
+            widthOffset >= 0 && widthOffset < inputWidth) {
+          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    col2imOCF<T><<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..acc88a553abe7ac58b629aba9b850df58cee7f81
--- /dev/null
+++ b/paddle/function/Im2ColTest.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+template <DeviceType Device, class T>
+void TestIm2ColFunctor() {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
+                  break;
+                if (padding >= filterHeight || padding >= filterWidth) break;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+
+                TensorShape imShape =
+                    TensorShape({channels, inputHeight, inputWidth});
+                TensorShape colShape1 = TensorShape({channels,
+                                                     filterHeight,
+                                                     filterWidth,
+                                                     outputHeight,
+                                                     outputWidth});
+                TensorShape colShape2 = TensorShape({outputHeight,
+                                                     outputWidth,
+                                                     channels,
+                                                     filterHeight,
+                                                     filterWidth});
+
+                size_t height = channels * filterHeight * filterWidth;
+                size_t width = outputHeight * outputWidth;
+                VectorPtr input1 = Vector::create(imShape.getElements(), false);
+                VectorPtr input2 = Vector::create(imShape.getElements(), false);
+                MatrixPtr output1 = Matrix::create(height, width, false, false);
+                MatrixPtr output2 = Matrix::create(width, height, false, false);
+                input1->uniform(0.001, 1);
+                input2->copyFrom(*input1);
+
+                Im2ColFunctor<kCFO, Device, T> im2Col1;
+                Im2ColFunctor<kOCF, Device, T> im2Col2;
+                im2Col1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                im2Col2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                // The transposition of the result of ColFormat == kCFO
+                // is equal to the result of ColFormat == kOCF.
+                MatrixPtr test;
+                output2->transpose(test, true);
+                autotest::TensorCheckErr(*output1, *test);
+
+                Col2ImFunctor<kCFO, Device, T> col2Im1;
+                Col2ImFunctor<kOCF, Device, T> col2Im2;
+                col2Im1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                col2Im2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                autotest::TensorCheckErr(*input1, *input2);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp
index 4348f0f775e9442c50a3c45b9a8e6dad5c6b198d..e0692fa06d6e0c35cfa742ca3eac7fe2037b1a80 100644
--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -90,8 +90,7 @@ public:
     ConvFunctionBase::init(config);
   }
 
-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
     const TensorShape& input = inputs[0].shape();
     const TensorShape& filter = inputs[1].shape();
     const TensorShape& output = outputs[0].shape();
diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu
index c0b947e224313abaf4fadfb8293dc78ca085ff84..d9dcc7d59d1e3c222f5a7ce448daa8d7edb6c978 100644
--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
@@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x,  const real* w,
   for (int i = tidy; i < context; i += blky) {
     sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
   }
-  
+
   __syncthreads();
 
   for (int i = 0; i < numSeq; ++i) {
@@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
       int yoff = start + j;
 
       // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
+      x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ?
+      dy[yoff * width + xoff] : 0.0;
       __syncthreads();
       if (tidy < (context - 1)) {
         yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ?
+        dy[yoff * width + xoff] : 0.0;
       }
       __syncthreads();
 
@@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
       int yoff = start + j;
 
       // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
+      x[yoff * width + xoff] : 0.0;
       __syncthreads();
 
       for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start &&
+        yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
         __syncthreads();
 
         real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
@@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
   for (int i = tidy; i < context; i += blky) {
     sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
   }
-  
+
   __syncthreads();
 
   for (int i = 0; i < numSeq; ++i) {
@@ -312,7 +317,7 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
     dim3 dimBlock(32, 32);
     dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
     real* dw = filterG.getData();
-    if (contextLength <= 32) { 
+    if (contextLength <= 32) {
       KeRowConvBwWeight<32, 32, 32>
         <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
         (dw, x, dy, starts, height, width, numSeq, contextLength);
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index e8080c3d714b324f072a380f738b9764477dfe04..f0ec77a5d00333993427fb8d0bc938c884e50c95 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/function/ConvOp.h"
 
 DEFINE_bool(nnpack_allocate_outside,
-            false,
+            true,
             "Allocate and free workspace memory outside the NNPACK interface.");
 DEFINE_int32(nnpack_num_threads,
              0,
@@ -58,18 +58,10 @@ public:
     workspaceBuffer_ = nullptr;
     workspaceSize_ = 0;
 
-    threadpool_ = nullptr;
-    if (FLAGS_nnpack_num_threads) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
+    create_nnpack_threadpool();
   }
 
   ~NNPACKConvFunction() {
-    if (threadpool_) {
-      pthreadpool_destroy(threadpool_);
-    }
     if (workspaceBuffer_) {
       free(workspaceBuffer_);
     }
@@ -225,14 +217,25 @@ public:
     }
   }
 
+  static void create_nnpack_threadpool() {
+    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
+
 private:
   nnp_convolution_algorithm algorithm_;
   nnp_convolution_transform_strategy transform_strategy_;
   void* workspaceBuffer_;
   size_t workspaceSize_;
-  pthreadpool_t threadpool_;
+  static pthreadpool_t threadpool_;
 };
 
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
+
 REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
 
 }  // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index a40530f41313be27dc1c2606501c6c00bed11c8b..81cc3c890b6d4ad048e4edc03208c85778244078 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -207,8 +207,8 @@ Error __must_check backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    Error status = softmax_.backward(argument_);
-    if (!status) return status;
+    Error err = softmax_.backward(argument_);
+    if (!err.isOK()) return err;
   }
   return Error();
 }
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 40036762179ebb1495b90907f16b97e3c60c50d8..265dbb54933540ff8b0d1e2e2d985b4b7fa51ecd 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -205,10 +205,8 @@ public:
       hl_destroy_event(hlEvent_);
       hlEvent_ = NULL;
     }
-    if (batchData_) {
-      delete batchData_;
-      batchData_ = NULL;
-    }
+    delete batchData_;
+    batchData_ = NULL;
   }
 
   void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 2e839f640503b8f4e390fc87d9d59960dbc37f6e..cfa80a89365af5111746eec9599d16e37532a9f7 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -403,7 +403,7 @@ public:
       : layerName_(layerName) {
     addEvaluator(std::move(evaluator));
   }
-  virtual void eval(const NeuralNetwork& nn) override {
+  void eval(const NeuralNetwork& nn) override {
     const LayerPtr& layer = nn.getLayer(layerName_);
     CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
                  << nn.getName();
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 41e0929959bafa070190b0c3ab32bee0efc0d737..f98bf95064fa539b990309dfe0bff10c1e99d096 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -636,7 +636,7 @@ void lenToStarts(std::vector<int>& starts) {
   }
   starts.back() = pos;
 }
-}
+}  // namespace
 
 void RecurrentGradientMachine::calcSequenceStartPositions() {
   std::vector<int> starts(commonSeqInfo_.size() + 1);
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 15e7411b5fde0fa3a532394cf7d0e8477ef052d0..bdae7e623ae0472d4fe5ef3a88fc1e93bbf1e52c 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec,
     dest[index[i]] = src[i];
   }
 }
-}
+}  // namespace
 
 void GatherAgentLayer::forwardIds(PassType passType) {
   IVectorPtr realId = realLayers_[0]->getOutputLabel();
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 2bafeb92158c56efe32f90742807f0af07bda5af..3b1f34635917290c6e4a9230b546892cc5cb7bfa 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,6 +37,22 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "BlockExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+  createFunction(backward_,
+                 "BlockExpandGrad",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+
   return true;
 }
 
@@ -63,48 +79,27 @@ void BlockExpandLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
-  Argument& out = getOutput();
-  MatrixPtr outV = getOutputValue();
 
-  MatrixPtr input = getPrev(0)->getOutputValue();
-  Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
   ICpuGpuVector::resizeOrCreate(
       out.sequenceStartPositions, batchSize + 1, false);
   IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    outVTrans_->zeroMem();
-    /* expand each block as one row */
-    MatrixPtr inputTmp =
-        Matrix::create(input->getData() + i * input->getWidth(),
-                       1,
-                       input->getWidth(),
-                       false,
-                       useGpu_);
-    outVTrans_->convExpand(*inputTmp,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_);
-    MatrixPtr outVTmp =
-        Matrix::create(outV->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    outVTrans_->transpose(outVTmp, false);
     start[i] = i * blockNum;
     dims[2 * i] = outputH_;
     dims[2 * i + 1] = outputW_;
@@ -113,48 +108,13 @@ void BlockExpandLayer::forward(PassType passType) {
 }
 
 void BlockExpandLayer::backward(const UpdateCallback& callback) {
-  size_t blockNum = outputH_ * outputW_;
-  size_t blockSize = blockH_ * blockW_ * channels_;
   /* Calculate the input layers error */
-  MatrixPtr preGrad = inputLayers_[0]->getOutputGrad();
-  if (!preGrad) {
-    return;
-  }
-  MatrixPtr grad = getOutputGrad();
-  MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_);
-  size_t batchSize = preGrad->getHeight();
-
-  CHECK_EQ(batchSize * blockNum, grad->getHeight());
-  CHECK_EQ(blockSize, grad->getWidth());
-
-  for (size_t i = 0; i < batchSize; i++) {
-    MatrixPtr gradTmp =
-        Matrix::create(grad->getData() + i * blockNum * blockSize,
-                       blockNum,
-                       blockSize,
-                       false,
-                       useGpu_);
-    gradTmp->transpose(gradTrans, false);
-    MatrixPtr preGradTmp =
-        Matrix::create(preGrad->getData() + i * preGrad->getWidth(),
-                       1,
-                       preGrad->getWidth(),
-                       false,
-                       useGpu_);
-    preGradTmp->convShrink(*gradTrans,
-                           imgSizeH_,
-                           imgSizeW_,
-                           channels_,
-                           blockH_,
-                           blockW_,
-                           strideH_,
-                           strideW_,
-                           paddingH_,
-                           paddingW_,
-                           outputH_,
-                           outputW_,
-                           1.0,
-                           1.0);
+  if (getInputGrad(0)) {
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*getOutputGrad(), outputShape_);
+    outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO);
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index 8f347400e60ec84fc1b5fdbc1c911a8768b306d0..15ce73ab8b2ca16ba1e9329ed5c00dc7239e8b93 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -50,8 +50,8 @@ protected:
   size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_;
   size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_;
 
-  /// auxiliary variable, which saves the transposed output value.
-  MatrixPtr outVTrans_;
+  TensorShape inputShape_;
+  TensorShape outputShape_;
 
 public:
   explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {}
diff --git a/paddle/gserver/layers/ClipLayer.cpp b/paddle/gserver/layers/ClipLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..13f16c953793b82183237188b56eb61d76ecd2fd
--- /dev/null
+++ b/paddle/gserver/layers/ClipLayer.cpp
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for clipping the input value by the threshold.
+ * \f[
+ *   out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
+ * \f]
+ */
+
+class ClipLayer : public Layer {
+protected:
+  double min_;
+  double max_;
+
+public:
+  explicit ClipLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(clip, ClipLayer);
+
+bool ClipLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+  auto layerConf = config_.inputs(0).clip_conf();
+  min_ = layerConf.min();
+  max_ = layerConf.max();
+  CHECK_LT(min_, max_);
+  return true;
+}
+
+void ClipLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+  resetOutput(inV->getHeight(), inV->getWidth());
+  MatrixPtr outV = getOutputValue();
+  outV->copyFrom(*inV);
+  outV->clip(min_, max_);
+}
+
+void ClipLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  if (inG) {
+    MatrixPtr outV = getOutputValue();
+    MatrixPtr outG = getOutputGrad();
+    MatrixPtr tmpMtx;
+    Matrix::resizeOrCreate(
+        tmpMtx, outG->getHeight(), outG->getWidth(), false, useGpu_);
+    tmpMtx->clipDerivative(*inV, min_, max_);
+    inG->addDotMul(*outG, *tmpMtx, 1, 1);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
index d1e932ded595c90cbe6040c330c5c8663d81e2b4..eb6b0445c95a9e9a7acd5d693ecdb11a263f41fd 100644
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -87,9 +87,6 @@ void ConvBaseProjection::initCudnn() {
   bwdDataLimitBytes_ = 0;
   bwdFilterLimitBytes_ = 0;
   workSpaceInBytes_ = 0;
-
-  batchNum_ = 0;
-  isSelectAlgo_ = false;
 }
 
 void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
@@ -142,32 +139,25 @@ void ConvBaseProjection::reshape(int batchSize) {
   CHECK_EQ(width, out_->value->getWidth());
   CHECK_EQ(calInputSize(), in_->value->getWidth());
 
-  isSelectAlgo_ = (batchSize == batchNum_);
-  batchNum_ = batchSize;
-
-  if (!isSelectAlgo_) {
-    reshapeTensorDesc(batchSize);
-    hl_conv_workspace(imageDesc_,
-                      outputDesc_,
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-    workSpaceInBytes_ = maxWorkSpace;
-
-    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-            << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-  }
-
-  isSelectAlgo_ = true;
+  reshapeTensorDesc(batchSize);
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_);
+
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+  workSpaceInBytes_ = maxWorkSpace;
+
+  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
+          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
 }
 
 void *ConvBaseProjection::getSpaceBytes(size_t size) {
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
index 4a33aa1837dfc36dbead60deaccbc6b772fe4754..e9d9f8f1b2937b3a3b7323c43ef5608ffc5f82ca 100644
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -101,12 +101,6 @@ protected:
   size_t bwdFilterLimitBytes_;
   /// Size of total work space.
   size_t workSpaceInBytes_;
-
-  /// Whether to call cuDNN api to choose conv algorithm.
-  bool isSelectAlgo_;
-  /// batchNum is used to record batch size. If the batch size is changed,
-  /// the selection algorithm will be called.
-  int batchNum_;
   bool bias_;
 
   std::unique_ptr<Weight> weight_;
diff --git a/paddle/gserver/layers/CropLayer.cpp b/paddle/gserver/layers/CropLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..69ad913420bdb6e1b2ed0618b7f9b78d7477be99
--- /dev/null
+++ b/paddle/gserver/layers/CropLayer.cpp
@@ -0,0 +1,146 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CropLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+
+REGISTER_LAYER(crop, CropLayer);
+
+bool CropLayer::init(const LayerMap& layerMap,
+                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  CHECK_LE(static_cast<int>(inputLayers_.size()), 2);
+  CHECK_GE(static_cast<int>(inputLayers_.size()), 1);
+  crop_axis_ = config_.axis();
+  for (int i = 0; i < config_.offset_size(); i++) {
+    crop_offsets_.push_back(config_.offset(i));
+  }
+
+  // 1. get input_0 shape
+  auto& input0_img_conf = config_.inputs(0).image_conf();
+  inDims_ = TensorShape({0,
+                         input0_img_conf.channels(),
+                         input0_img_conf.has_img_size_y()
+                             ? input0_img_conf.img_size_y()
+                             : input0_img_conf.img_size(),
+                         input0_img_conf.img_size()});
+  // 2. get target dims from config
+  if (config_.inputs_size() == 1) {
+    targetDims_ = TensorShape({config_.shape(0),
+                               config_.shape(1),
+                               config_.shape(2),
+                               config_.shape(3)});
+  } else {
+    // 2. get input_1 shape
+    auto& input1_img_conf = config_.inputs(1).image_conf();
+    targetDims_ = TensorShape({0,
+                               input1_img_conf.channels(),
+                               input1_img_conf.has_img_size_y()
+                                   ? input1_img_conf.img_size_y()
+                                   : input1_img_conf.img_size(),
+                               input1_img_conf.img_size()});
+  }
+
+  // 3. get final crop corner
+  int dimSize = 4;
+  crop_corner_ = {0, 0, 0, 0};
+  for (int i = 0; i < dimSize; i++) {
+    if (i >= crop_axis_) {
+      if (crop_offsets_.size() > 1) {
+        crop_corner_[i] = crop_offsets_[i - crop_axis_];
+      } else {
+        crop_corner_[i] = crop_offsets_[0];
+      }
+    }
+  }
+
+  outDims_ = TensorShape(4);
+
+  createFunction(
+      forward_, "Crop", FuncConfig().set("crop_corner", crop_corner_));
+  createFunction(
+      backward_, "CropGrad", FuncConfig().set("crop_corner", crop_corner_));
+
+  return true;
+}
+
+void CropLayer::setOutDims() {
+  MatrixPtr input = inputLayers_[1]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  // get target dims from input_1
+  if (config_.inputs_size() == 2) {
+    targetDims_.setDim(0, batchSize);
+    int ch = config_.inputs(0).image_conf().channels();
+    if (ch != 0) targetDims_.setDim(1, ch);
+    int h = inputLayers_[1]->getOutput().getFrameHeight();
+    if (h != 0) targetDims_.setDim(2, h);
+    int w = inputLayers_[1]->getOutput().getFrameWidth();
+    if (w != 0) targetDims_.setDim(3, w);
+  }
+  // get final crop shape from target dims and crop axis
+  std::vector<uint32_t> crop_shape;
+  int dimSize = 4;
+  for (int i = 0; i < dimSize; i++) {
+    if (i >= crop_axis_) {
+      crop_shape.push_back(targetDims_[i]);
+    } else {
+      crop_shape.push_back(inDims_[i]);
+    }
+  }
+
+  outDims_.reshape(
+      {crop_shape[0], crop_shape[1], crop_shape[2], crop_shape[3]});
+  output_.setFrameHeight(crop_shape[2]);
+  output_.setFrameWidth(crop_shape[3]);
+}
+
+void CropLayer::setInDims() {
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+}
+
+void CropLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  setInDims();
+  setOutDims();
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(outDims_[0], size);
+  MatrixPtr outV = getOutputValue();
+  REGISTER_TIMER_INFO("CropForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+void CropLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  REGISTER_TIMER_INFO("CropBackward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CropLayer.h b/paddle/gserver/layers/CropLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b6202621023575c1c83049ecbd019656c726e3f
--- /dev/null
+++ b/paddle/gserver/layers/CropLayer.h
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer crop input according to the specify conf.
+ *         input_0: input to be cropped
+ *         input_1: optional reference input
+ *         axis: start dimension to be croped
+ *         offset: offset of cropping  in each dimension
+ *         shape: if reference input layer was not setted,
+ *                  crop input as this shape conf
+ */
+class CropLayer : public Layer {
+public:
+  explicit CropLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~CropLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  void setOutDims();
+  void setInDims();
+
+  int32_t crop_axis_;
+  std::vector<uint32_t> crop_offsets_;
+  std::vector<uint32_t> crop_corner_;
+  TensorShape inDims_;
+  TensorShape targetDims_;
+  TensorShape outDims_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index af79e65a7c09e5a1b55febf1df1e8f5bb61bdcb8..783e02e47cb91e28eb88b079f1e94439d34fa775 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -38,10 +38,25 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
   inputShape_.resize(numInputs);
   filterShape_.resize(numInputs);
   outputShape_.resize(numInputs);
+
+  std::string convType;
+  std::string convGradInputType;
+  std::string convGradFilterType;
+
   for (int i = 0; i < config_.inputs_size(); i++) {
     std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
     std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
 
+    if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
+      convType = "DepthwiseConv";
+      convGradInputType = "DepthwiseConvGradInput";
+      convGradFilterType = "DepthwiseConvGradFilter";
+    } else {
+      convType = "GemmConv";
+      convGradInputType = "GemmConvGradInput";
+      convGradFilterType = "GemmConvGradFilter";
+    }
+
     if (FLAGS_use_nnpack) {
       CHECK_EQ(isDeconv_, false);
       createFunction(forward_,
@@ -53,21 +68,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                          .set("algo", std::string("auto")));
     } else {
       createFunction(forward_,
-                     !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                     !isDeconv_ ? convType : convGradInputType,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
                          .set("groups", (size_t)groups_[i]));
 
       createFunction(backward_,
-                     !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                     !isDeconv_ ? convGradInputType : convType,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
                          .set("groups", (size_t)groups_[i]));
 
       createFunction(backward_,
-                     "GemmConvGradFilter",
+                     convGradFilterType,
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 4b92b5d163ad107c0783beae45f8c936112fcccf..d5621412caee843e24a0d0c9b7096402765738c7 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -359,12 +359,11 @@ void Layer::backwardActivation() {
   /* Do error clipping */
   if (config_.error_clipping_threshold() > 0.0f) {
     if (FLAGS_log_error_clipping) {
-      CpuVector outGradVec(0, nullptr);
-      outGradVec.subVecFrom(
-          output_.grad->getData(), 0, output_.grad->getElementCnt());
-      real maxAbsGrad = outGradVec.getAbsMax();
+      VectorPtr outGradVec = Vector::create(
+          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
+      real maxAbsGrad = outGradVec->getAbsMax();
       if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize();
+        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
         LOG(INFO) << " layer=" << config_.name() << " need clipping,"
                   << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
       }
diff --git a/paddle/gserver/layers/RowL2NormLayer.cpp b/paddle/gserver/layers/RowL2NormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0d609be43b73a86d0d0f7b60be993836e2ea6fff
--- /dev/null
+++ b/paddle/gserver/layers/RowL2NormLayer.cpp
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * A layer for L2 normalization in each row,
+ * \f[
+ *   out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+ * \f]
+ * where the size of \f$in\f$ is (batchSize x dataDim),
+ * and the size of \f$out\f$ is (batchSize x dataDim).
+ */
+
+class RowL2NormLayer : public Layer {
+protected:
+  MatrixPtr inSquare_;
+  MatrixPtr l2NormReciprocal_;
+  MatrixPtr dotSum_;
+
+public:
+  explicit RowL2NormLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(row_l2_norm, RowL2NormLayer);
+
+bool RowL2NormLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(inputLayers_.size(), 1U);
+
+  return true;
+}
+
+void RowL2NormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr inV = getInputValue(0);
+
+  /* malloc memory for the output_ if necessary */
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = getSize();
+  CHECK_EQ(dataDim, inV->getWidth());
+  resetOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(inSquare_, batchSize, dataDim, false, useGpu_);
+  inV->square2(*inSquare_);
+  Matrix::resizeOrCreate(l2NormReciprocal_, batchSize, 1, false, useGpu_);
+  inSquare_->rowSum(*l2NormReciprocal_);
+  l2NormReciprocal_->sqrt2(*l2NormReciprocal_);
+  l2NormReciprocal_->scalarDiv(*l2NormReciprocal_, 1.0);
+  outV->rowScale(0, *inV, *l2NormReciprocal_);
+}
+
+void RowL2NormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr outV = getOutputValue();
+  MatrixPtr outG = getOutputGrad();
+  size_t batchSize = inV->getHeight();
+
+  // inG[ij] += outG[ij] / l2NormReciprocal
+  // inG[ij] += -inV[ij] * l2NormReciprocal * l2NormReciprocal * DotMul(outG[i],
+  // inV[i])
+  if (inG) {
+    Matrix::resizeOrCreate(dotSum_, batchSize, 1, false, useGpu_);
+    dotSum_->zeroMem();
+    dotSum_->rowDotMul(0, *outG, *outV);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    dotSum_->dotMul(*dotSum_, *l2NormReciprocal_);
+    inSquare_->rowScale(0, *inV, *dotSum_);
+    inG->sub(*inSquare_);
+    inG->addRowScale(0, *outG, *l2NormReciprocal_);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SliceProjection.cpp b/paddle/gserver/layers/SliceProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..267dd6154b1b21cc9b936384d438a2c3bdf0c246
--- /dev/null
+++ b/paddle/gserver/layers/SliceProjection.cpp
@@ -0,0 +1,96 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Projection.h"
+
+namespace paddle {
+
+/**
+ * SliceProjection can slice the input value into multiple parts,
+ * and then select some of them to merge into a new output.
+ *
+ * First, calculate the slices that need to be merged into the output.
+ * slices = input.slices().for_output()
+ *
+ * Second, merge each slice into the output.
+ * for(auto slice: slices) {
+ *   out.addAtOffset(slice, offset);
+ * }
+ *
+ * Input slices as output: s0, s1, ...:
+ *   -----------------------
+ *   |///|   |//////|      |
+ *   |/s0|   |//s1//|      |
+ *   |///|   |//////|      |
+ *   -----------------------
+ * Output, merge s0, s1, ... into one output:
+ *   ----------------
+ *   |///|//////|   |
+ *   |/s0|//s1//|...|
+ *   |///|//////|   |
+ *   ----------------
+ *
+ * The config file api is slice_projection.
+ */
+class SliceProjection : public Projection {
+public:
+  SliceProjection(const ProjectionConfig& config,
+                  const ParameterPtr& parameter,
+                  bool useGpu);
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+
+protected:
+  std::vector<std::pair<size_t, size_t>> slices_;
+};
+
+REGISTER_PROJECTION(slice, SliceProjection);
+
+/**
+ * Constructed function.
+ * @note SliceProjection should not have any parameter.
+ */
+SliceProjection::SliceProjection(const ProjectionConfig& config,
+                                 const ParameterPtr& parameter,
+                                 bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(!parameter) << "'slice' projection should not have any parameter";
+
+  slices_.reserve(config.slices_size());
+  for (const auto& slice : config.slices()) {
+    slices_.push_back(std::make_pair(slice.start(), slice.end()));
+  }
+}
+
+void SliceProjection::forward() {
+  size_t offset = 0;
+  for (auto& slice : slices_) {
+    auto slice_out = in_->value->subColMatrix(slice.first, slice.second);
+    out_->value->addAtOffset(*slice_out, offset);
+    offset += slice_out->getWidth();
+  }
+}
+
+void SliceProjection::backward(const UpdateCallback& callback) {
+  if (in_->grad) {
+    size_t offset = 0;
+    for (auto& slice : slices_) {
+      auto slice_out = in_->grad->subColMatrix(slice.first, slice.second);
+      slice_out->addAtOffset(*out_->grad, offset);
+      offset += slice_out->getWidth();
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 92f6cbcfe5a0e23c5939b1689a3e339367450387..a43adc7ce7db937bd62ea9bf1533b8a5899c259a 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -56,7 +56,7 @@ add_test(NAME test_DetectionOutput
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
     LayerGradUtil.cpp)
-    
+
 add_test(NAME test_ConvUnify
     COMMAND test_ConvUnify)
 ################# test_BatchNorm #######################
diff --git a/paddle/gserver/tests/concat_slice_a.conf b/paddle/gserver/tests/concat_slice_a.conf
new file mode 100644
index 0000000000000000000000000000000000000000..dccf911089e16f4f97b1470ee39d192d4557d4bd
--- /dev/null
+++ b/paddle/gserver/tests/concat_slice_a.conf
@@ -0,0 +1,41 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 4), (4, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 5), (5, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
diff --git a/paddle/gserver/tests/concat_slice_b.conf b/paddle/gserver/tests/concat_slice_b.conf
new file mode 100644
index 0000000000000000000000000000000000000000..29686ef2810370af3f84b60b2450d5c7d2e7663d
--- /dev/null
+++ b/paddle/gserver/tests/concat_slice_b.conf
@@ -0,0 +1,41 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+
+data = data_layer(name ="input", size=8*16*16)
+
+conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                       num_channels=8,
+                       num_filters=16, stride=1,
+                       bias_attr=False,
+                       act=ReluActivation())
+
+proj1 = slice_projection(input=conv1, slices=[(0, 12)])
+
+proj2 = slice_projection(input=conv2, slices=[(1, 15)])
+
+concat = concat_layer(input=[proj1, proj2])
+
+outputs(concat)
+
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 67251f08e34faff57d9e6fd6a1163ba655619a8b..fe11278f41c0118ee0bdb34f17fbf9602e0fa76b 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -152,6 +152,26 @@ TEST(Projection, identity) {
   }
 }
 
+TEST(Projection, slice) {
+  ProjectionConfig conf;
+  conf.set_type("slice");
+  conf.set_input_size(100);
+  SliceConfig& slice1 = *conf.add_slices();
+  slice1.set_start(10);
+  slice1.set_end(20);
+  SliceConfig& slice2 = *conf.add_slices();
+  slice2.set_start(50);
+  slice2.set_end(70);
+  conf.set_output_size(30);
+  for (auto useGpu : {false, true}) {
+    testProjectionGrad(conf,
+                       INPUT_DATA,
+                       /* parameterSize */ 0,
+                       /* batchSize */ 10,
+                       useGpu);
+  }
+}
+
 TEST(Projection, scaling) {
   ProjectionConfig conf;
   conf.set_type("scaling");
@@ -347,6 +367,55 @@ TEST(Layer, CosSimVecMatLayer) {
   }
 }
 
+void testDepthwiseConvLayer(const string& type, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 32;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(32);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(16);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(16);
+  conv->set_img_size_y(8);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
+}
+
+TEST(Layer, depthwiseConvLayer) {
+  //  'depthwise_conv' is a sepecial case of 'exconv' whose
+  //  groups size equals to the input channels size.
+  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
+#endif
+}
+
 void testConvLayer(const string& type, bool trans, bool useGpu) {
   TestConfig config;
   config.biasSize = 16;
@@ -1802,6 +1871,64 @@ TEST(Layer, RowConvLayer) {
   }
 }
 
+TEST(Layer, CropLayer) {
+  TestConfig config;
+  // config input_0
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ImageConfig* img = input->mutable_image_conf();
+  img->set_channels(4);
+  img->set_img_size(16);
+  config.layerConfig.set_axis(2);
+  config.layerConfig.add_offset(0);
+  config.layerConfig.add_offset(0);
+
+  // config input_1
+  config.inputDefs.push_back({INPUT_DATA, "layer_1", 128, 0});
+  input = config.layerConfig.add_inputs();
+  img = input->mutable_image_conf();
+  img->set_channels(2);
+  img->set_img_size(8);
+
+  // config crop layer
+  config.layerConfig.set_type("crop");
+  config.layerConfig.set_name("cropLayer");
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "crop", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, ClipLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("clip");
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ClipConfig* layerConf = input->mutable_clip_conf();
+  double p1 = std::rand() / (double)RAND_MAX;
+  double p2 = std::rand() / (double)RAND_MAX;
+  layerConf->set_min(std::min(p1, p2));
+  layerConf->set_max(std::max(p1, p2));
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "clip", batchSize, false, useGpu, false);
+  }
+}
+
+TEST(Layer, RowL2NormLayer) {
+  const size_t batchSize = 128;
+  const size_t size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_l2_norm");
+  config.layerConfig.set_size(size);
+  config.inputDefs.push_back({INPUT_DATA, "input", size, 0});
+  config.layerConfig.add_inputs();
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_l2_norm", batchSize, false, useGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 40e662b22bac0a2d22aea31fe99b11695bac3f57..f930c72fde3f5e0a6a45cb6bfd3507a4f48028fc 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -237,6 +237,12 @@ TEST(Compare, concat_table) {
   compareNetwork(config_file_a, config_file_b);
 }
 
+TEST(Compare, concat_slice) {
+  std::string config_file_a = "./gserver/tests/concat_slice_a.conf";
+  std::string config_file_b = "./gserver/tests/concat_slice_b.conf";
+  compareNetwork(config_file_a, config_file_b);
+}
+
 #ifndef PADDLE_ONLY_CPU
 TEST(Compare, img_pool) {
   std::string config_file_a = "./gserver/tests/img_pool_a.conf";
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index de48b6fac9c7d8125a552022c52353ef6bcef995..6db5965789b3750f46731f157167150583130d0a 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -442,6 +442,12 @@ DEFINE_MATRIX_UNARY_PARAMETER_OP(Clip, TWO_PARAMETER,
 template<class T>
 void BaseMatrixT<T>::clip(T p1, T p2) { applyUnary(unary::Clip<T>(p1, p2)); }
 
+DEFINE_MATRIX_BINARY_PARAMETER_OP(ClipDerivative, TWO_PARAMETER, a = b < p1 ? 0 : (b > p2 ? 0 : 1));
+template<class T>
+void BaseMatrixT<T>::clipDerivative(BaseMatrixT& b, T p1, T p2) {
+  applyBinary(binary::ClipDerivative<T>(p1, p2), b);
+}
+
 DEFINE_MATRIX_UNARY_PARAMETER_OP(BiggerThanScalar, ONE_PARAMETER,
                                  a = a > p ? 1.0f : 0.0f);
 template<class T>
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 120d69f718b954925438fbd2119d69f0be13b3e9..12ad2d45a0bbff182e78da6efb3c5ff4c6b59b55 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -488,6 +488,13 @@ public:
    */
   void clip(T p1, T p2);
 
+  /**
+   * this = b < low ? 0 : 1
+   *
+   * this = b > high ? 0 : 1
+   */
+  void clipDerivative(BaseMatrixT& b, T p1, T p2);
+
   /**
    * @code
    * a = a > p ? 1.0f : 0.0f
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 7045562dd44f8f3e0be9181b32954c04f0865fa4..c8ba1074a1555bbddde7e5f0fb2a046138b27c09 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -202,7 +202,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
   return cblas_ddot(n, x, 1, y, 1);
 }
 
-#ifdef PADDLE_USE_MKL
+#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
 
 template <>
 void vExp<float>(const int n, const float* a, float* r) {
@@ -243,7 +243,55 @@ template <>
 void vAdd<double>(const int n, const double* a, const double* b, double* r) {
   vdAdd(n, a, b, r);
 }
+#else
+
+DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
+template <class T>
+void vExp(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
+      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
+template <class T>
+void vLog(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
+      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
+template <class T>
+void vPow(const int n, const T* a, const T b, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
+      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
+template <class T>
+void vAdd(const int n, const T* a, const T* b, T* r) {
+  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
+                                                     const_cast<T*>(a),
+                                                     const_cast<T*>(b),
+                                                     r,
+                                                     1,
+                                                     n,
+                                                     n,
+                                                     n,
+                                                     n);
+}
+
+template void vExp(const int n, const float* a, float* r);
+template void vExp(const int n, const double* a, double* r);
+template void vLog(const int n, const float* a, float* r);
+template void vLog(const int n, const double* a, double* r);
+template void vPow(const int n, const float* a, const float b, float* r);
+template void vPow(const int n, const double* a, const double b, double* r);
+template void vAdd(const int n, const float* a, const float* b, float* r);
+template void vAdd(const int n, const double* a, const double* b, double* r);
 
+#endif
+
+#ifdef PADDLE_USE_MKL
 template <>
 void vInvSqrt<float>(const int n, const float* a, float* r) {
   vsInvSqrt(n, a, r);
@@ -275,20 +323,6 @@ void vTanh<double>(const int n, const double* a, double* r) {
 }
 #else
 
-DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
-template <class T>
-void vExp(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
-      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
-template <class T>
-void vLog(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
-      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) {
       binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
-      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r) {
-  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
-                                                     const_cast<T*>(a),
-                                                     const_cast<T*>(b),
-                                                     r,
-                                                     1,
-                                                     n,
-                                                     n,
-                                                     n,
-                                                     n);
-}
-
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const float* a, float* r);
 template void vLog1p(const int n, const float* a, float* r);
 template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
 
 #endif
 
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8ada0d34c6733d13a45505492909124010c85a91..637643838ff433753e0cbb9154ee069c2f7c6d15 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,6 +15,12 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
 
+#ifdef PADDLE_USE_MKLML
+#include <mkl_cblas.h>
+#include <mkl_lapacke.h>
+#include <mkl_vml_functions.h>
+#endif
+
 #ifdef PADDLE_USE_MKL
 #include <mkl.h>
 #include <mkl_lapacke.h>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 4431d613f655c1d0c8da13bb5ac9225980c650ad..27f7d95b752d4a423bf99fa425b10b2816575d6a 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) {
   LOG(INFO) << "the  diffCnt is " << diffCnt;
 }
 
-void GpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == true) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  hl_expand_feature2col(feature.getData(),
-                        channels,
-                        feaImgHeight,
-                        feaImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData());
-}
-
-void GpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockW * blockH * channels;
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-  hl_shrink_col2feature(expandFeat.getData(),
-                        channels,
-                        thisImgHeight,
-                        thisImgWidth,
-                        blockH,
-                        blockW,
-                        strideH,
-                        strideW,
-                        paddingH,
-                        paddingW,
-                        outputH,
-                        outputW,
-                        getData(),
-                        alpha,
-                        beta);
-}
-
 void GpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
@@ -1777,103 +1702,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(info, 0);
 }
 
-void CpuMatrix::convExpand(Matrix& feature,
-                           int feaImgHeight,
-                           int feaImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW) {
-  CHECK(feature.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels),
-           feature.getHeight() * feature.getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-  CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal";
-
-  int channelsCol = channels * blockH * blockW;
-  real* srcData = feature.getData();
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockH / blockW;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        // no c_im*height to Exclude the channel number
-        int imgRowIdx = h * strideH + hOffset;
-        int imgColIdx = w * strideW + wOffset;
-        if ((imgRowIdx - paddingH) < 0 ||
-            (imgRowIdx - paddingH) >= feaImgHeight ||
-            (imgColIdx - paddingW) < 0 ||
-            (imgColIdx - paddingW) >= feaImgWidth) {
-          data_[(c * outputH + h) * outputW + w] = 0;
-        } else {
-          imgRowIdx += c_im * feaImgHeight - paddingH;
-          imgColIdx -= paddingW;
-          data_[(c * outputH + h) * outputW + w] =
-              srcData[imgRowIdx * feaImgWidth + imgColIdx];
-        }
-      }
-    }
-  }
-}
-
-void CpuMatrix::convShrink(Matrix& expandFeat,
-                           int thisImgHeight,
-                           int thisImgWidth,
-                           int channels,
-                           int blockH,
-                           int blockW,
-                           int strideH,
-                           int strideW,
-                           int paddingH,
-                           int paddingW,
-                           int outputH,
-                           int outputW,
-                           real alpha,
-                           real beta) {
-  CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal";
-  CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels),
-           getHeight() * getWidth())
-      << "Matrix dimensions are not equal";
-
-  size_t elemCnt = outputH * outputW * blockH * blockW * channels;
-
-  CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth())
-      << "Matrix dimensions are not equal";
-
-  real* expandData = expandFeat.getData();
-  int channelsCol = channels * blockH * blockW;
-  for (int c = 0; c < channelsCol; ++c) {
-    int wOffset = c % blockW;
-    int hOffset = (c / blockW) % blockH;
-    int c_im = c / blockW / blockH;
-    for (int h = 0; h < outputH; ++h) {
-      for (int w = 0; w < outputW; ++w) {
-        int imRowIdx = h * strideH + hOffset;
-        int imColIdx = w * strideW + wOffset;
-        if ((imRowIdx - paddingH) >= 0 &&
-            (imRowIdx - paddingH) < thisImgHeight &&
-            (imColIdx - paddingW) >= 0 &&
-            (imColIdx - paddingW) < thisImgWidth) {
-          imRowIdx += c_im * thisImgHeight - paddingH;
-          imColIdx -= paddingW;
-          data_[imRowIdx * thisImgWidth + imColIdx] =
-              alpha * expandData[(c * outputH + h) * outputW + w] +
-              beta * data_[imRowIdx * thisImgWidth + imColIdx];
-        }
-      }
-    }
-  }
-}
-
 void CpuMatrix::maxPoolForward(Matrix& inputMat,
                                size_t imgSizeH,
                                size_t imgSizeW,
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 7dfd593225065e18830b2b0c0ce854fe7a2d5178..bb802bbb2c75289a45d987b22ad41ce8b1e95c98 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -859,49 +859,6 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * This function is used to calculate the convolution:
-   *
-   * It will expand a feature matrix according to the
-   * convolution filters
-   */
-  virtual void convExpand(Matrix& feature,
-                          int feaImgHeight,
-                          int feaImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  /**
-   * This function is the reverse implementation of convExpand:
-   *
-   * Its function is to restore a expanded-matrix into a feature matrix
-   */
-  virtual void convShrink(Matrix& expandColMat,
-                          int thisImgHeight,
-                          int thisImgWidth,
-                          int channels,
-                          int blockH,
-                          int blockW,
-                          int strideH,
-                          int strideW,
-                          int paddingH,
-                          int paddingW,
-                          int outputH,
-                          int outputW,
-                          real alpha = 1.0f,
-                          real beta = 0.0f) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * Pooling forward operation, pick out the largest element
    * in the sizeX of value
@@ -1335,34 +1292,6 @@ public:
 
   void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandColMat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blochW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingWreal,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,
@@ -1522,34 +1451,6 @@ public:
 
   MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
 
-  void convExpand(Matrix& feature,
-                  int feaImgHeight,
-                  int feaImgWidth,
-                  int channels,
-                  int blcokH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW);
-
-  void convShrink(Matrix& expandFeat,
-                  int thisImgHeight,
-                  int thisImgWidth,
-                  int channels,
-                  int blockH,
-                  int blockW,
-                  int strideH,
-                  int strideW,
-                  int paddingH,
-                  int paddingW,
-                  int outputH,
-                  int outputW,
-                  real alpha = 1.0f,
-                  real beta = 0.0f);
-
   void maxPoolForward(Matrix& inputMat,
                       size_t imgSizeH,
                       size_t imgSizeW,
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 7ce17a3207becb176a852a16fca52376009db9ee..4adaaef9838f0d178468af3af142031325bfc11d 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
 StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
 
 StorageEngine::~StorageEngine() {
-  if (cpuAllocator_) {
-    delete cpuAllocator_;
-  }
+  delete cpuAllocator_;
   for (auto it : gpuAllocator_) {
     delete it;
   }
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 354f58df39365410ff9aec2576c768e58db9e0d2..4980208e659233d50cd464dfeb213adfd2be3f38 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -1141,4 +1141,64 @@ TEST(CpuMatrix, copyFrom) {
   TensorCheckEqual(cpu, copy);
 }
 
+void testBatch2seqPadding(int batchSize, int inputDim) {
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);
+
+  IVectorPtr cpuSequence;
+  generateSequenceStartPositions(batchSize, cpuSequence);
+  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
+  gpuSequence->copyFrom(*cpuSequence);
+
+  size_t numSeq = cpuSequence->getSize() - 1;
+  size_t maxSeqLen = *std::max_element(cpuSequence->getData(),
+                                       cpuSequence->getData() + numSeq);
+
+  MatrixPtr cBatch = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr gBatch = std::make_shared<GpuMatrix>(numSeq * maxSeqLen, inputDim);
+  MatrixPtr cCheck = std::make_shared<CpuMatrix>(numSeq * maxSeqLen, inputDim);
+
+  hl_sequence2batch_copy_padding(gBatch->getData(),
+                                 gpuInput->getData(),
+                                 cpuSequence->getData(),
+                                 inputDim,
+                                 maxSeqLen,
+                                 numSeq,
+                                 false,
+                                 true);
+  cCheck->copyFrom(*gBatch);
+
+  int* seqStart = cpuSequence->getData();
+  float* batchData = cBatch->getData();
+  float* seqData = cpuInput->getData();
+  for (size_t i = 0; i < maxSeqLen; i++) {
+    for (size_t j = 0; j < numSeq; j++) {
+      size_t sequenceStart = seqStart[j];
+      size_t sequenceLength = seqStart[j + 1] - seqStart[j];
+      if (i < sequenceLength) {
+        memcpy(batchData + (i * numSeq + j) * inputDim,
+               seqData + (sequenceStart + i) * inputDim,
+               inputDim * sizeof(real));
+      } else {
+        memset(batchData + (i * numSeq + j) * inputDim,
+               0,
+               inputDim * sizeof(real));
+      }
+    }
+  }
+
+  TensorCheckErr(*cBatch, *cCheck);
+}
+
+TEST(Matrix, warpCTC) {
+  for (auto batchSize : {51, 526, 2884}) {
+    for (auto inputDim : {32, 512, 2026}) {
+      VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim;
+      testBatch2seqPadding(batchSize, inputDim);
+    }
+  }
+}
+
 #endif
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 3943c3cfad31d13a00645aba6fc153d3d13da987..8035d93bfec75b20a54c5af0521ab724cafba8ca 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1 +1,16 @@
 add_subdirectory(detail)
+
+cc_library(memory SRCS memory.cc)
+cc_library(memcpy SRCS memcpy.cc DEPS device_context)
+
+cc_library(paddle_memory
+    DEPS
+    memory
+    memcpy
+    meta_data
+    meta_cache
+    memory_block
+    buddy_allocator
+    system_allocator)
+
+cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
diff --git a/paddle/memory/README.md b/paddle/memory/README.md
index 96a331a486f57d3e030408fee182199bad5b38c2..7f95e80f980b0c0b93ecb418e6b923045313eaa5 100644
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
@@ -1,140 +1,4 @@
-## Design
+# Region-based Heterogeneous Memory Management
 
-### Usage
-
-To allocate 4KB CPU memory:
-
-```cpp
-p = memory::Alloc(platform::CPUPlace(), 4*1024);
-```
-
-To allocate 4KB memory on the 3rd GPU:
-
-```cpp
-p = memory::Alloc(platform::GPUPlace(2), 4*1024);
-```
-
-To free memory and check the so-far used amount of memory on a place:
-
-```cpp
-auto pl = platform::GPUPlace(0);
-p = memory::Alloc(pl, 4*1024);
-cout << memory::Used(pl);
-memory::Free(pl, p);
-```
-
-### API
-
-In `paddle/memory/memory.h` we have:
-
-```cpp
-namespace memory {
-template <typename Place> void* Alloc(Place, size_t);
-template <typename Place> void Free(Place, void*);
-template <typename Place> size_t Used(Place);
-}  // namespace memory
-```
-
-These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
-
-```cpp
-template<>
-void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
-  return GetCPUBuddyAllocator()->Alloc(size);
-}
-```
-
-and 
-
-```cpp
-template<>
-void Alloc<GPUPlace>(GPUPlace p, size_t size) {
-  return GetGPUBuddyAllocator(p.id)->Alloc(size);
-}
-```
-
-Similar specializations exist for `Free` and `Used`.
-
-### Implementation
-
-`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions.
-
-```cpp
-BuddyAllocator* GetCPUBuddyAllocator() {
-  static BuddyAllocator* a = NULL;
-  if (a == NULL) {
-    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
-  }
-  return a;
-}
-
-BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator* as = NULL;
-  if (as == NULL) {
-    as = new BuddyAllocator*[platform::NumGPUs()];
-    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
-      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
-    }
-  }
-  return as[gpu_id);
-```
-
-#### `BuddyAllocator`
-
-`BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes parameters only related with the algorithm:
-
-```cpp
-BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
-  ...
-}
-```
-
-Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object:
-
-```cpp
-class BuddyAllocator {
- private:
-  struct Block {
-    size_t size;
-    Block* left, right;
-    size_t index; // allocator id
-  };
-  ...
-};
-```
-
-Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`.  Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace.
-
-#### System Allocators
-
-The `GPUAllocator` and `CPUAllocator` are calls *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
-
-## Justification
-
-I got inspiration from Majel and Caffe2, though above design look different from both.
-
-### Caffe2
-
-In Caffe2, `Tensor<Context>::mutable_data()` allocates the memroy.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
-
-There are two implementations of `Context`:
-
-1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
-
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
-
-### Majel
-
-In Majel, there are basically two allocator types:
-
-1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
-1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
-
-However, memory allocation is not via these two allocators.  Instead, these two allocators are defined in hidden namespaces.
-
-In Majel there are hidden global variables like:
-
-1. `cpu::SystemAllocator g_cpu_allocator`, and
-1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
-
-Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
+Please check out the [design documentation](http://gangliao.me) to find out more details about
+buddy memory allocator for both CPU and GPU.
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 72d3749ad789eca9a4b10944131171c0cf8dfe5a..b9c3fc31c1523abf3acbd116745bbf1596454aac 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,7 +1,15 @@
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
+
+cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
+
+cc_library(meta_data SRCS meta_data.cc)
+
+cc_library(meta_cache SRCS meta_cache.cc)
+
+cc_library(memory_block SRCS memory_block.cc)
+
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc
index ebe680f5eea4948339fb8c5584a5b9f5d71c752e..bb44970109c05d239e6b92d90b2079b752fa0104 100644
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -12,22 +12,315 @@
    See the License for the specific language governing permissions and
    limitations under the License. */
 
-#pragma once
-
 #include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
-                               SystemAllocator* system_allocator)
-    : pool_size_(pool_size),
-      max_pools_(max_pools),
-      system_allocator_(system_allocator) {
-  PADDLE_ASSERT(pool_size > 0);
-  PADDLE_ASSERT(max_pools > 0);
-  PADDLE_ASSERT(system_allocator != nullptr);
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
+                               size_t min_chunk_size, size_t max_chunk_size)
+    : min_chunk_size_(min_chunk_size),
+      max_chunk_size_(max_chunk_size),
+      cache_(system_allocator->UseGpu()),
+      system_allocator_(std::move(system_allocator)) {}
+
+BuddyAllocator::~BuddyAllocator() {
+  VLOG(3) << "BuddyAllocator Disconstructor makes sure that all of these "
+             "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    VLOG(3) << "Free from block (" << block << ", " << max_chunk_size_ << ")";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
+}
+
+inline size_t align(size_t size, size_t alignment) {
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // adjust allocation alignment
+  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  VLOG(3) << "Allocate " << unaligned_size << " bytes from chunk size " << size;
+
+  // if the allocation is huge, send directly to the system allocator
+  if (size > max_chunk_size_) {
+    VLOG(3) << "Allocate from system allocator.";
+    return SystemAlloc(size);
+  }
+
+  // query and allocate from the existing chunk
+  auto it = FindExistChunk(size);
+
+  // refill the pool if failure
+  if (it == pool_.end()) {
+    it = RefillPool();
+    // if still failure, fail fatally
+    if (it == pool_.end()) {
+      return nullptr;
+    }
+  } else {
+    VLOG(3) << "Allocation from existing memory block " << std::get<2>(*it)
+            << " at address "
+            << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+
+  total_used_ += size;
+  total_free_ -= size;
+
+  // split the allocation and return data for use
+  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+
+void BuddyAllocator::Free(void* p) {
+  // Point back to metadata
+  auto block = static_cast<MemoryBlock*>(p)->metadata();
+
+  // Acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  VLOG(3) << "Free from address " << block;
+
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+    VLOG(3) << "Free directly from system allocator";
+    system_allocator_->Free(block, block->total_size(cache_),
+                            block->index(cache_));
+
+    // Invalidate GPU allocation from cache
+    cache_.invalidate(block);
+
+    return;
+  }
+
+  block->mark_as_free(cache_);
+
+  total_used_ -= block->total_size(cache_);
+  total_free_ += block->total_size(cache_);
+
+  // Trying to merge the right buddy
+  if (block->has_right_buddy(cache_)) {
+    VLOG(3) << "Merging this block " << block << " with its right buddy "
+            << block->right_buddy(cache_);
+
+    auto right_buddy = block->right_buddy(cache_);
+
+    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
+                                   right_buddy->total_size(cache_),
+                                   right_buddy));
+
+      // merge its right buddy to the block
+      block->merge(cache_, right_buddy);
+    }
+  }
+
+  // Trying to merge the left buddy
+  if (block->has_left_buddy(cache_)) {
+    VLOG(3) << "Merging this block " << block << " with its left buddy "
+            << block->left_buddy(cache_);
+
+    auto left_buddy = block->left_buddy(cache_);
+
+    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
+                                   left_buddy->total_size(cache_), left_buddy));
+
+      // merge the block to its left buddy
+      left_buddy->merge(cache_, block);
+      block = left_buddy;
+    }
+  }
+
+  // Dumping this block into pool
+  VLOG(3) << "Inserting free block (" << block << ", "
+          << block->total_size(cache_) << ")";
+  pool_.insert(
+      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
+
+  // Clean up if existing too much free memory
+
+  // Prefer freeing fallback allocation first
+  CleanIdleFallBackAlloc();
+
+  // Free normal allocation
+  CleanIdleNormalAlloc();
+}
+
+size_t BuddyAllocator::Used() { return total_used_; }
+
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, size);
+
+  VLOG(3) << "Allocated " << p << " from system allocator.";
+
+  if (p == nullptr) return nullptr;
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
+
+  return static_cast<MemoryBlock*>(p)->data();
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifndef PADDLE_ONLY_CPU
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
+  }
+#endif  // PADDLE_ONLY_CPU
+
+  // Allocate a new maximum sized block
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+
+  if (p == nullptr) return pool_.end();
+
+  VLOG(3) << "Creating and inserting new block " << p
+          << " from system allocator";
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+                                     max_chunk_size_, nullptr, nullptr);
+
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+
+  total_free_ += max_chunk_size_;
+
+  // dump the block into pool
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+
+  while (1) {
+    auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
+
+    // no match chunk memory
+    if (it == pool_.end()) return it;
+
+    if (std::get<0>(*it) > index) {
+      // find suitable one
+      if (std::get<1>(*it) >= size) {
+        return it;
+      }
+      // update and continue
+      index = std::get<0>(*it);
+      continue;
+    }
+    return it;
+  }
+}
+
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+  pool_.erase(it);
+
+  VLOG(3) << "Split block (" << block << ", " << block->total_size(cache_)
+          << ") into";
+  block->split(cache_, size);
+
+  VLOG(3) << "Left block (" << block << ", " << block->total_size(cache_)
+          << ")";
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+
+  // the rest of memory if exist
+  if (block->has_right_buddy(cache_)) {
+    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      VLOG(3) << "Insert right block (" << block->right_buddy(cache_) << ", "
+              << block->right_buddy(cache_)->total_size(cache_) << ")";
+
+      pool_.insert(
+          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
+                           block->right_buddy(cache_)->total_size(cache_),
+                           block->right_buddy(cache_)));
+    }
+  }
+
+  return block;
+}
+
+void BuddyAllocator::CleanIdleFallBackAlloc() {
+  // If fallback allocation does not exist, return directly
+  if (!fallback_alloc_count_) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+
+    VLOG(3) << "Return block " << block << " to fallback allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+    fallback_alloc_count_--;
+
+    // If no fall allocation exists, return directly
+    if (!fallback_alloc_count_) return;
+  }
+}
+
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+
+  if (!shall_free_alloc()) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    VLOG(3) << "Return block " << block << " to base allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+
+    if (!shall_free_alloc()) return;
+  }
 }
 
 }  // namespace detail
diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h
index 82e6aaedc719966b4074449ce1ef7193c73dc265..4fa3fb0ee5f826d2b084c0ba184c505aee3acc48 100644
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,9 +14,16 @@
 
 #pragma once
 
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 
 #include <mutex>
+#include <set>
+#include <unordered_map>
 #include <vector>
 
 namespace paddle {
@@ -25,61 +32,80 @@ namespace detail {
 
 class BuddyAllocator {
  public:
-  BuddyAllocator(size_t pool_size, size_t max_pools,
-                 SystemAllocator* system_allocator);
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
+                 size_t max_chunk_size);
+
   ~BuddyAllocator();
 
-  void* Alloc(size_t size);
+ public:
+  void* Alloc(size_t unaligned_size);
   void Free(void*);
   size_t Used();
 
+ public:
+  // Disable copy and assignment
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
  private:
-  struct Block {
-    size_t size_;
-    Block* left_;   // left buddy
-    Block* right_;  // right buddy
-  };
+  // Tuple (allocator index, memory size, memory address)
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  // Each element in PoolSet is a free allocation
+  using PoolSet = std::set<IndexSizeAddress>;
 
-  // Initially, there is only one pool.  If a Alloc founds not enough
-  // memory from that pool, and there has not been max_num_pools_,
-  // create a new pool by calling system_allocator_.Alloc(pool_size_).
-  std::vector<void*> pools_;
+  /*! \brief Allocate fixed-size memory from system */
+  void* SystemAlloc(size_t size);
 
-  size_t pool_size_;      // the size of each pool;
-  size_t max_num_pools_;  // the size of all pools;
+  /*! \brief If existing chunks are not suitable, refill pool */
+  PoolSet::iterator RefillPool();
 
-  SystemAllocator* system_allocator_;
+  /**
+   *  \brief   Find the suitable chunk from existing pool and split
+   *           it to left and right buddies
+   *
+   *  \param   it     the iterator of pool list
+   *  \param   size   the size of allocation
+   *
+   *  \return  the left buddy address
+   */
+  void* SplitToAlloc(PoolSet::iterator it, size_t size);
 
-  std::mutex mutex_;
+  /*! \brief Find the existing chunk which used to allocation */
+  PoolSet::iterator FindExistChunk(size_t size);
 
-  // Disable copy and assignment.
-  BuddyAllocator(const BuddyAllocator&) = delete;
-  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
-};
+  /*! \brief Clean idle fallback allocation */
+  void CleanIdleFallBackAlloc();
+
+  /*! \brief Clean idle normal allocation */
+  void CleanIdleNormalAlloc();
 
-BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
-  static BuddyAllocator<CPUAllocator>* a = nullptr;
-  if (a == nullptr) {
-    a = new BuddyAllocator<CPUAllocator>();
-  }
-  return a;
-}
-
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator<GPUAllocator>** as = NULL;
-  if (as == NULL) {
-    int gpu_num = platform::GetDeviceCount();
-    as = new BuddyAllocator<GPUAllocator>*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = new BuddyAllocator<GPUAllocator>();
-    }
-  }
-  return as[gpu_id];
-}
-
-#endif  // PADDLE_ONLY_CPU
+ private:
+  size_t total_used_ = 0;  // the total size of used memory
+  size_t total_free_ = 0;  // the total size of free memory
+
+  size_t min_chunk_size_;  // the minimum size of each chunk
+  size_t max_chunk_size_;  // the maximum size of each chunk
+
+ private:
+  /**
+   * \brief A list of free allocation
+   *
+   * \note  Only store free chunk memory in pool
+   */
+  PoolSet pool_;
+
+  /*! Record fallback allocation count for auto-scaling */
+  size_t fallback_alloc_count_ = 0;
+
+ private:
+  /*! Unify the metadata format between GPU and CPU allocations */
+  MetadataCache cache_;
+
+ private:
+  /*! Allocate CPU/GPU memory from system */
+  SystemAllocator* system_allocator_;
+  std::mutex mutex_;
+};
 
 }  // namespace detail
 }  // namespace memory
diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fc40993208323f1f5d18103165c8835b5f829613
--- /dev/null
+++ b/paddle/memory/detail/memory_block.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+                       void* left_buddy, void* right_buddy) {
+  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
+                             static_cast<MemoryBlock*>(left_buddy),
+                             static_cast<MemoryBlock*>(right_buddy)));
+}
+
+MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+  return cache.load(this).type;
+}
+
+size_t MemoryBlock::size(MetadataCache& cache) const {
+  return cache.load(this).size;
+}
+
+size_t MemoryBlock::total_size(MetadataCache& cache) const {
+  return cache.load(this).total_size;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+  return cache.load(this).left_buddy;
+}
+
+MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+  return cache.load(this).right_buddy;
+}
+
+void MemoryBlock::split(MetadataCache& cache, size_t size) {
+  // make sure the split fits
+  PADDLE_ASSERT(total_size(cache) >= size);
+
+  // bail out if there is no room for another partition
+  if (total_size(cache) - size <= sizeof(Metadata)) {
+    return;
+  }
+
+  // find the position of the split
+  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
+
+  size_t remaining_size = total_size(cache) - size;
+
+  // Add the new block as a buddy
+  auto metadata = cache.load(this);
+
+  // Write the metadata for the new block
+  auto new_block_right_buddy = metadata.right_buddy;
+
+  cache.store(
+      static_cast<MemoryBlock*>(right_partition),
+      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
+               remaining_size, this, new_block_right_buddy));
+
+  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
+  metadata.size = size - sizeof(Metadata);
+  metadata.total_size = size;
+
+  cache.store(this, metadata);
+
+  // Write metadata for the new block's right buddy
+  if (new_block_right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(new_block_right_buddy);
+
+    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
+
+    cache.store(new_block_right_buddy, buddy_metadata);
+  }
+}
+
+void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+  // only free blocks can be merged
+  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
+
+  auto metadata = cache.load(this);
+
+  // link this->buddy's buddy
+  metadata.right_buddy = right_buddy->right_buddy(cache);
+
+  // link buddy's buddy -> this
+  if (metadata.right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(metadata.right_buddy);
+
+    buddy_metadata.left_buddy = this;
+
+    cache.store(metadata.right_buddy, buddy_metadata);
+  }
+
+  metadata.size += right_buddy->total_size(cache);
+  metadata.total_size += right_buddy->total_size(cache);
+
+  cache.store(this, metadata);
+  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
+}
+
+void MemoryBlock::mark_as_free(MetadataCache& cache) {
+  // check for double free or corruption
+  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
+
+  set_type(cache, FREE_CHUNK);
+}
+
+void MemoryBlock::set_type(MetadataCache& cache, Type t) {
+  auto metadata = cache.load(this);
+
+  metadata.type = t;
+
+  cache.store(this, metadata);
+}
+
+bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+size_t MemoryBlock::index(MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+void* MemoryBlock::data() const {
+  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
+}
+
+MemoryBlock* MemoryBlock::metadata() const {
+  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
+      reinterpret_cast<const Metadata*>(this) - 1));
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h
new file mode 100644
index 0000000000000000000000000000000000000000..a5168b519f3a3747f34ef2ea7b87d72dce70064d
--- /dev/null
+++ b/paddle/memory/detail/memory_block.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// Forward Declarations
+class MetadataCache;
+
+/*! \brief A class used to interpret the contents of a memory block */
+class MemoryBlock {
+ public:
+  enum Type {
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
+  };
+
+ public:
+  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+            void* left_buddy, void* right_buddy);
+
+ public:
+  /*! \brief The type of the allocation */
+  Type type(MetadataCache& cache) const;
+
+  /*! \brief The size of the data region */
+  size_t size(MetadataCache& cache) const;
+
+  /*! \brief An index to track the allocator */
+  size_t index(MetadataCache& cache) const;
+
+  /*! \brief The total size of the block */
+  size_t total_size(MetadataCache& cache) const;
+
+  /*! \brief Check the left buddy of the block */
+  bool has_left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Check the right buddy of the block */
+  bool has_right_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the left buddy */
+  MemoryBlock* left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the right buddy */
+  MemoryBlock* right_buddy(MetadataCache& cache) const;
+
+ public:
+  /*! \brief Split the allocation into left/right blocks */
+  void split(MetadataCache& cache, size_t size);
+
+  /*! \brief Merge left and right blocks together */
+  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
+
+  /*! \brief Mark the allocation as free */
+  void mark_as_free(MetadataCache& cache);
+
+  /*! \brief Change the type of the allocation */
+  void set_type(MetadataCache& cache, Type t);
+
+ public:
+  /*! \brief Get a pointer to the memory block's data */
+  void* data() const;
+
+  /*! \brief Get a pointer to the memory block's metadata */
+  MemoryBlock* metadata() const;
+
+ public:
+  static size_t overhead();
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc
new file mode 100644
index 0000000000000000000000000000000000000000..30ff80e7bac0b595fe60aeab0a3c59f4e23eae2d
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.cc
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
+
+Metadata MetadataCache::load(const MemoryBlock* block) {
+  if (uses_gpu_) {
+    auto existing_metadata = cache_.find(block);
+    PADDLE_ASSERT(existing_metadata->second.check_guards());
+    return existing_metadata->second;
+  } else {
+    PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
+    return *reinterpret_cast<const Metadata*>(block);
+  }
+}
+
+void MetadataCache::store(MemoryBlock* block,
+                          const Metadata& original_metadata) {
+  auto metadata = original_metadata;
+
+  metadata.update_guards();
+
+  if (uses_gpu_) {
+    cache_[block] = metadata;
+  } else {
+    *reinterpret_cast<Metadata*>(block) = metadata;
+  }
+}
+
+void MetadataCache::invalidate(MemoryBlock* block) {
+  if (uses_gpu_) {
+    cache_.erase(block);
+  }
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca0789779e273fb71c3d6282c0a921cda2d776cc
--- /dev/null
+++ b/paddle/memory/detail/meta_cache.h
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include <unordered_map>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+/**
+ *  \brief A cache for accessing memory block meta-data that may be expensive
+ *         to access directly.
+ *
+ *  \note  This class exists to unify the metadata format between GPU and CPU
+ *         allocations. It should be removed when the CPU can access all GPU
+ *         allocations directly via UVM.
+ */
+class MetadataCache {
+ public:
+  MetadataCache(bool uses_gpu);
+
+ public:
+  /*! \brief Load the associated metadata for the specified memory block. */
+  Metadata load(const MemoryBlock*);
+
+  /*! \brief Store the associated metadata for the specified memory block. */
+  void store(MemoryBlock*, const Metadata&);
+
+  /*! \brief Indicate that the specified metadata will no longer be used. */
+  void invalidate(MemoryBlock*);
+
+ public:
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+
+ private:
+  bool uses_gpu_;
+
+ private:
+  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
+
+ private:
+  MetadataMap cache_;
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc
new file mode 100644
index 0000000000000000000000000000000000000000..70c5c1f439e84ec33cf0507beae33f9cdfa51727
--- /dev/null
+++ b/paddle/memory/detail/meta_data.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/meta_data.h"
+
+#include <functional>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                   MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+
+Metadata::Metadata()
+    : type(MemoryBlock::INVALID_CHUNK),
+      index(0),
+      size(0),
+      total_size(0),
+      left_buddy(nullptr),
+      right_buddy(nullptr) {}
+
+template <class T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+
+  hash_combine(seed, (size_t)metadata->type);
+  hash_combine(seed, metadata->index);
+  hash_combine(seed, metadata->size);
+  hash_combine(seed, metadata->total_size);
+  hash_combine(seed, metadata->left_buddy);
+  hash_combine(seed, metadata->right_buddy);
+
+  return seed;
+}
+
+void Metadata::update_guards() {
+  guard_begin = hash(this, 1);
+  guard_end = hash(this, 2);
+}
+
+bool Metadata::check_guards() const {
+  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h
new file mode 100644
index 0000000000000000000000000000000000000000..628cf1f2e347e288d1bf34c14c7b2f13a28d3662
--- /dev/null
+++ b/paddle/memory/detail/meta_data.h
@@ -0,0 +1,54 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+
+#include <stddef.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class Metadata {
+ public:
+  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+           MemoryBlock* r);
+  Metadata();
+
+ public:
+  /*! \brief Update the guards when metadata is changed */
+  void update_guards();
+
+  /*! \brief Check consistency to previous modification */
+  bool check_guards() const;
+
+ public:
+  // TODO(gangliao): compress this
+  // clang-format off
+  size_t            guard_begin = 0;
+  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
+  size_t            index       = 0;
+  size_t            size        = 0;
+  size_t            total_size  = 0;
+  MemoryBlock*      left_buddy  = nullptr;
+  MemoryBlock*      right_buddy = nullptr;
+  size_t            guard_end   = 0;
+  // clang-format on
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc
index 50bec926f83dee8a4343d0b16aeb088f9d2a4871..f61e67a32906083881dd7f47433521876be9b355 100644
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -13,76 +13,127 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/gpu_info.h"
 
 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock
 
 #include "gflags/gflags.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"
 
 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
-void* CPUAllocator::Alloc(size_t size) {
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
   // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
   // malloc might not return nullptr if size is zero, but the returned
   // pointer shall not be dereferenced -- so we make it nullptr.
   if (size <= 0) return nullptr;
 
+  index = 0;  // unlock memory
+
   void* p = malloc(size);
-  if (p != nullptr && FLAGS_use_pinned_memory) {
-    mlock(p, size);
+
+  if (p != nullptr) {
+    if (FLAGS_use_pinned_memory) {
+      index = 1;
+      mlock(p, size);  // lock memory
+    }
   }
+
   return p;
 }
 
-void CPUAllocator::Free(void* p, size_t size) {
-  if (p != nullptr && FLAGS_use_pinned_memory) {
+void CPUAllocator::Free(void* p, size_t size, size_t index) {
+  if (p != nullptr && index == 1) {
     munlock(p, size);
   }
   free(p);
 }
 
+bool CPUAllocator::UseGpu() const { return false; }
+
 #ifndef PADDLE_ONLY_CPU
 
-void* GPUAllocator::Alloc(size_t size) {
+void* GPUAllocator::Alloc(size_t& index, size_t size) {
   // CUDA documentation doesn't explain if cudaMalloc returns nullptr
   // if size is 0.  We just make sure it does.
-  if (size <= 0) {
-    return nullptr;
-  }
+  if (size <= 0) return nullptr;
 
+  size_t available = 0;
+  size_t capacity = 0;
+  paddle::platform::GpuMemoryUsage(available, capacity);
+
+  // Reserve memory for page tables, etc.
+  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+  size_t usable = available > reserving ? available - reserving : 0;
+
+  // If remaining size no less than expected size, using general
+  // cudaMalloc to allocate GPU memory.
   void* p = 0;
-  cudaError_t result =
-      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-  if (result != cudaSuccess) {
-    cudaGetLastError();  // clear error if there is any.
+  if (size <= usable) {
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      index = 0;
+      gpu_alloc_size_ += size;
+      return p;
+    }
+  }
+
+  // If remaining size less than expected size or cudaMalloc failed,
+  // cudaMallocHost will be considered as a fallback allocator.
+  //
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host fallback allocation. Allocates too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
   }
-  return result == cudaSuccess ? p : nullptr;
+
+  return nullptr;
 }
 
-void GPUAllocator::Free(void* p, size_t size) {
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
+
   // Purposefully allow cudaErrorCudartUnloading, because
   // that is returned if you ever call cudaFree after the
   // driver has already shutdown. This happens only if the
   // process is terminating, in which case we don't care if
   // cudaFree succeeds.
-  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
   if (err != cudaErrorCudartUnloading) {
-    platform::throw_on_error(err, "cudaFree{Host} failed");
+    PADDLE_ENFORCE(err, "cudaFree{Host} failed in GPUAllocator::Free.");
   }
 }
 
+bool GPUAllocator::UseGpu() const { return true; }
+
 #endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
index 184b383f7f78244fa6632a3bffb1a0a78b3aa664..82ba322e057575c460b1d51d719c9b0fa459273e 100644
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -20,31 +20,36 @@ namespace paddle {
 namespace memory {
 namespace detail {
 
-// SystemAllocator is the parent class of CPUAllocator and
-// GPUAllocator.  A BuddyAllocator object uses a SystemAllocator*
-// pointing to the underlying system allocator.  An alternative to
-// this class hierarchy is to pass a system allocator class to
-// BuddyAllocator as a template parameter.  This approach makes
-// BuddyAllocator a class template, and it's very complicated
-// algorithm would make the buddy_allocator.h messy.
+/**
+ * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
+ *        A BuddyAllocator object uses a SystemAllocator* pointing to the
+ *        underlying system allocator.
+ */
 class SystemAllocator {
  public:
   virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t size) = 0;
-  virtual void Free(void* p, size_t size) = 0;
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() const = 0;
 };
 
 class CPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
 };
 
 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
  public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU
 
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
index 9bd5706a4e4d1546a8c879ebbac0f3349c9d59f6..ba44e06ddb68e92e4086a8006b868557b0c89b50 100644
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
 void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
   bool freed = false;
   {
-    void* p = a.Alloc(size);
+    size_t index;
+    void* p = a.Alloc(index, size);
     if (size > 0) {
       EXPECT_NE(p, nullptr);
     } else {
@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
     int* i = static_cast<int*>(p);
     std::shared_ptr<int> ptr(i, [&](void* p) {
       freed = true;
-      a.Free(p, size);
+      a.Free(p, size, index);
     });
   }
   EXPECT_TRUE(freed);
@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
 }
 
 #ifndef PADDLE_ONLY_CPU
-TEST(GPUAllocator, NoStaging) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
-}
-TEST(GPUAllocator, Staging) {
-  FLAGS_use_pinned_memory = true;
+TEST(GPUAllocator, Alloc) {
   paddle::memory::detail::GPUAllocator a;
   TestAllocator(a, 2048);
   TestAllocator(a, 0);
diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc
new file mode 100644
index 0000000000000000000000000000000000000000..aaab1142ca18d3319469a4d685fde9d30929113f
--- /dev/null
+++ b/paddle/memory/memcpy.cc
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memcpy.h"
+
+#include <cstring>  // for memcpy
+
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace memory {
+
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::SetDeviceId(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::SetDeviceId(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::SetDeviceId(src_place.device);
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..2b9c0eada6e8406fc81baec7f331a8dd5b8b0ec1
--- /dev/null
+++ b/paddle/memory/memcpy.h
@@ -0,0 +1,59 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace memory {
+
+/**
+ * \brief   Copy memory from one place to another place.
+ *
+ * \param[in]  DstPlace Destination allocation place (CPU).
+ * \param[in]  dst      Destination memory address.
+ * \param[in]  SrcPlace Source allocation place (CPU).
+ * \param[in]  src      Source memory address.
+ * \param[in]  num      memory size in bytes to copy.
+ *
+ */
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+
+/**
+ * \brief   Copy memory from one place to another place.
+ *
+ * \param[in]  DstPlace Destination allocation place (CPU or GPU).
+ * \param[in]  dst      Destination memory address.
+ * \param[in]  SrcPlace Source allocation place (CPU or GPU).
+ * \param[in]  src      Source memory address.
+ * \param[in]  num      memory size in bytes to copy.
+ * \param[in]  stream   CUDA stream.
+ *
+ * \note    For GPU memory copy, CUDA stream need to be specified
+ *          for asynchronously memory copy.
+ *
+ */
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace memory
+}  // namespace paddle
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 0d123d99e234a378ee64850eebacece223e2b121..207025f9b1c64f0f8943f9fae5edefc9328a1d26 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -15,45 +15,71 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
 
-#include <boost/variant.hpp>
+#include <cstring>  // for memcpy
 
 namespace paddle {
 namespace memory {
 
-void* Alloc(platform::Place pl, size_t size) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size);
+detail::BuddyAllocator* GetCPUBuddyAllocator() {
+  static detail::BuddyAllocator* a = nullptr;
+  if (a == nullptr) {
+    a = new detail::BuddyAllocator(new detail::CPUAllocator,
+                                   platform::CpuMinChunkSize(),
+                                   platform::CpuMaxChunkSize());
   }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return detail::GetCPUBuddyAllocator()->Alloc(size);
+  return a;
 }
 
-void Free(paddle::platform::Place pl, void* p) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    detail::GetGPUBuddyAllocator(gpu_id)->Free(p);
-  }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  detail::GetCPUBuddyAllocator()->Free(p);
+template <>
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+template <>
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+  return GetCPUBuddyAllocator()->Used();
 }
 
-size_t Used(paddle::platform::Place pl) {
 #ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return detail::GetGPUBuddyAllocator(gpu_id)->Used();
+
+detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static detail::BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetDeviceCount();
+    as = new detail::BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      platform::SetDeviceId(gpu);
+      as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator,
+                                           platform::GpuMinChunkSize(),
+                                           platform::GpuMaxChunkSize());
+    }
   }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return detail::GetCPUBuddyAllocator()->Used();
+  platform::SetDeviceId(gpu_id);
+  return as[gpu_id];
+}
+
+template <>
+void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+  return GetGPUBuddyAllocator(place.device)->Alloc(size);
+}
+
+template <>
+void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
 }
 
+template <>
+size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
+}
+
+#endif  // PADDLE_ONLY_CPU
+
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index a33092bade65e6df0faee226a8967c9fc9caa032..44f567caf9c19775f17988b5142b7693b41a126d 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,14 +14,66 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"
 
 namespace paddle {
 namespace memory {
 
-void* Alloc(paddle::platform::Place, size_t);
-void Free(paddle::platform::Place, void*);
-size_t Used(paddle::platform::Place);
+/**
+ * \brief   Allocate memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  size   Allocation size.
+ *
+ * \return  Allocated memory block address.
+ *
+ * \note    If return nullptr, it indicates memory allocation failed
+ *          because insufficient memory in current system. When Alloc
+ *          function is invoked, you must check the returned memory
+ *          address is valid or not.
+ */
+template <typename Place>
+void* Alloc(Place place, size_t size);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ * \param[in]  ptr    Memory block address to free.
+ *
+ */
+template <typename Place>
+void Free(Place place, void* ptr);
+
+/**
+ * \brief   Total size of used memory in one place.
+ *
+ * \param[in]  place  Allocation place (CPU or GPU).
+ *
+ */
+template <typename Place>
+size_t Used(Place place);
+
+/**
+ * \brief   Free memory block in one place.
+ *
+ * \note    In some cases, custom deleter is used to
+ *          deallocate the memory automatically for
+ *          std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
+class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+
+ public:
+  PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
 
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53cc63a098d0802479e3a371717adb7596c249ed
--- /dev/null
+++ b/paddle/memory/memory_test.cc
@@ -0,0 +1,138 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+inline bool is_aligned(void const *p) {
+  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
+}
+
+size_t align(size_t size, paddle::platform::CPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CPUPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CPUMultAlloc) {
+  paddle::platform::CPUPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
+
+#ifndef PADDLE_ONLY_CPU
+
+size_t align(size_t size, paddle::platform::GPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::GpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, GPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::GPUPlace gpu(0);
+  p = paddle::memory::Alloc(gpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::memory::Free(gpu, p);
+}
+
+TEST(BuddyAllocator, GPUMultAlloc) {
+  paddle::platform::GPUPlace gpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(gpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(gpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(size, gpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(gpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, gpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+}
+
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6465deeec93100f0238ac850b92f7f7c5a60b795
--- /dev/null
+++ b/paddle/operators/CMakeLists.txt
@@ -0,0 +1,64 @@
+function(op_library TARGET)
+    # op_library is a function to create op library. The interface is same as
+    # cc_library. But it handle split GPU/CPU code and link some common library
+    # for ops.
+    set(cc_srcs)
+    set(cu_srcs)
+    set(op_common_deps operator op_registry)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+
+    foreach(src ${op_library_SRCS})
+        if (${src} MATCHES ".*\\.cu$")
+            list(APPEND cu_srcs ${src})
+        elseif(${src} MATCHES ".*\\.cc$")
+            list(APPEND cc_srcs ${src})
+        else()
+            message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+        endif()
+    endforeach()
+
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (${cc_srcs_len} EQUAL 0)
+        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
+    endif()
+
+    list(LENGTH cu_srcs cu_srcs_len)
+    list(LENGTH op_library_DEPS dep_len)
+    if (${cu_srcs_len} EQUAL 0 AND ${dep_len} EQUAL 0)
+        message(WARNING "The op library ${TARGET} not support GPU!")
+    endif()
+
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    else()
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    endif()
+endfunction()
+
+op_library(add_op SRCS add_op.cc add_op.cu)
+cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
+
+op_library(mean_op SRCS mean_op.cc mean_op.cu)
+cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op)
+
+op_library(mul_op SRCS mul_op.cc mul_op.cu)
+op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
+
+op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
+op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
+op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
+op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
+
+op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
+
+op_library(fc_op
+    SRCS fc_op.cc
+    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
+op_library(recurrent_op SRCS recurrent_op.cc DEPS op_desc tensor op_registry operator net)
+cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..85269a5f7445a1745d9be68417789e33eb725d5c
--- /dev/null
+++ b/paddle/operators/add_op.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/add_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AddOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of AddOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
+                   "Inputs of AddOp must all be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Outputs of AddOp must all be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims() == ctx.Input<Tensor>(1)->dims(),
+                   "Two input of Add Op's dimension must be same.");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+  }
+};
+
+class AddOpMaker : public OpProtoAndCheckerMaker {
+public:
+  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of add op");
+    AddInput("Y", "The second input of add op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Two Element Add Operator.
+
+The equation is: Out = X + Y
+)DOC");
+  }
+};
+
+class AddOpGrad : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker);
+REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad);
+REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..f961b37565f400b5c26844b9e7a3cff5e682340b
--- /dev/null
+++ b/paddle/operators/add_op.cu
@@ -0,0 +1,5 @@
+#define EIGEN_USE_GPU
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/add_op.h"
+
+REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..54d2231425293f6cfb3adc9cb34d903a75fcdcd0
--- /dev/null
+++ b/paddle/operators/add_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class AddKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto input0 = context.Input<Tensor>(0);
+    auto input1 = context.Input<Tensor>(1);
+    auto output = context.Output<Tensor>(0);
+
+    output->mutable_data<T>(context.GetPlace());
+
+    auto X = EigenVector<T>::Flatten(*input0);
+    auto Y = EigenVector<T>::Flatten(*input1);
+    auto Z = EigenVector<T>::Flatten(*output);
+
+    auto place = context.GetEigenDevice<Place>();
+
+    Z.device(place) = X + Y;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3d52f5498323dbb7ca0ff25d038947f0ddb2017e
--- /dev/null
+++ b/paddle/operators/add_op_test.cc
@@ -0,0 +1,28 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#define private public
+#include <paddle/framework/op_registry.h>
+USE_OP(add_two);
+// USE_OP(add_two_grad);
+
+TEST(AddOp, GetOpProto) {
+  auto& protos = paddle::framework::OpRegistry::protos();
+  auto it = protos.find("add_two");
+  ASSERT_NE(it, protos.end());
+  auto& op_creators = paddle::framework::OpRegistry::op_creators();
+  auto it1 = op_creators.find("add_two_grad");
+  ASSERT_NE(it1, op_creators.end());
+}
diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4f5b935fde4d5b0d9efae66554cf890291e26941
--- /dev/null
+++ b/paddle/operators/cross_entropy_op.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cross_entropy_op.h"
+
+namespace paddle {
+namespace operators {
+
+class OnehotCrossEntropyOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2,
+                   "Input size of OnehotCrossEntropyOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1,
+                   "Output size of OnehotCrossEntropyOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.InputVar(1) != nullptr,
+                   "Inputs of OnehotCrossEntropyOp must all be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Outputs of OnehotCrossEntropyOp must all be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
+                   "X's dimension must be 2.");
+    PADDLE_ENFORCE(ctx.Output<Tensor>(0)->dims().size() == 1,
+                   "label's dimension must be 1.");
+    ctx.Output<Tensor>(0)->Resize({ctx.Input<Tensor>(0)->dims()[0]});
+  }
+};
+
+class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
+public:
+  OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of OnehotCrossEntropyOp");
+    AddInput("label", "The second input of OnehotCrossEntropyOp");
+    AddOutput("Y", "The output of OnehotCrossEntropyOp");
+    AddComment(R"DOC(
+OnehotCrossEntropy Operator.
+
+                Y[i] = -log(X[i][j])
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(onehot_cross_entropy,
+            ops::OnehotCrossEntropyOp,
+            ops::OnehotCrossEntropyOpMaker);
+REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..926a0c616b957d8e542c1f3dee227a718fb29f07
--- /dev/null
+++ b/paddle/operators/cross_entropy_op.cu
@@ -0,0 +1,5 @@
+#define EIGEN_USE_GPU
+#include "paddle/operators/cross_entropy_op.h"
+
+REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
+                       ops::OnehotCrossEntropyOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c3a3728149950a5c7f2195122e8e0ff728492bdb
--- /dev/null
+++ b/paddle/operators/cross_entropy_op.h
@@ -0,0 +1,48 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class OnehotCrossEntropyOpKernel : public OpKernel {
+public:
+  constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
+
+  void Compute(const ExecutionContext& ctx) const override {
+    auto X = ctx.Input<Tensor>(0);
+    const T* X_data = X->data<T>();
+    const int* label_data = ctx.Input<Tensor>(1)->data<int>();
+    auto Y = ctx.Output<Tensor>(0);
+
+    Y->mutable_data<T>(ctx.GetPlace());
+
+    T* Y_data = Y->data<T>();
+
+    int batch_size = X->dims()[0];
+    int class_num = X->dims()[1];
+
+    // Y[i] = -log(X[i][j])
+    for (int i = 0; i < batch_size; ++i) {
+      Y_data[i] = -std::log(
+          std::max(X_data[i * class_num + label_data[i]], LOG_THRESHOLD()));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..71ceda958770796693265c08cb1fcae27e79bcd9
--- /dev/null
+++ b/paddle/operators/fc_op.cc
@@ -0,0 +1,71 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+class FullyConnectedOp : public NetOp {
+public:
+  void Init() override {
+    AddOp(OpRegistry::CreateOp("mul",
+                               {
+                                   Input("X"), Input("W"),
+                               },
+                               {Output("before_act")},
+                               {}));
+    auto b = Input("b");
+    if (b != EMPTY_VAR_NAME()) {
+      AddOp(OpRegistry::CreateOp("rowwise_add",
+                                 {Output("before_act"), Input("b")},
+                                 {Output("before_act")},
+                                 {}));
+    }
+
+    auto activation = GetAttr<std::string>("activation");
+    AddOp(OpRegistry::CreateOp(
+        activation, {Output("before_act")}, {Output("Y")}, {}));
+    CompleteAddOp(false);
+  }
+};
+
+class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
+public:
+  FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "the input of fc operator");
+    AddInput("W", "the weight of fc operator");
+    AddInput("b", "the bias of fc operator");
+
+    AddOutput("Y", "the output of fc operator");
+    AddOutput("before_act", "the before activation output of fc operator")
+        .SetTemporary();
+    AddAttr<std::string>("activation", "The activation key for fc layer")
+        .SetDefault("sigmoid")
+        .InEnum({"sigmoid", "softmax"});
+
+    //! TODO(yuyang18): Complete comment;
+    AddComment("FullyConnected Operator");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+USE_OP(mul);
+USE_OP(rowwise_add);
+USE_OP(sigmoid);
+USE_OP(softmax);
+
+REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker);
diff --git a/paddle/operators/fill_zeros_like_op.cc b/paddle/operators/fill_zeros_like_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..79a0e3d7e911b728a7a96ceff573976ba2b2e37f
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/fill_zeros_like_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+class FillZerosLikeOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1UL,
+                   "Input size of FillZerosLikeOp must be one.");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1UL,
+                   "Output size of AddOp must be one.");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr,
+                   "Input of FillZerosLikeOp must be set.");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr,
+                   "Output of FillZerosLikeOp must be set.");
+    ctx.Output<framework::Tensor>(0)->Resize(
+        ctx.Input<framework::Tensor>(0)->dims());
+  }
+};
+
+class FillZerosLikeOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  FillZerosLikeOpMaker(framework::OpProto *proto,
+                       framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Src", "The input of fill-zeros-like op.");
+    AddOutput("Dst", "The varibale will be filled up with zeros.");
+    AddComment(R"DOC(
+Fill up a vriable with zeros.
+
+The output will have the same size with input.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(fill_zeros_like,
+            paddle::operators::FillZerosLikeOp,
+            paddle::operators::FillZerosLikeOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    fill_zeros_like,
+    paddle::operators::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/fill_zeros_like_op.cu b/paddle/operators/fill_zeros_like_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..55ad58f4f17cd4a3e737c01b001675d2690d273e
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -0,0 +1,6 @@
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/fill_zeros_like_op.h"
+
+REGISTER_OP_GPU_KERNEL(
+    fill_zeros_like,
+    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/fill_zeros_like_op.h b/paddle/operators/fill_zeros_like_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..05272964abd43bdc2bd5c3cae8b128099e1c888c
--- /dev/null
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class FillZerosLikeKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* output = context.Output<framework::Tensor>(0);
+    output->mutable_data<T>(context.GetPlace());
+    framework::EigenVector<T>::Flatten(*output).setZero();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mean_op.cc b/paddle/operators/mean_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..78131b26808b183ee107313374493ae870f1b641
--- /dev/null
+++ b/paddle/operators/mean_op.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/mean_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MeanOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Input size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr && ctx.OutputVar(0) != nullptr,
+                   "Input/Output of MeanOp must be initialized.");
+    ctx.Output<Tensor>(0)->Resize(framework::make_ddim({1}));
+  }
+};
+
+class MeanOpMaker : public OpProtoAndCheckerMaker {
+public:
+  MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The input of mean op");
+    AddOutput("Out", "The output of mean op").IgnoreGradient();
+    AddComment("Mean Operator");
+  }
+};
+
+class MeanGradOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    ctx.Output<Tensor>("X" + GRAD_VAR_SUFFIX())
+        ->Resize(ctx.Input<Tensor>("X")->dims());
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
+REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
+REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp);
+REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/mean_op.cu b/paddle/operators/mean_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..e15de2fd0dd84e4015ee0e3b5343d7651b027a88
--- /dev/null
+++ b/paddle/operators/mean_op.cu
@@ -0,0 +1,6 @@
+#define EIGEN_USE_GPU
+
+#include "paddle/operators/mean_op.h"
+
+REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/mean_op.h b/paddle/operators/mean_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..a89cb422f9b296dba6eb5358043f73d00aefc5d3
--- /dev/null
+++ b/paddle/operators/mean_op.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MeanKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
+
+    output->mutable_data<T>(context.GetPlace());
+
+    auto X = EigenVector<T>::Flatten(*input);
+    auto y = EigenScalar<T>::From(*output);
+    auto place = context.GetEigenDevice<Place>();
+
+    y.device(place) = X.mean();
+  }
+};
+
+template <typename Place, typename T>
+class MeanGradKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto OG = context.Input<Tensor>("Out" + OperatorBase::GRAD_VAR_SUFFIX());
+    PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
+                   "Mean Gradient should be scalar");
+    auto IG = context.Output<Tensor>("X" + OperatorBase::GRAD_VAR_SUFFIX());
+    IG->mutable_data<T>(context.GetPlace());
+
+    T ig_size = (T)framework::product(IG->dims());
+
+    EigenVector<T>::Flatten(*IG).device(*(context.GetEigenDevice<Place>())) =
+        EigenScalar<T>::From(*OG) / ig_size;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/mean_op_test.cc b/paddle/operators/mean_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..375dcd50e130355c60f82b9d39d1b94fb2c911b0
--- /dev/null
+++ b/paddle/operators/mean_op_test.cc
@@ -0,0 +1,25 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+
+#include <paddle/framework/op_registry.h>
+
+USE_OP(mean);
+
+TEST(MeanOp, GetOpProto) {
+  auto& protos = paddle::framework::OpRegistry::protos();
+  auto it = protos.find("mean");
+  ASSERT_NE(it, protos.end());
+}
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d127f3a302a340fe7558f918d6eeb2ea0a3fafe7
--- /dev/null
+++ b/paddle/operators/mul_op.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/mul_op.h"
+
+namespace paddle {
+namespace operators {
+
+class MulOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
+    auto dim0 = ctx.Input<Tensor>(0)->dims();
+    auto dim1 = ctx.Input<Tensor>(1)->dims();
+    PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
+                   "The input of mul op must be matrix");
+    PADDLE_ENFORCE(
+        dim0[1] == dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The mul op must take one output");
+    ctx.Output<Tensor>(0)->Resize({dim0[0], dim1[1]});
+  }
+};
+
+class MulOpMaker : public OpProtoAndCheckerMaker {
+public:
+  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+
+class MulOpGrad : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {}
+  std::string DebugString() const override {
+    LOG(INFO) << "MulGrad";
+    return "";
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker);
+REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad);
+
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dc9236701627dc9335b844d2a82e18eb1f7dfd42
--- /dev/null
+++ b/paddle/operators/mul_op.cu
@@ -0,0 +1,18 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/mul_op.h"
+
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7b78ad39045d25d73bfc2c930063c255a514864
--- /dev/null
+++ b/paddle/operators/mul_op.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class MulKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
+        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
+
+    auto input0 = context.Input<Tensor>("X");
+    auto input1 = context.Input<Tensor>("Y");
+    auto output = context.Output<Tensor>(0);
+
+    output->mutable_data<T>(context.GetPlace());
+
+    auto X = EigenMatrix<T>::From(*input0);
+    auto Y = EigenMatrix<T>::From(*input1);
+    auto Z = EigenMatrix<T>::From(*output);
+    auto place = context.GetEigenDevice<Place>();
+
+    Z.device(place) = X.contract(Y, dim_pair);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e5b76e3724b5b0287071c90d26235b8e1a1d80cf
--- /dev/null
+++ b/paddle/operators/recurrent_op.cc
@@ -0,0 +1,388 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/recurrent_op.h"
+
+#include <glog/logging.h>
+#include <cstring>
+#include <sstream>
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+namespace rnn {
+
+void SegmentInputs(const std::vector<Scope*>& step_scopes,
+                   const std::vector<Link>& inlinks,
+                   const size_t seq_len,
+                   bool infer_shape_mode) {
+  PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided.");
+  for (size_t i = 0; i < inlinks.size(); ++i) {
+    auto input_var = step_scopes[0]->FindVar(inlinks[i].external);
+    PADDLE_ENFORCE(input_var != nullptr,
+                   "input link [%s] is not in scope.",
+                   inlinks[i].external);
+    Tensor* input = input_var->GetMutable<Tensor>();
+    DDim dims = input->dims();
+    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
+                   "all the inlinks must have same length");
+    DDim step_dims = slice_ddim(dims, 1, dims.size());
+    for (size_t j = 0; j < seq_len; j++) {
+      Tensor* step_input =
+          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
+      if (!infer_shape_mode) {
+        *step_input = input->Slice<float>(j, j + 1);
+      }
+      step_input->Resize(step_dims);
+    }
+  }
+}
+
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
+                   const std::vector<Link>& outlinks,
+                   const size_t seq_len,
+                   bool infer_shape_mode) {
+  for (size_t i = 0; i < outlinks.size(); i++) {
+    auto output_var = step_scopes[0]->FindVar(outlinks[i].external);
+    PADDLE_ENFORCE(output_var != nullptr,
+                   "output link [%s] is not in scope.",
+                   outlinks[i].external);
+    Tensor* output = output_var->GetMutable<Tensor>();
+    if (infer_shape_mode) {
+      DDim step_dims = step_scopes[0]
+                           ->FindVar(outlinks[i].internal)
+                           ->GetMutable<Tensor>()
+                           ->dims();
+      std::vector<int> dims_vec = vectorize(step_dims);
+      dims_vec.insert(dims_vec.begin(), seq_len);
+      output->Resize(make_ddim(dims_vec));
+    } else {
+      output->mutable_data<float>(platform::CPUPlace());
+      for (size_t j = 0; j < seq_len; j++) {
+        Tensor* step_output =
+            step_scopes[j]->FindVar(outlinks[i].internal)->GetMutable<Tensor>();
+        // TODO(luotao02) data type and platform::DeviceContext() should set
+        // correctly
+        (output->Slice<float>(j, j + 1))
+            .CopyFrom<float>(*step_output, platform::CPUPlace());
+      }
+    }
+  }
+}
+
+void LinkMemories(const std::vector<Scope*>& scopes,
+                  const std::vector<rnn::MemoryAttr>& memories,
+                  const size_t step_id,
+                  const int offset,
+                  bool infer_shape_mode) {
+  PADDLE_ENFORCE(step_id < scopes.size(),
+                 "step [%d] is out of range of step scopes' size [%d]",
+                 step_id,
+                 scopes.size());
+  PADDLE_ENFORCE(static_cast<int>(step_id) + offset >= 0,
+                 "offset [%d] must be large than -[%d]",
+                 offset,
+                 step_id);
+  PADDLE_ENFORCE(step_id + offset < scopes.size(),
+                 "offset [%d] is out of range, it must be less than (%d - %d)",
+                 offset,
+                 scopes.size(),
+                 step_id);
+  auto scope = scopes[step_id];
+  auto linked_scope = scopes[step_id + offset];
+  for (auto& attr : memories) {
+    auto mem = scope->FindVar(attr.pre_var)->GetMutable<Tensor>();
+    auto linked_mem = linked_scope->FindVar(attr.var)->GetMutable<Tensor>();
+    if (infer_shape_mode) {
+      mem->Resize(linked_mem->dims());
+    } else {
+      mem->ShareDataWith<float>(*linked_mem);
+    }
+  }
+}
+
+void InitArgument(const ArgumentName& name,
+                  Argument* arg,
+                  const OperatorBase& op) {
+  arg->step_net = op.Input(name.step_net);
+  arg->step_scopes = op.Output(name.step_scopes);
+
+  auto inlinks = op.Inputs(name.inlinks);
+  auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
+  PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
+                 "the size of inlinks and inlink_alias don't match:%d,%d",
+                 inlinks.size(),
+                 inlink_alias.size());
+  for (size_t i = 0; i < inlinks.size(); ++i) {
+    rnn::Link link;
+    link.external = inlinks[i];
+    link.internal = inlink_alias[i];
+    (arg->inlinks).push_back(link);
+  }
+
+  auto outlinks = op.Outputs(name.outlinks);
+  auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
+  PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
+                 "the size of outlinks and outlink_alias don't match:%d,%d",
+                 outlinks.size(),
+                 outlink_alias.size());
+  for (size_t i = 0; i < outlinks.size(); ++i) {
+    rnn::Link link;
+    link.external = outlinks[i];
+    link.internal = outlink_alias[i];
+    (arg->outlinks).push_back(link);
+  }
+
+  auto boot_memories = op.Inputs(name.boot_memories);
+
+  // attributes
+  auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
+  auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);
+
+  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
+                 "the size of memories, boot_memories don't match:%d,%d",
+                 memories.size(),
+                 boot_memories.size());
+  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
+                 "the size of pre_memories, boot_memories don't match:%d,%d",
+                 pre_memories.size(),
+                 boot_memories.size());
+  PADDLE_ENFORCE(memories.size() > 0, "more than 1 memories should be set");
+
+  for (size_t i = 0; i < memories.size(); ++i) {
+    rnn::MemoryAttr mem_attr;
+    mem_attr.var = memories[i];
+    mem_attr.pre_var = pre_memories[i];
+    mem_attr.boot_var = boot_memories[i];
+    (arg->memories).push_back(mem_attr);
+  }
+}
+
+}  // namespace rnn
+
+void RecurrentAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
+                 ->GetMutable<Tensor>()
+                 ->dims()[0];
+  CreateScopes(scope);
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(
+      step_scopes, arg_->inlinks, seq_len_, true /*infer_shape_mode*/);
+  InitMemories(step_scopes[0], true /*infer_shape_mode*/);
+  Variable* net = scope.FindVar(arg_->step_net);
+  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
+  for (size_t i = 0; i < seq_len_; i++) {
+    if (i > 0) {
+      rnn::LinkMemories(
+          step_scopes, arg_->memories, i, -1, true /*infer_shape_mode*/);
+    }
+    net->GetMutable<NetOp>()->InferShape(*step_scopes[i]);
+  }
+  rnn::ConcatOutputs(
+      step_scopes, arg_->outlinks, seq_len_, true /*infer_shape_mode*/);
+}
+
+void RecurrentAlgorithm::Run(const Scope& scope,
+                             const platform::DeviceContext& dev_ctx) const {
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(
+      step_scopes, arg_->inlinks, seq_len_, false /*infer_shape_mode*/);
+  InitMemories(step_scopes[0], false /*infer_shape_mode*/);
+  Variable* net = scope.FindVar(arg_->step_net);
+
+  for (size_t step_id = 0; step_id < seq_len_; step_id++) {
+    if (step_id > 0) {
+      rnn::LinkMemories(
+          step_scopes, arg_->memories, step_id, -1, false /*infer_shape_mode*/);
+    }
+    net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
+  }
+  rnn::ConcatOutputs(
+      step_scopes, arg_->outlinks, seq_len_, false /*infer_shape_mode*/);
+}
+
+void RecurrentAlgorithm::CreateScopes(const Scope& scope) const {
+  // TODO(xxx) Only two scopes are needed for inference, this case will be
+  // supported later.
+  auto step_scopes =
+      scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
+
+  if (seq_len_ > step_scopes->size()) {
+    for (size_t i = step_scopes->size(); i < seq_len_; ++i) {
+      auto& step_scope = scope.NewScope();
+
+      // Now all variables in scope must be created outside of op.
+      auto net_op = scope.FindVar(arg_->step_net)->GetMutable<NetOp>();
+      for (auto& input : net_op->inputs_) {
+        // the weight are located in parent scope
+        if (!step_scope.FindVar(input)) step_scope.NewVar(input);
+      }
+      for (auto& output : net_op->outputs_) {
+        step_scope.NewVar(output);
+      }
+      step_scopes->emplace_back(&step_scope);
+    }
+  }
+}
+
+void RecurrentAlgorithm::InitMemories(Scope* step_scope,
+                                      bool infer_shape_mode) const {
+  for (auto& attr : arg_->memories) {
+    Tensor* pre_mem = step_scope->NewVar(attr.pre_var)->GetMutable<Tensor>();
+    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
+                   "memory [%s]'s boot variable [%s] not exists",
+                   attr.var,
+                   attr.boot_var);
+    Tensor* boot_mem = step_scope->FindVar(attr.boot_var)->GetMutable<Tensor>();
+    if (infer_shape_mode) {
+      pre_mem->Resize(boot_mem->dims());
+    } else {
+      pre_mem->ShareDataWith<float>(*boot_mem);
+    }
+  }
+}
+
+const rnn::ArgumentName RecurrentOp::kArgName{"step_net",
+                                              "step_scopes",
+                                              "inlinks",
+                                              "outlinks",
+                                              "inlink_alias",
+                                              "outlink_alias",
+                                              "memories",
+                                              "pre_memories",
+                                              "boot_memories"};
+
+const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net",
+                                                      "step_scopes",
+                                                      "outlink@grad",
+                                                      "inlink@grad",
+                                                      "inlink_alias",
+                                                      "outlink_alias",
+                                                      "memories",
+                                                      "pre_memories",
+                                                      "boot_memories@grad"};
+
+void RecurrentOp::Init() {
+  OperatorBase::Init();
+  std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+  rnn::InitArgument(kArgName, arg.get(), *this);
+  alg_.Init(std::move(arg));
+}
+
+class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+public:
+  RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
+                                         OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    const auto& name = RecurrentOp::kArgName;
+    // inputs and outputs stored in proto
+    AddInput(name.inlinks,
+             "the inputs that need to be segmented for each step.")
+        .SetMultiple();
+    AddInput(name.boot_memories, "variables to initialize memories.")
+        .SetMultiple();
+    AddInput(name.step_net, "network shared by all steps.");
+
+    AddOutput(name.outlinks, "the outputs that need to concated for all steps.")
+        .SetMultiple();
+    AddOutput(name.step_scopes, "step scopes");
+
+    // Attributes stored in AttributeMap
+    AddAttr<std::vector<std::string>>(name.inlink_alias, "alias of inlinks");
+    AddAttr<std::vector<std::string>>(name.outlink_alias, "alias of outlinks");
+    AddAttr<std::vector<std::string>>(name.pre_memories,
+                                      "names of pre-memories");
+    AddAttr<std::vector<std::string>>(name.memories, "names of memories");
+
+    AddComment("This is a recurrent group operator.");
+  }
+};
+
+void RecurrentGradientAlgorithm::Run(
+    const Scope& scope, const platform::DeviceContext& dev_ctx) const {
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(
+      step_scopes, arg_->inlinks, seq_len_, false /*infer_shape_mode*/);
+  Variable* net = scope.FindVar(arg_->step_net);
+  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
+  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
+      rnn::LinkMemories(
+          step_scopes, arg_->memories, step_id, 1, false /*infer_shape_mode*/);
+    }
+    net->GetMutable<NetOp>()->Run(*step_scopes[step_id], dev_ctx);
+  }
+  LinkBootMemoryGradients(step_scopes[0], false);
+  rnn::ConcatOutputs(
+      step_scopes, arg_->outlinks, seq_len_, false /*infer_shape_mode*/);
+}
+
+void RecurrentGradientAlgorithm::LinkBootMemoryGradients(
+    Scope* step_scope, bool infer_shape_mode) const {
+  for (auto& attr : arg_->memories) {
+    PADDLE_ENFORCE(step_scope->FindVar(attr.var) != nullptr,
+                   "memory variable [%s] does not exists",
+                   attr.var);
+    PADDLE_ENFORCE(step_scope->FindVar(attr.boot_var) != nullptr,
+                   "boot variable [%s] does not exists",
+                   attr.boot_var);
+    Tensor* mem_grad = step_scope->NewVar(attr.var)->GetMutable<Tensor>();
+    Tensor* boot_mem_grad =
+        step_scope->NewVar(attr.boot_var)->GetMutable<Tensor>();
+    if (infer_shape_mode) {
+      boot_mem_grad->Resize(mem_grad->dims());
+    } else {
+      boot_mem_grad->ShareDataWith<float>(*mem_grad);
+    }
+  }
+}
+
+void RecurrentGradientAlgorithm::InferShape(const Scope& scope) const {
+  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
+                 ->GetMutable<Tensor>()
+                 ->dims()[0];
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(
+      step_scopes, arg_->inlinks, seq_len_, true /*infer_shape_mode*/);
+  Variable* net = scope.FindVar(arg_->step_net);
+  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
+  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
+      rnn::LinkMemories(
+          step_scopes, arg_->memories, step_id, 1, true /*infer_shape_mode*/);
+    }
+    net->GetMutable<NetOp>()->InferShape(*step_scopes[step_id]);
+  }
+  rnn::ConcatOutputs(
+      step_scopes, arg_->outlinks, seq_len_, true /*infer_shape_mode*/);
+  LinkBootMemoryGradients(step_scopes[0], true /*infer_shape_mode*/);
+}
+
+void RecurrentGradientOp::Init() {
+  OperatorBase::Init();
+  std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+  rnn::InitArgument(kArgName, arg.get(), *this);
+  alg_.Init(std::move(arg));
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(recurrent_op,
+            paddle::operators::RecurrentOp,
+            paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
diff --git a/paddle/operators/recurrent_op.h b/paddle/operators/recurrent_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..2a0964fff326500b6215dd4afac63c75d64c4a06
--- /dev/null
+++ b/paddle/operators/recurrent_op.h
@@ -0,0 +1,213 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+using namespace paddle::framework;
+
+namespace rnn {
+
+/**
+ * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
+ *
+ * Memory attributes cached by this op, dims will be infered from
+ * boot memories in father scope. Other attributes are copied from Op's proto
+ * attributes.
+ */
+struct MemoryAttr {
+  // name of current state variable
+  std::string var;
+  // name of previous step's state variable
+  std::string pre_var;
+  // name of the variables to init this memory (same role of `boot_layer` in
+  // PaddlePaddle), which is store in father's scope.
+  std::string boot_var;
+};
+
+struct Link {
+  // input or output links name.
+  std::string internal;
+  // alias to avoid duplicate keys in scopes.
+  std::string external;
+};
+
+struct Argument {
+  std::string step_net;
+  std::string step_scopes;
+  std::vector<Link> inlinks;
+  std::vector<Link> outlinks;
+  std::vector<rnn::MemoryAttr> memories;
+};
+
+struct ArgumentName {
+  std::string step_net;
+  std::string step_scopes;
+  std::string inlinks;
+  std::string outlinks;
+  std::string inlink_alias;   // the alias of inlinks in step net.
+  std::string outlink_alias;  // the alias of outlinks in step net.
+  std::string memories;       // the memory name
+  std::string pre_memories;   // the previous memory name
+  std::string boot_memories;  // the boot memory name
+};
+
+/**
+ * Prepare inputs for each step net.
+ */
+void SegmentInputs(const std::vector<Scope*>& step_scopes,
+                   const std::vector<Link>& inlinks,
+                   const size_t seq_len,
+                   bool infer_shape_mode);
+
+/**
+ * Process outputs of step nets and merge to variables.
+ */
+void ConcatOutputs(const std::vector<Scope*>& step_scopes,
+                   const std::vector<Link>& outlinks,
+                   const size_t seq_len,
+                   bool infer_shape_mode);
+
+void LinkMemories(const std::vector<Scope*>& step_scopes,
+                  const std::vector<MemoryAttr>& memories,
+                  const size_t step_id,
+                  const int offset,
+                  bool infer_shape_mode);
+
+void InitArgument(const ArgumentName& name, Argument* arg);
+
+};  // namespace rnn
+
+// The sequence format in RecurrentOp is Tensor<seq_len, batch_size, dim> now.
+// TODO:
+// 1. No-padding computing for sequences with indifinite length in one batch.
+// 2. Hierarchical RNN for sequence with sub-sequence.
+// 3. Internal Memory.
+// 4. More Complex RNN architecture, such as Gated Feedback RNN.
+//    Refer to: https://arxiv.org/pdf/1502.02367.pdf
+
+class RecurrentAlgorithm {
+public:
+  void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const;
+
+  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
+
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const Scope& scope) const;
+
+protected:
+  /*
+   * The step scopes will be stored in the father scope as a variable.
+   *
+   * NOTE the scopes are reused in both the forward and backward, so just
+   * create once and expand its size if more steps need.
+   */
+  void CreateScopes(const Scope& scope) const;
+
+  const std::vector<Scope*>& GetStepScopes(const Scope& scope) const {
+    return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
+  }
+
+  void InitMemories(Scope* step_scopes, bool infer_shape_mode) const;
+
+private:
+  std::unique_ptr<rnn::Argument> arg_;
+  mutable size_t seq_len_;
+};
+
+class RecurrentGradientAlgorithm {
+  /**
+   * RNN's backward alogorithm.
+   *
+   * To accelerate the development of RecurrentGradientOp, we decouple RNN's
+   * algorithm and `OperatorBase`'s implementation, the former contains the core
+   * implementation of a RNN, and will keep stable even if the framework changes
+   * a
+   * lot, and the latter is a wrapper acts like an dapter for it to make RNN an
+   * operator.
+   */
+public:
+  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
+
+  void Run(const Scope& scope, const platform::DeviceContext& dev_ctx) const;
+
+  void LinkBootMemoryGradients(Scope* step_scopes, bool infer_shape_mode) const;
+
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const Scope& scope) const;
+
+protected:
+  inline const std::vector<Scope*>& GetStepScopes(const Scope& scope) const {
+    return *scope.FindVar(arg_->step_scopes)->GetMutable<std::vector<Scope*>>();
+  }
+
+private:
+  std::unique_ptr<rnn::Argument> arg_;
+  mutable size_t seq_len_;
+};
+
+class RecurrentOp final : public OperatorBase {
+public:
+  void Init() override;
+
+  /**
+   * InferShape must be called before Run.
+   */
+  virtual void InferShape(const Scope& scope) const override {
+    alg_.InferShape(scope);
+  }
+
+  virtual void Run(const Scope& scope,
+                   const platform::DeviceContext& dev_ctx) const override {
+    alg_.Run(scope, dev_ctx);
+  }
+
+  static const rnn::ArgumentName kArgName;
+
+private:
+  RecurrentAlgorithm alg_;
+};
+
+class RecurrentGradientOp final : public OperatorBase {
+public:
+  void Init() override;
+
+  /**
+   * InferShape must be called before Run.
+   */
+  virtual void InferShape(const Scope& scope) const override {
+    alg_.InferShape(scope);
+  }
+
+  virtual void Run(const Scope& scope,
+                   const platform::DeviceContext& dev_ctx) const override {
+    alg_.Run(scope, dev_ctx);
+  }
+
+  static const rnn::ArgumentName kArgName;
+
+private:
+  RecurrentGradientAlgorithm alg_;
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/recurrent_op_test.cc b/paddle/operators/recurrent_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..91f2972ca49953fd7a627289fa37db32916d85cd
--- /dev/null
+++ b/paddle/operators/recurrent_op_test.cc
@@ -0,0 +1,392 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/operators/recurrent_op.h"
+
+namespace paddle {
+namespace operators {
+
+class RecurrentOpTest : public ::testing::Test {
+protected:
+  virtual void SetUp() override {
+    CreateGlobalVariables();
+    CreateStepNet();
+    CreateRNNOp();
+  }
+
+  virtual void TearDown() override {}
+
+  void CreateGlobalVariables() {
+    // create input, and init content
+    LOG(INFO) << "create global variable x";
+    for (auto inlink : std::vector<std::string>{"x", "x0", "x1", "h"}) {
+      Variable* x = scope_.NewVar(inlink);
+      DDim dims = make_ddim(std::vector<int>{
+          10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/});
+      x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
+    }
+    // create output alias just for test
+    for (auto inlink : std::vector<std::string>{"h@alias"}) {
+      Variable* x = scope_.NewVar(inlink);
+      DDim dims =
+          make_ddim(std::vector<int>{20 /*batch size*/, 30 /*input dim*/});
+      x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
+    }
+
+    LOG(INFO) << "create global variable w";
+    Variable* w = scope_.NewVar("rnn/w");
+    w->GetMutable<Tensor>()->mutable_data<float>(
+        make_ddim(std::vector<int>{30, 30}), platform::CPUPlace());
+
+    for (auto boot : std::vector<std::string>{"h_boot"}) {
+      LOG(INFO) << "create global variable " << boot;
+      Variable* h_boot = scope_.NewVar(boot);
+      h_boot->GetMutable<Tensor>()->mutable_data<float>(
+          make_ddim(std::vector<int>{20 /*batch size*/, 30 /*input dim*/}),
+          platform::CPUPlace());
+    }
+
+    LOG(INFO) << "create variable step_scopes";
+    scope_.NewVar("step_scopes");
+
+    LOG(INFO) << "create variable h";
+    scope_.NewVar("h");
+  }
+
+  void CreateRNNOp() {
+    OpDesc op_desc;
+
+    op_desc.set_type("recurrent_op");
+    // inlinks 0
+    op_desc.add_inputs("x");
+    op_desc.add_inputs("x0");
+    op_desc.add_inputs("x1");
+    // boot_memories 3
+    op_desc.add_inputs("h_boot");
+    // step net 5
+    op_desc.add_inputs("step_net");
+    // outlinks 6
+    op_desc.add_outputs("h");
+    // step scopes 7
+    op_desc.add_outputs("step_scopes");
+
+    auto _input_format = std::vector<int>{
+        0,  // in_link
+        3,  // memories
+        4   // step_net
+    };
+    auto input_format = op_desc.add_attrs();
+    input_format->set_name("input_format");
+    input_format->set_type(paddle::framework::AttrType::INTS);
+    for (auto i : _input_format) {
+      input_format->add_ints(i);
+    }
+
+    auto output_format = op_desc.add_attrs();
+    output_format->set_name("output_format");
+    output_format->set_type(paddle::framework::AttrType::INTS);
+    for (auto i : std::vector<int>{0, 1, 2}) {
+      output_format->add_ints(i);
+    }
+
+    auto inlink_alias = op_desc.add_attrs();
+    inlink_alias->set_name("inlink_alias");
+    inlink_alias->set_type(paddle::framework::AttrType::STRINGS);
+
+    auto outlink_alias = op_desc.add_attrs();
+    outlink_alias->set_name("outlink_alias");
+    outlink_alias->set_type(paddle::framework::AttrType::STRINGS);
+
+    auto pre_memories = op_desc.add_attrs();
+    pre_memories->set_name("pre_memories");
+    pre_memories->set_type(paddle::framework::AttrType::STRINGS);
+
+    auto memories = op_desc.add_attrs();
+    memories->set_name("memories");
+    memories->set_type(paddle::framework::AttrType::STRINGS);
+
+    // create inlink_alias
+    for (const auto& item :
+         std::vector<std::string>{"x@alias", "x0@alias", "x1@alias"}) {
+      inlink_alias->add_strings(item);
+    }
+    // pre memories
+    for (const auto& item : std::vector<std::string>{"rnn/h@pre"}) {
+      pre_memories->add_strings(item);
+    }
+    // memories
+    for (const auto& item : std::vector<std::string>{"rnn/h"}) {
+      memories->add_strings(item);
+    }
+    // output alias
+    for (const auto& item : std::vector<std::string>{"h@alias"}) {
+      outlink_alias->add_strings(item);
+    }
+
+    rnn_op_ = OpRegistry::CreateOp(op_desc);
+
+    LOG(INFO) << "rnn_op finish init";
+  }
+
+  void CreateStepNet() {
+    LOG(INFO) << "create variable step_net";
+    Variable* var = scope_.NewVar("step_net");
+    auto net = var->GetMutable<NetOp>();
+    net->AddOp(
+        OpRegistry::CreateOp("mul", {"rnn/h@pre", "rnn/w"}, {"rnn/s"}, {}));
+
+    net->AddOp(
+        OpRegistry::CreateOp("add_two", {"x@alias", "rnn/s"}, {"rnn/h"}, {}));
+    net->CompleteAddOp();
+  }
+
+  // father scope
+  Scope scope_;
+  std::shared_ptr<OperatorBase> rnn_op_;
+};
+
+TEST_F(RecurrentOpTest, Run) {
+  platform::CPUDeviceContext ctx;
+  rnn_op_->InferShape(scope_);
+  rnn_op_->Run(scope_, ctx);
+}
+
+class RecurrentGradientAlgorithmTest : public ::testing::Test {
+protected:
+  virtual void SetUp() override {
+    CreateGlobalVariables();
+    CreateStepScopes();
+    CreateStepNet();
+    CreateRNNGradientAlgorithm();
+
+    // segment inputs
+    SegmentInputs();
+    // link forward memories
+    LinkeMemories();
+  }
+
+  virtual void TearDown() override {}
+
+  void CreateGlobalVariables() {
+    // inputs: x
+    LOG(INFO) << "create global variable x";
+    Variable* x = scope_.NewVar("x");
+    DDim dims =
+        make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/});
+    x->GetMutable<Tensor>()->mutable_data<float>(dims, platform::CPUPlace());
+    // inputs: h_boot
+    LOG(INFO) << "create global variable h_boot";
+    Variable* h_boot = scope_.NewVar("h_boot");
+    h_boot->GetMutable<Tensor>()->mutable_data<float>(
+        make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace());
+    // inputs: w
+    LOG(INFO) << "create global variable w";
+    Variable* w = scope_.NewVar("rnn/w");
+    w->GetMutable<Tensor>()->mutable_data<float>(make_ddim({30, 30}),
+                                                 platform::CPUPlace());
+    // inputs: h_grad
+    LOG(INFO) << "create variable h_grad";
+    Variable* dh = scope_.NewVar("h_grad");
+    dh->GetMutable<Tensor>()->mutable_data<float>(make_ddim({10, 20, 30}),
+                                                  platform::CPUPlace());
+    // inputs: step_scopes
+    LOG(INFO) << "create variable step_scopes";
+    scope_.NewVar("step_scopes");
+    // inputs: step_net
+    LOG(INFO) << "create variable step_net";
+    scope_.NewVar("step_net");
+    // outputs: w_grad
+    LOG(INFO) << "create global variable w_grad";
+    scope_.NewVar("rnn/w_grad");
+    // outputs: x_grad
+    LOG(INFO) << "create global variable x_grad";
+    scope_.NewVar("x_grad");
+    // outputs: h_boot_grad
+    LOG(INFO) << "create global variable h_boot_grad";
+    scope_.NewVar("h_boot_grad");
+  }
+
+  void CreateStepScopes() {
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
+    for (int i = 0; i < 10; ++i) {
+      auto& scope = scope_.NewScope();
+      auto pre_t = scope.NewVar("rnn/pre_h")->GetMutable<Tensor>();
+      pre_t->mutable_data<float>({20, 30}, platform::CPUPlace());
+      auto tensor = scope.NewVar("rnn/h")->GetMutable<Tensor>();
+      tensor->mutable_data<float>({20, 30}, platform::CPUPlace());
+
+      // for unit test of ConcatOutputs
+      auto xg = scope.NewVar("rnn/x_grad")->GetMutable<Tensor>();
+      xg->mutable_data<float>({20, 30}, platform::CPUPlace());
+
+      step_scopes->emplace_back(&scope);
+    }
+
+    // last time step
+    auto g = (*step_scopes)[9]->NewVar("rnn/h_pre_grad")->GetMutable<Tensor>();
+    g->mutable_data<float>({20, 30}, platform::CPUPlace());
+  }
+
+  void CreateRNNGradientAlgorithm() {
+    std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+    arg->step_net = "step_net";
+    arg->step_scopes = "step_scopes";
+    rnn::Link inlink;
+    inlink.external = "h_grad";
+    inlink.internal = "rnn/h_grad";
+    arg->inlinks = std::vector<rnn::Link>{inlink};
+
+    rnn::Link outlink;
+    outlink.external = "x_grad";
+    outlink.internal = "rnn/x_grad";
+    arg->outlinks = std::vector<rnn::Link>{outlink};
+
+    rnn::MemoryAttr mem_attr;
+    mem_attr.pre_var = "rnn/h_pre_grad";
+    mem_attr.var = "rnn/h_grad";
+    mem_attr.boot_var = "h_boot_grad";
+    arg->memories = std::vector<rnn::MemoryAttr>{mem_attr};
+
+    rnn_grad_algo_.Init(std::move(arg));
+  }
+
+  void CreateStepNet() {
+    LOG(INFO) << "create variable step_net";
+    Variable* var = scope_.NewVar("step_net");
+    auto net = var->GetMutable<NetOp>();
+    net->AddOp(OpRegistry::CreateOp("mul",
+                                    {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
+                                    {"rnn/h_pre_grad", "rnn/w_grad"},
+                                    {}));
+
+    net->AddOp(OpRegistry::CreateOp(
+        "add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {}));
+    net->CompleteAddOp();
+  }
+
+  void SegmentInputs() {
+    LOG(INFO) << "segment inputs";
+    std::vector<std::string> inlinks = {"x"};
+    std::vector<std::string> inlinks_alias = {"rnn/x"};
+
+    rnn::Link inlink;
+    inlink.external = "x";
+    inlink.internal = "rnn/x";
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
+    rnn::SegmentInputs(*step_scopes,
+                       std::vector<rnn::Link>{inlink},
+                       10,
+                       true /*infer_shape_mode*/);
+  }
+
+  void LinkeMemories() {
+    LOG(INFO) << "link memories";
+    rnn::MemoryAttr mem_attr;
+    mem_attr.pre_var = "rnn/h_pre";
+    mem_attr.var = "rnn/h";
+    mem_attr.boot_var = "boot_h";
+    std::vector<rnn::MemoryAttr> memories;
+    memories.push_back(mem_attr);
+    auto step_scopes =
+        scope_.FindVar("step_scopes")->GetMutable<std::vector<Scope*>>();
+    for (int i = 1; i < 10; ++i) {
+      rnn::LinkMemories(
+          *step_scopes, memories, i, -1, true /*infer_shape_mode*/);
+    }
+  }
+
+  Scope scope_;
+  RecurrentGradientAlgorithm rnn_grad_algo_;
+};
+
+// TEST_F(RecurrentGradientAlgorithmTest, Run) {
+//   platform::CPUDeviceContext ctx;
+//   rnn_grad_algo_.Run(scope_, ctx);
+// }
+
+}  // namespace operators
+}  // namespace paddle
+
+TEST(RecurrentOp, LinkMemories) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  // create and init step scopes
+  size_t len = 10;
+  std::vector<Scope*> step_scopes;
+  for (size_t i = 0; i < len; ++i) {
+    auto scope = new Scope();
+    scope->NewVar("pre_h");
+    auto tensor = scope->NewVar("h")->GetMutable<Tensor>();
+    float* data = tensor->mutable_data<float>({15, 20}, CPUPlace());
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      data[j] = rand() * (1. / (double)RAND_MAX);
+    }
+    step_scopes.push_back(scope);
+  }
+
+  // create MemoryAttr
+  rnn::MemoryAttr mem_attr;
+  mem_attr.pre_var = "pre_h";
+  mem_attr.var = "h";
+  mem_attr.boot_var = "boot_h";
+  std::vector<rnn::MemoryAttr> memories;
+  memories.push_back(mem_attr);
+
+  for (size_t i = 1; i < len; ++i) {
+    rnn::LinkMemories(step_scopes, memories, i, -1, false /*infer_shape_mode*/);
+  }
+  // check
+  for (size_t i = 0; i < len - 1; ++i) {
+    const float* a =
+        step_scopes[i]->FindVar("h")->GetMutable<Tensor>()->data<float>();
+    const float* b = step_scopes[i + 1]
+                         ->FindVar("pre_h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      ASSERT_FLOAT_EQ(a[j], b[j]);
+    }
+  }
+
+  for (int i = len - 2; i >= 0; --i) {
+    rnn::LinkMemories(step_scopes, memories, i, 1, false /*infer_shape_mode*/);
+  }
+  // check
+  for (int i = len - 2; i >= 0; --i) {
+    const float* a =
+        step_scopes[i]->FindVar("pre_h")->GetMutable<Tensor>()->data<float>();
+    const float* b =
+        step_scopes[i + 1]->FindVar("h")->GetMutable<Tensor>()->data<float>();
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      ASSERT_FLOAT_EQ(a[j], b[j]);
+    }
+  }
+
+  for (auto s : step_scopes) {
+    delete s;
+  }
+}
+
+USE_OP(add_two);
+USE_OP(mul);
diff --git a/paddle/operators/rnn_design.md b/paddle/operators/rnn_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d38b9a0ad225fd8e0c1bb037474b292b1887f5b
--- /dev/null
+++ b/paddle/operators/rnn_design.md
@@ -0,0 +1,239 @@
+# RNN 变长输入设计
+对变长序列的学习，现有主流框架比如 tensorflow, pytorch, caffe2, mxnet 等均使用了padding的方式，
+即将一个mini-batch内不同长度的序列补0到固定长度参与计算。
+
+现有Paddle包括 `RecurrentLayerGroup` 在内的RNN均实现了无padding的变长序列支持，本文也将基于该模块的思路，设计重构后的变长序列支持。
+
+## 背景介绍
+由于tensor必须有明确的shape，因此基于tensor 的主流框架在存储变长序列时，
+必须用zero-padding的方式将变长序列补全为固定shape的tensor。
+
+由于padding是一种框架实现变长序列的妥协， 从用户角度，在使用RNN类模型时自然会比较介意padding的存在，
+因此会有pytorch中对非padding方式变长序列支持长篇的讨论[3]。
+
+由于padding对内存和计算会有额外的消耗，tensorflow和mxnet均使用了bucketing来进行优化[1][2]，
+但不管是padding还是bucket，对于用户都是额外的使用负担。
+
+因此，**paddle原生支持变长序列的方式，能直接满足用户对变长序列的最直接的需求，在当前主流平台中可以算是一大优势**。
+
+但对变长序列的支持，需要对目前框架做一些修改，下面讨论如何在最小修改下支持变长序列。
+
+## 多层序列数据格式 `LODTensor`
+目前 Paddle 会将一个mini-batch内的数据存储在一维的内存上，
+额外使用 `Argument.sequenceStartPositions` 来存储每个句子的信息。
+
+Paddle里使用 `Argument.subSequenceStartPositions` 来存储2层的序列信息，更高维度的序列则无法直接支持；
+
+为了支持 `N-level` 序列的存储，本文将序列信息定义成如下数据结构:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+或者更明确的定义
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+这里的每一个 `level_t` 存储一个粒度(level)的偏移信息，和paddle目前做法一致。
+
+为了更透明地传递序列信息，我们引入了一种新的tensor 称为 `LODTensor`[4]，
+其关于tensor相关的接口都直接继承自 `Tensor`，但另外添加了序列相关接口。
+如此，在操作一个 `LODTensor` 时，普通 `Op` 直接当成 `Tensor` 使用，
+而操作序列的 `Op` 会额外操作 `LODTensor` 的变长序列操作的相关接口。
+
+`LODTensor` 具体定义如下：
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info sould not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared <
+                     std::vector<std::vector<int>>(other.lod_start_pos_.begin(),
+                                                   other.lod_start_pos_.end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+其中， `lod_start_pos_` 使用了 `shared_ptr` 来减少存储和复制的代价，
+可以认为 `LODTensor` 是 `Tensor` 的扩展，几乎完全兼容原始 `Tensor` 的使用。
+
+## 框架支持
+### 框架现有的 `Tensor` 调用替换为 `LODTensor`
+为了实现 `LODTensor` 的传递，框架里很多 `Tensor` 都需要变成 `LODTensor`，
+简单实现，直接 **把之前所有的`Tensor` 全部替换成 `LODTensor`，这里可以直接修改 `pybind.cc` 里面创建`Tensor`的接口**。
+
+此外，用户有可能需要感知序列的存在（比如序列的可视化需要解析模型中输出的序列），因此一些序列操作的API也需要暴露到 python 层。
+
+### `lod_start_pos` 随着Op调用链传递
+框架需要支持下列特性，以实现`lod_start_pos`的传递：
+
+1. 以 `shared_ptr` 的方式实现传递
+    - 不修改 `lod_start_pos` 内容的作为 consumer
+    - 修改 `lod_start_pos` 的作为 producer
+    - 约定 consumer 只需要复制传递过来的 `shared_ptr`
+      - producer 需要创建自己的独立的内存，以存储自己独立的修改，并暴露 `shared_ptr` 给后续 consumer
+    - 由于传递过程是以复制`shared_ptr`的方式实现，因此框架只需要传递一次 `lod_start_pos`
+
+2. 对于不感知 `lod_start_pos` 的Op足够透明
+3. 需要修改 `lod_start_pos` 的producer Op可以在 `Run` 时更新自己的 `lod_start_pos` 数据 
+
+具体的设计分为以下3小节
+
+#### `load_start_pos` 的传递
+
+- 对于不需要修改 `lod_start_pos` 的情况，调用 LODTensor的 `ShareConstLODFrom` 接口实现复制
+- 需要修改的，调用`ShareMutableLODFrom` 接口自己分配内存以存储修改
+
+#### 框架透明
+传递这一步需要加入到网络跑之前的初始化操作中，并且只需要初始化一次，基于当前框架设计的初步方案如下
+
+- 在 Op 的 `attrs` 中添加一项 `do_mutate_lod_info` 的属性，默认为 `false`
+  - 有需要修改 `lod_start_pos` 的Op需要在定义 `OpProto` 时设置为 `true`
+- `OperatorBase` 的 `InferShape` 中会读取 `do_mutate_lod_info` ，并且调用 `LODTensor` 相关的方法实现 `lod_start_pos` 的复制。
+- `OperatorBase` 中添加一个 member `is_lod_inited{false}` 来保证传递只进行一次
+
+一些逻辑如下
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_load_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_load_info");
+      // find a input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_load_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(load_input);
+        }
+      }
+      is_pod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+如此，`lod_start_pos` 的信息的传递对非OLD的Op的实现是完全透明的。
+
+#### `lod_start_pos` 的更新
+上一小节介绍到，对于需要修改 `load_start_pos` 的Op，`OperatorBase` 会分配一块自己的内存以存储修改，
+Op在 `Run` 的实现中，操作更新自己的 `load_start_pos` ，
+而所有依赖其 outputs 的 op 会通过共享的指针自动获取到其更新。
+
+## 根据长度排序
+按照长度排序后，从前往后的时间步的batch size会自然地递减，可以直接塞入 Net 做batch计算
+
+比如原始的输入：
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+经过 `SegmentInputs` 之后，每个会有4个时间步，每个时间步的输入如下（纵向排列）
+
+```
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+
+为了追踪排序前后序列的变化，这里用
+```c++
+struct SortedSeqItem {
+   void *start{nullptr};
+   void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+来追踪序列排序后的位置，并添加一个新的接口 
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+由于输入序列的顺序变化，以下现有的接口需要针对性地修改：
+
+- InitMemories, memory需要根据 `sorted_seqs` 重新排列
+- SetmentInputs
+- ConcatOutputs
+
+此外，由于 `sorted_seqs` 需要被 `RecurrentGradientOp` 复用，因此会变成 `RecurrentOp` 一个新的output输出，
+之后作为 `RecurrentGradientOp` 的一个输入传入。
+
+## InitMemories
+由于序列顺序的变化，`boot_memories` 的batch上的element的顺序也需要对应重新排列。
+
+## SegmentInputs
+`SegmentInputs` 会依赖 `sorted_seqs` 的信息，将原始的序列按照排序后的序列顺序，从横向切割，转为每个step中的inputs。
+
+即下面的转变：
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0    1    2    3
+x    x    x    x
+x    x    x
+x    x
+```
+## ConcatOutputs
+`ConcatOutputs` 需要
+
+- 将每个时间步的输出重新还原为原始输入的序列顺序（以防止Infer阶段顺序打乱）
+- 将每个序列concat 为规则的mini-batch表示
+
+## 参考文献
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ad2b66c8f385c858eb34c7ea766f168de9c817e
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/rowwise_add_op.h"
+namespace paddle {
+namespace operators {
+
+class RowWiseAddOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2UL,
+                   "Two inputs is needed by rowwise add");
+    auto dim0 = ctx.Input<Tensor>(0)->dims();
+    auto dim1 = ctx.Input<Tensor>(1)->dims();
+
+    PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix");
+    PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
+    PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "The output size must be 1");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+  }
+};
+
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+public:
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left input of row-wise add op, must be matrix");
+    AddInput("b", "The right input of row-wise add op, must be vector");
+    AddOutput("Out", "The output of row-wise add op");
+    AddComment(R"DOC(Row-wise Add operator
+
+for i in xrange(X.shape[0]):
+  Out = X[i] + b
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(rowwise_add,
+                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..82338ceccc06653791b26472e18d804f62735649
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.cu
@@ -0,0 +1,5 @@
+#define EIGEN_USE_GPU
+#include "paddle/operators/rowwise_add_op.h"
+
+REGISTER_OP_GPU_KERNEL(rowwise_add,
+                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd4d1128955fb718d3a84dfd96d8c68d7196e9cc
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class RowWiseAddKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto out = context.Output<Tensor>(0);
+    out->mutable_data<T>(context.GetPlace());
+
+    auto input = EigenMatrix<T>::From(*context.Input<Tensor>(0));
+    auto bias = EigenVector<T>::From(*context.Input<Tensor>(1));
+    auto output = EigenMatrix<T>::From(*out);
+
+    const int bias_size = bias.dimension(0);
+    const int rest_size = input.size() / bias_size;
+    Eigen::DSizes<int, 1> one_d(input.size());
+    Eigen::DSizes<int, 1> bcast(rest_size);
+    output.reshape(one_d).device(context.GetEigenDevice<Place>()) =
+        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9a84dc8af3b3e649b776ca8a97dedba1fa3ff48d
--- /dev/null
+++ b/paddle/operators/sgd_op.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sgd_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SGDOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 2, "Input size of SGDOp must be two");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Output size of SGDOp must be one");
+    PADDLE_ENFORCE(ctx.InputVar(0) != nullptr, "inputs[0] mast be set");
+    PADDLE_ENFORCE(ctx.InputVar(1) != nullptr, "inputs[1] mast be set");
+    PADDLE_ENFORCE(ctx.OutputVar(0) != nullptr, "outputs[0] mast be set");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims() == ctx.Input<Tensor>(1)->dims(),
+                   "Two input of SGD Op's dimension must be same.");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+  }
+};
+
+class SGDOpMaker : public OpProtoAndCheckerMaker {
+public:
+  SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("param", "input parameter");
+    AddInput("grad", "input gradient");
+    AddOutput("param_out", "output parameter");
+    AddAttr<float>("learning_rate", "learning rate of sgd");
+    AddComment(R"DOC(
+
+Simplest sgd algorithm.
+
+param_out = param - learning_rate * grad;
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker);
+REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..d79258cbf13c699cfb2afaee229cf96a3e377b5e
--- /dev/null
+++ b/paddle/operators/sgd_op.cu
@@ -0,0 +1,4 @@
+#define EIGEN_USE_GPU
+#include "paddle/operators/sgd_op.h"
+
+REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
\ No newline at end of file
diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c3a240f9a4a5fc7bc4898e82786810cee2f7010
--- /dev/null
+++ b/paddle/operators/sgd_op.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class SGDOpKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& ctx) const override {
+    auto param = ctx.Input<Tensor>("param");
+    auto grad = ctx.Input<Tensor>("grad");
+    auto param_out = ctx.Output<Tensor>(0);
+    float lr = ctx.op_.GetAttr<float>("learning_rate");
+
+    param_out->mutable_data<T>(ctx.GetPlace());
+
+    auto p = EigenVector<T>::Flatten(*param);
+    auto g = EigenVector<T>::Flatten(*grad);
+    auto o = EigenVector<T>::Flatten(*param_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    o.device(place) = p - lr * g;
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sgd_op_test.cc b/paddle/operators/sgd_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..75137259f5e608b259b073101353e5818bb17c92
--- /dev/null
+++ b/paddle/operators/sgd_op_test.cc
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/framework/op_registry.h>
+USE_OP(sgd);
+TEST(SGDOp, GetOpProto) {
+  auto& protos = paddle::framework::OpRegistry::protos();
+  auto it = protos.find("sgd");
+  ASSERT_NE(it, protos.end());
+}
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a81ab262cc6fe7bdff0045259e0030f3d46f503f
--- /dev/null
+++ b/paddle/operators/sigmoid_op.cc
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/operators/sigmoid_op.h"
+namespace paddle {
+namespace operators {
+
+class SigmoidOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+  }
+};
+
+class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+public:
+  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "sigmoid input");
+    AddOutput("Y", "sigmoid output");
+    AddComment("Sigmoid function");
+  }
+};
+
+class SigmoidOpGrad : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {}
+  std::string DebugString() const override {
+    LOG(INFO) << "SigmoidGrad";
+    return "";
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
+REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);
+
+REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c9d11a2e1f9dcc563765c9e8cc1bae6beff57f18
--- /dev/null
+++ b/paddle/operators/sigmoid_op.cu
@@ -0,0 +1,4 @@
+#define EIGEN_USE_GPU
+#include "paddle/operators/sigmoid_op.h"
+
+REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..1412e4398440c8e946d3ab434a50e978079637ab
--- /dev/null
+++ b/paddle/operators/sigmoid_op.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class SigmoidKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
+    output->mutable_data<T>(context.GetPlace());
+
+    auto X = EigenVector<T>::Flatten(*input);
+    auto Y = EigenVector<T>::Flatten(*output);
+    auto place = context.GetEigenDevice<Place>();
+
+    Y.device(place) = 1.0 / (1.0 + (-1.0 * X).exp());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5b59fad7d5f9729b0862f8cd78cb32f94f87f513
--- /dev/null
+++ b/paddle/operators/softmax_op.cc
@@ -0,0 +1,55 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/operators/softmax_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SoftmaxOp : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {
+    PADDLE_ENFORCE(ctx.InputSize() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(ctx.Input<Tensor>(0)->dims().size() == 2,
+                   "The input of softmax op must be matrix");
+    PADDLE_ENFORCE(ctx.OutputSize() == 1,
+                   "Only one output is need for softmax");
+    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
+  }
+};
+
+class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
+public:
+  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input of softmax");
+    AddOutput("Y", "output of softmax");
+    AddComment("Softmax Op");
+  }
+};
+
+class SoftmaxOpGrad : public OperatorWithKernel {
+protected:
+  void InferShape(const InferShapeContext &ctx) const override {}
+  std::string DebugString() const override {
+    LOG(INFO) << "SoftmaxOpGrad";
+    return "";
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
+REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ddf8f6e913ccf450185f377f531bf978f69ed1fc
--- /dev/null
+++ b/paddle/operators/softmax_op.cu
@@ -0,0 +1,5 @@
+#define EIGEN_USE_GPU
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/softmax_op.h"
+
+REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..75c5197697dada58e09f4cda41cea13af56e79a3
--- /dev/null
+++ b/paddle/operators/softmax_op.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/operators/type_alias.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class SoftmaxKernel : public OpKernel {
+public:
+  void Compute(const ExecutionContext& context) const override {
+    auto input = context.Input<Tensor>(0);
+    auto output = context.Output<Tensor>(0);
+    output->mutable_data<T>(context.GetPlace());
+
+    auto logits = EigenMatrix<T>::From(*input);
+    auto softmax = EigenMatrix<T>::From(*output);
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class));
+
+    softmax.device(context.GetEigenDevice<Place>()) = shifted_logits.exp();
+
+    softmax.device(context.GetEigenDevice<Place>()) =
+        (softmax *
+         softmax.sum(along_class)
+             .inverse()
+             .eval()
+             .reshape(batch_by_one)
+             .broadcast(one_by_class));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h
new file mode 100644
index 0000000000000000000000000000000000000000..9049ffda1da5408411687474c5ed0c76c2394623
--- /dev/null
+++ b/paddle/operators/type_alias.h
@@ -0,0 +1,58 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using OpKernel = framework::OpKernel;
+using InferShapeContext = framework::InferShapeContext;
+using ExecutionContext = framework::ExecutionContext;
+using Variable = framework::Variable;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using Tensor = framework::Tensor;
+using OperatorWithKernel = framework::OperatorWithKernel;
+using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+using OpProto = framework::OpProto;
+using OpAttrChecker = framework::OpAttrChecker;
+using CPUPlace = platform::CPUPlace;
+using GPUPlace = platform::GPUPlace;
+using NetOp = framework::NetOp;
+using OpRegistry = framework::OpRegistry;
+using OperatorBase = framework::OperatorBase;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc
index 54662dc37891d3211950453b210db4b475837df4..eb7125adee769c97e16986cabf06ea389bf4c143 100644
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
@@ -44,8 +44,8 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto,
                                           const int state_len) {
   paddle_optimizer* optimizer = new paddle_optimizer;
   std::string config(config_proto, config_proto + config_proto_len);
-  Tensor* parameter =
-      new Tensor(reinterpret_cast<float*>(param_buffer), num_bytes);
+  Tensor* parameter = new Tensor(reinterpret_cast<float*>(param_buffer),
+                                 num_bytes / sizeof(float));
   optimizer->impl = ParameterOptimizer::Create(config, parameter);
   if (state != nullptr) {
     std::string s(state, state + state_len);
@@ -65,7 +65,8 @@ int paddle_update_parameter(paddle_optimizer* o,
                             int num_bytes) {
   // TOOD(zhihong): datatype not work. need to add the runtime datatype
   auto grad_type = reinterpret_cast<const float*>(grad_buffer);
-  Tensor* gradient = new Tensor(const_cast<float*>(grad_type), num_bytes);
+  Tensor* gradient =
+      new Tensor(const_cast<float*>(grad_type), num_bytes / sizeof(float));
   o->impl->Update(gradient);
   return PADDLE_SUCCESS;
 }
diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp
index 4e6254d9e4dab48279b4a880695959526d30d70c..edf4ae37a9beee2911d23dd1ab23e67a18065b1b 100644
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
@@ -1,3 +1,19 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
 #include "parameter_optimizer.h"
 #include <cmath>
 #include <map>
@@ -5,21 +21,18 @@
 #include "gtest/gtest.h"
 #include "lr_policy.h"
 
-using namespace paddle;
-using namespace paddle::optimizer;
-
-Tensor* FillTensor(size_t size) {
-  Tensor* param = new Tensor(size);
-  Tensor& p = *param;
+paddle::optimizer::Tensor* FillTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
   for (size_t i = 0; i < p.size(); ++i) {
     p[i] = (float)rand() / (float)RAND_MAX;
   }
   return param;
 }
 
-Tensor* FixedTensor(size_t size) {
-  Tensor* param = new Tensor(size);
-  Tensor& p = *param;
+paddle::optimizer::Tensor* FixedTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
   for (size_t i = 0; i < p.size(); ++i) {
     p[i] = i;
   }
@@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) {
 
 class OptimizerTest : public testing::Test {
 public:
-  // init tensor shape
+  virtual ~OptimizerTest() {}
+  // init paddle::optimizer::Tensor shape
   const size_t kSize = 5;
 
   virtual void SetUp() {
@@ -38,34 +52,36 @@ public:
   virtual void TearDown() {}
 
   void CreateSGD() {
-    Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(OptimizerConfig::SGD);
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::SGD);
     config_.mutable_sgd()->set_momentum(0.0);
     config_.mutable_sgd()->set_decay(0.0);
     config_.mutable_sgd()->set_nesterov(false);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
     config_.mutable_const_lr()->set_learning_rate(0.1);
     std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
     opts_.push_back(opt);
   }
 
   void CreateAdam() {
-    Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(OptimizerConfig::Adam);
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::Adam);
     config_.mutable_adam()->set_beta_1(0.9);
     config_.mutable_adam()->set_beta_2(0.1);
     config_.mutable_adam()->set_epsilon(1e-3);
     config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
     config_.mutable_const_lr()->set_learning_rate(0.1);
     std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
     opts_.push_back(opt);
   }
 
   void TestGetWeight() {
-    Tensor* p = FixedTensor(kSize);
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
     for (size_t i = 0; i < opts_.size(); ++i) {
       int s = 0;
       float* newp = (float*)opts_[i]->get_weight(&s);
@@ -76,7 +92,7 @@ public:
   }
 
   void TestUpdate() {
-    Tensor* g = FixedTensor(kSize);
+    paddle::optimizer::Tensor* g = FixedTensor(kSize);
     for (size_t i = 0; i < opts_.size(); ++i) {
       opts_[i]->Update(g);
     }
@@ -91,8 +107,8 @@ public:
   }
 
 private:
-  std::vector<ParameterOptimizer*> opts_;
-  OptimizerConfig config_;
+  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
+  paddle::OptimizerConfig config_;
 };
 
 TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp
index d2454140dc243b40ed8348578360b30894213838..e4d97cbdba545c4ba5adf5b30efd3fc9f3f744ee 100644
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
@@ -1,19 +1,32 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
 #include "serialization.h"
 #include "gtest/gtest.h"
 
-using namespace paddle;
-using namespace paddle::optimizer;
-
 TEST(TensorToProto, Case1) {
-  Tensor t(3), t1(3);
+  paddle::optimizer::Tensor t(3), t1(3);
   for (size_t i = 0; i < t.size(); ++i) {
     t[i] = i;
     t1[i] = 0;
   }
 
-  TensorProto proto;
-  TensorToProto(t, &proto);
-  ProtoToTensor(proto, &t1);
+  paddle::TensorProto proto;
+  paddle::optimizer::TensorToProto(t, &proto);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
   for (size_t i = 0; i < t1.size(); ++i) {
     EXPECT_EQ(t1[i], t[i]);
   }
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index cc6b52e9271ff00e91f2ca172815c543eb99261d..bd77bb7daa50e0b273f110624ddf6f4b79a3ceab 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,6 +1,20 @@
-add_subdirectory(dynload)
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
 
-nv_test(cuda_test SRCS cuda_test.cu)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+
+add_subdirectory(dynload)
+
+cc_test(enforce_test SRCS enforce_test.cc)
+
+IF(WITH_GPU)
+    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
+ELSE()
+    set(GPU_CTX_DEPS)
+ENDIF()
+
+cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
+nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..78e1fa9df56b1623bfd9a53c6a37524d29648afc
--- /dev/null
+++ b/paddle/platform/cpu_info.cc
@@ -0,0 +1,66 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "gflags/gflags.h"
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+inline size_t CpuTotalPhysicalMemory() {
+#ifdef __APPLE__
+  int mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  int64_t size = 0;
+  size_t len = sizeof(size);
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  return 0L;
+#else
+  int64_t pages = sysconf(_SC_PHYS_PAGES);
+  int64_t page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+#endif
+}
+
+size_t CpuMaxAllocSize() {
+  // For distributed systems, it requires configuring and limiting
+  // the fraction of memory to use.
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
+size_t CpuMaxChunkSize() {
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
+  return CpuMaxAllocSize() / 32;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cuda.h b/paddle/platform/cpu_info.h
similarity index 51%
rename from paddle/platform/cuda.h
rename to paddle/platform/cpu_info.h
index 5ed36c0f02549e29fc680eba097c9d851fa346e5..8df7c7b4bca5bc88f6ed95d6ab82c81b73918e92 100644
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cpu_info.h
@@ -14,37 +14,19 @@ limitations under the License. */
 
 #pragma once
 
-#ifndef PADDLE_ONLY_CPU
-
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
+#include <stddef.h>
 
 namespace paddle {
 namespace platform {
 
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-int GetDeviceCount(void) {
-  int count;
-  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
-  return count;
-}
+//! Get the maximum allocation size for a machine.
+size_t CpuMaxAllocSize();
 
-int GetCurrentDeviceId(void) {
-  int device_id;
-  throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed");
-  return device_id;
-}
+//! Get the minimum chunk size for buddy allocator.
+size_t CpuMinChunkSize();
 
-void SetDeviceId(int device_id) {
-  throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed");
-}
+//! Get the maximum chunk size for buddy allocator.
+size_t CpuMaxChunkSize();
 
 }  // namespace platform
 }  // namespace paddle
-
-#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8fb195aa7c0a41b7417ff5cf63394046e9c72267
--- /dev/null
+++ b/paddle/platform/cpu_info_test.cc
@@ -0,0 +1,21 @@
+#include "paddle/platform/cpu_info.h"
+#include "paddle/string/printf.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+DECLARE_double(fraction_of_cpu_memory_to_use);
+
+TEST(CpuMemoryUsage, Print) {
+  std::stringstream ss;
+  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
+  float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
+
+  std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
+                                       use_percent, memory_size)
+            << std::endl;
+}
diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu
deleted file mode 100644
index 4067dda2f19f7661722d8a14a27c7b32ed6afc92..0000000000000000000000000000000000000000
--- a/paddle/platform/cuda_test.cu
+++ /dev/null
@@ -1,59 +0,0 @@
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include "gtest/gtest.h"
-
-#define CHECK_ERR(x)                 \
-  if (x != cudaSuccess) {            \
-    fprintf(stderr,                  \
-            "%s in %s at line %d\n", \
-            cudaGetErrorString(err), \
-            __FILE__,                \
-            __LINE__);               \
-    exit(-1);                        \
-  }
-
-__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < n) {
-    d_C[i] = d_A[i] + d_B[i];
-  }
-}
-
-TEST(Cuda, Equality) {
-  int n = 10;
-  // Memory allocation for h_A, h_B and h_C (in the host)
-  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
-  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
-  float h_C[10];
-  float *d_A, *d_B, *d_C;
-  cudaError_t err;
-  // Memory allocation for d_A, d_B and d_C (in the device)
-  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  // Copying memory to device
-  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  // Calling the kernel
-  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);
-
-  // Copying results back to host
-  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
-  CHECK_ERR(err);
-
-  EXPECT_EQ(h_C[0], 1.0);
-  for (int i = 1; i < n - 1; ++i) {
-    EXPECT_EQ(h_C[i], 11.0);
-  }
-  EXPECT_EQ(h_C[9], 1.0);
-}
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a928e097787db9deebe1c6eab263190caacac7eb
--- /dev/null
+++ b/paddle/platform/device_context.cc
@@ -0,0 +1,120 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace platform {
+
+template <>
+Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
+    const {
+  return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
+}
+
+CPUDeviceContext::CPUDeviceContext() {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+
+#ifndef PADDLE_ONLY_CPU
+
+template <>
+Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
+  return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
+}
+
+CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+  SetDeviceId(place_.device);
+  // TODO(qijun) Pass a created cuda stream to Eigen::CudaStreamDevice directly
+  // here will cause segment fault. We must implement a class derived from
+  // Eigen::StreamInterface, and reinitialize it with a cuda stream and a gpu id
+  // later. Please refer to the implementation of class EigenCudaStreamDevice
+  // in TensorFlow.
+  //
+  // We find that CUDA 7 introduces a new option, the per-thread default stream,
+  // that has two effects. Please refer to https://devblogs.nvidia.com/
+  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
+  //
+  // So, we decide to use default stream and add –default-stream per-thread nvcc
+  // flag. Than, two threads with two CUDADeviceContexts will run parallelly.
+  eigen_stream_.reset(new Eigen::CudaStreamDevice());
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+}
+
+CUDADeviceContext::~CUDADeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  if (cublas_handle_) {
+    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  }
+
+  if (cudnn_handle_) {
+    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+  }
+
+  if (curand_generator_) {
+    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
+  }
+  eigen_stream_.reset();
+  eigen_device_.reset();
+}
+
+Place CUDADeviceContext::GetPlace() const { return place_; }
+
+void CUDADeviceContext::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+}
+
+Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+cublasHandle_t CUDADeviceContext::cublas_handle() {
+  if (!cublas_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  }
+  return cublas_handle_;
+}
+
+cudnnHandle_t CUDADeviceContext::cudnn_handle() {
+  if (!cudnn_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  }
+  return cudnn_handle_;
+}
+
+curandGenerator_t CUDADeviceContext::curand_generator() {
+  if (!curand_generator_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
+                                                  CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(
+        dynload::curandSetPseudoRandomGeneratorSeed(curand_generator_, seed_));
+  }
+  return curand_generator_;
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..2038fafe2e15ec2631726643695ac6cbc317fed9
--- /dev/null
+++ b/paddle/platform/device_context.h
@@ -0,0 +1,101 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+
+#ifndef PADDLE_ONLY_CPU
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/gpu_info.h"
+#define EIGEN_USE_GPU
+#endif
+#include <memory>
+#include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace platform {
+
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  template <typename DeviceType>
+  DeviceType* get_eigen_device() const;
+};
+
+class CPUDeviceContext : public DeviceContext {
+ public:
+  CPUDeviceContext();
+  CPUDeviceContext(CPUPlace);
+  virtual ~CPUDeviceContext() {}
+
+  Eigen::DefaultDevice* eigen_device() const;
+
+  Place GetPlace() const override;
+
+ private:
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+};
+
+#ifndef PADDLE_ONLY_CPU
+
+class CUDADeviceContext : public DeviceContext {
+ public:
+  explicit CUDADeviceContext(GPUPlace);
+  virtual ~CUDADeviceContext();
+
+  /*! \brief  Wait for all operations completion in the stream. */
+  void Wait() const;
+
+  /*! \brief  Return place in the device context. */
+  Place GetPlace() const override;
+
+  /*! \brief  Return eigen device in the device context. */
+  Eigen::GpuDevice* eigen_device() const;
+
+  // clang-format off
+  /*! \brief  Return cublas handle in the device context. */
+  cublasHandle_t    cublas_handle   ();
+
+  /*! \brief  Return cudnn  handle in the device context. */
+  cudnnHandle_t     cudnn_handle    ();
+
+  /*! \brief  Return curand handle in the device context. */
+  curandGenerator_t curand_generator();
+  // clang-format on
+
+ private:
+  GPUPlace place_;
+
+ private:
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
+  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
+
+ private:
+  uint64_t seed_;
+
+  // clang-format off
+  cudnnHandle_t     cudnn_handle_     = nullptr;
+  cublasHandle_t    cublas_handle_    = nullptr;
+  curandGenerator_t curand_generator_ = nullptr;
+  // clang-format on
+};
+
+#endif
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..af2ce17fc2238dda62e9888ebe9426edcd55d2bc
--- /dev/null
+++ b/paddle/platform/device_context_test.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context.h"
+#include "gtest/gtest.h"
+
+using DEVICE_GPU = Eigen::GpuDevice;
+TEST(Device, Init) {
+  int count = paddle::platform::GetDeviceCount();
+  for (int i = 0; i < count; i++) {
+    paddle::platform::DeviceContext* device_context =
+        new paddle::platform::CUDADeviceContext(i);
+    Eigen::GpuDevice* gpu_device =
+        device_context->template get_eigen_device<DEVICE_GPU>();
+    ASSERT_NE(nullptr, gpu_device);
+    delete device_context;
+  }
+}
+
+TEST(Device, CUDADeviceContext) {
+  int count = paddle::platform::GetDeviceCount();
+  for (int i = 0; i < count; i++) {
+    paddle::platform::CUDADeviceContext* device_context =
+        new paddle::platform::CUDADeviceContext(i);
+    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
+    ASSERT_NE(nullptr, gpu_device);
+    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+    ASSERT_NE(nullptr, cudnn_handle);
+    cublasHandle_t cublas_handle = device_context->cublas_handle();
+    ASSERT_NE(nullptr, cublas_handle);
+    curandGenerator_t curand_handle = device_context->curand_generator();
+    ASSERT_NE(nullptr, curand_handle);
+    delete device_context;
+  }
+}
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index 9f829b70128655f018c59db32b95d3f1789da5fc..d205ead84598e04eea523be32139959a02e0dd83 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc)
diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e3dfdaefb2348346e8f917b1f6c758bf6d91a1a
--- /dev/null
+++ b/paddle/platform/dynload/cublas.cc
@@ -0,0 +1,15 @@
+#include <paddle/platform/dynload/cublas.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cublas_dso_flag;
+void *cublas_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h
index 258cc88031a71e9fee65b5445bd5537d6782e226..c44b7240a885c2ef71e550df645dbaded69f9944 100644
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -23,8 +23,8 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
+extern std::once_flag cublas_dso_flag;
+extern void *cublas_dso_handle;
 
 /**
  * The following macro definition can generate structs
@@ -34,10 +34,10 @@ void *cublas_dso_handle = nullptr;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                    \
   struct DynLoad__##__name {                                        \
     template <typename... Args>                                     \
-    cublasStatus_t operator()(Args... args) {                       \
+    inline cublasStatus_t operator()(Args... args) {                \
       typedef cublasStatus_t (*cublasFunc)(Args...);                \
       std::call_once(cublas_dso_flag,                               \
                      paddle::platform::dynload::GetCublasDsoHandle, \
@@ -45,62 +45,46 @@ void *cublas_dso_handle = nullptr;
       void *p_##__name = dlsym(cublas_dso_handle, #__name);         \
       return reinterpret_cast<cublasFunc>(p_##__name)(args...);     \
     }                                                               \
-  } __name;  // struct DynLoad__##__name
+  };                                                                \
+  extern DynLoad__##__name __name
 #else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    cublasStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name;  // struct DynLoad__##__name
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+  struct DynLoad__##__name {                     \
+    inline template <typename... Args>           \
+    cublasStatus_t operator()(Args... args) {    \
+      return __name(args...);                    \
+    }                                            \
+  };                                             \
+  extern DynLoad__##__name __name
 #endif
 
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
+  DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
 
-// include all needed cublas functions in HPPL
-// clang-format off
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSgemv)                    \
-  __macro(cublasDgemv)                    \
-  __macro(cublasSgemm)                    \
-  __macro(cublasDgemm)                    \
-  __macro(cublasSgeam)                    \
-  __macro(cublasDgeam)                    \
+  __macro(cublasSgemv);                   \
+  __macro(cublasDgemv);                   \
+  __macro(cublasSgemm);                   \
+  __macro(cublasDgemm);                   \
+  __macro(cublasSgeam);                   \
+  __macro(cublasDgeam);                   \
+  __macro(cublasCreate_v2);               \
+  __macro(cublasDestroy_v2);              \
+  __macro(cublasSetStream_v2);            \
+  __macro(cublasSetPointerMode_v2);       \
+  __macro(cublasGetPointerMode_v2);       \
+  __macro(cublasSgemmBatched);            \
+  __macro(cublasDgemmBatched);            \
+  __macro(cublasCgemmBatched);            \
+  __macro(cublasZgemmBatched);            \
+  __macro(cublasSgetrfBatched);           \
+  __macro(cublasSgetriBatched);           \
+  __macro(cublasDgetrfBatched);           \
+  __macro(cublasDgetriBatched)
 
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
-CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);
 
-#undef DYNAMIC_LOAD_CUBLAS_WRAP
-#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
-#undef CUBLAS_BLAS_ROUTINE_EACH
-
-// clang-format on
-#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam
-#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv
-#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm
-#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched
-#else
-#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam
-#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv
-#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm
-#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched
-#endif
+#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8b5e15b5efcdae6a1eed09f002eb2f4f2163035f
--- /dev/null
+++ b/paddle/platform/dynload/cudnn.cc
@@ -0,0 +1,28 @@
+#include <paddle/platform/dynload/cudnn.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cudnn_dso_flag;
+void* cudnn_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
+CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_R5
+CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
+#endif
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index 0a9562c573cdfe059ef7caa39ba62efa87225e41..ef0dd85b083dc2335dd5c70d3dc5f59eda25daeb 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -23,12 +23,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
+extern std::once_flag cudnn_dso_flag;
+extern void* cudnn_dso_handle;
 
 #ifdef PADDLE_USE_DSO
 
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
   struct DynLoad__##__name {                                       \
     template <typename... Args>                                    \
     auto operator()(Args... args) -> decltype(__name(args...)) {   \
@@ -39,17 +39,19 @@ void* cudnn_dso_handle = nullptr;
       void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
       return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
     }                                                              \
-  } __name; /* struct DynLoad__##__name */
+  };                                                               \
+  extern struct DynLoad__##__name __name
 
 #else
 
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                  \
   struct DynLoad__##__name {                                     \
     template <typename... Args>                                  \
     auto operator()(Args... args) -> decltype(__name(args...)) { \
       return __name(args...);                                    \
     }                                                            \
-  } __name; /* struct DynLoad__##__name */
+  };                                                             \
+  extern DynLoad__##__name __name
 
 #endif
 
@@ -57,80 +59,73 @@ void* cudnn_dso_handle = nullptr;
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
  **/
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor)                     \
-  __macro(cudnnSetTensor4dDescriptorEx)                   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
-  __macro(cudnnCreateTensorDescriptor)                    \
-  __macro(cudnnDestroyTensorDescriptor)                   \
-  __macro(cudnnCreateFilterDescriptor)                    \
-  __macro(cudnnSetFilter4dDescriptor)                     \
-  __macro(cudnnSetPooling2dDescriptor)                    \
-  __macro(cudnnDestroyFilterDescriptor)                   \
-  __macro(cudnnCreateConvolutionDescriptor)               \
-  __macro(cudnnCreatePoolingDescriptor)                   \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
-  __macro(cudnnSetConvolution2dDescriptor)                \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
-  __macro(cudnnCreate)                                    \
-  __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
-  __macro(cudnnPoolingForward)                            \
-  __macro(cudnnPoolingBackward)                           \
-  __macro(cudnnSoftmaxBackward)                           \
-  __macro(cudnnSoftmaxForward)                            \
-  __macro(cudnnGetVersion)                                \
-  __macro(cudnnGetErrorString)
-CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
-
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor)                                 \
-  __macro(cudnnConvolutionBackwardData)                   \
-  __macro(cudnnConvolutionBackwardFilter)
-CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
+#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
+  __macro(cudnnSetTensor4dDescriptor);              \
+  __macro(cudnnSetTensor4dDescriptorEx);            \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
+  __macro(cudnnGetConvolutionForwardAlgorithm);     \
+  __macro(cudnnCreateTensorDescriptor);             \
+  __macro(cudnnDestroyTensorDescriptor);            \
+  __macro(cudnnCreateFilterDescriptor);             \
+  __macro(cudnnSetFilter4dDescriptor);              \
+  __macro(cudnnSetPooling2dDescriptor);             \
+  __macro(cudnnDestroyFilterDescriptor);            \
+  __macro(cudnnCreateConvolutionDescriptor);        \
+  __macro(cudnnCreatePoolingDescriptor);            \
+  __macro(cudnnDestroyPoolingDescriptor);           \
+  __macro(cudnnSetConvolution2dDescriptor);         \
+  __macro(cudnnDestroyConvolutionDescriptor);       \
+  __macro(cudnnCreate);                             \
+  __macro(cudnnDestroy);                            \
+  __macro(cudnnSetStream);                          \
+  __macro(cudnnActivationForward);                  \
+  __macro(cudnnConvolutionForward);                 \
+  __macro(cudnnConvolutionBackwardBias);            \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
+  __macro(cudnnTransformTensor);                    \
+  __macro(cudnnPoolingForward);                     \
+  __macro(cudnnPoolingBackward);                    \
+  __macro(cudnnSoftmaxBackward);                    \
+  __macro(cudnnSoftmaxForward);                     \
+  __macro(cudnnGetVersion);                         \
+  __macro(cudnnGetErrorString);
+CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
+  __macro(cudnnAddTensor);                 \
+  __macro(cudnnConvolutionBackwardData);   \
+  __macro(cudnnConvolutionBackwardFilter);
+CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)           \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);       \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);     \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
-
 // APIs available after R4:
 #if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
-  __macro(cudnnBatchNormalizationForwardTraining)            \
-  __macro(cudnnBatchNormalizationForwardInference)           \
-  __macro(cudnnBatchNormalizationBackward)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
+  __macro(cudnnBatchNormalizationForwardTraining);  \
+  __macro(cudnnBatchNormalizationForwardInference); \
+  __macro(cudnnBatchNormalizationBackward);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 // APIs in R5
 #if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnDestroyActivationDescriptor)
-CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
+#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)  \
+  __macro(cudnnCreateActivationDescriptor); \
+  __macro(cudnnSetActivationDescriptor);    \
+  __macro(cudnnGetActivationDescriptor);    \
+  __macro(cudnnDestroyActivationDescriptor);
+CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
-#undef CUDNN_DNN_ROUTINE_EACH
-// clang-format on
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c1fab992c98569d4a95b6e699d97d428511e48e
--- /dev/null
+++ b/paddle/platform/dynload/curand.cc
@@ -0,0 +1,15 @@
+#include <paddle/platform/dynload/curand.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag curand_dso_flag;
+void *curand_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
+}
+}
+}
\ No newline at end of file
diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h
index 9dc0a25c0fbdc3f73f1dd82206e8940972b0b7f5..d8c46bc41e18d013a80cd0a9116a4b1a52bf5854 100644
--- a/paddle/platform/dynload/curand.h
+++ b/paddle/platform/dynload/curand.h
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag curand_dso_flag;
-void *curand_dso_handle = nullptr;
+extern std::once_flag curand_dso_flag;
+extern void *curand_dso_handle;
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
   struct DynLoad__##__name {                                        \
     template <typename... Args>                                     \
     curandStatus_t operator()(Args... args) {                       \
@@ -36,32 +36,29 @@ void *curand_dso_handle = nullptr;
       void *p_##__name = dlsym(curand_dso_handle, #__name);         \
       return reinterpret_cast<curandFunc>(p_##__name)(args...);     \
     }                                                               \
-  } __name; /* struct DynLoad__##__name */
+  };                                                                \
+  extern DynLoad__##__name __name
 #else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    curandStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name; /* struct DynLoad__##__name */
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
+  struct DynLoad__##__name {                     \
+    template <typename... Args>                  \
+    curandStatus_t operator()(Args... args) {    \
+      return __name(args...);                    \
+    }                                            \
+  };                                             \
+  extern DynLoad__##__name __name
 #endif
 
-/* include all needed curand functions in HPPL */
-// clang-format off
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
-  __macro(curandCreateGenerator)             \
-  __macro(curandSetStream)                   \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
-  __macro(curandGenerateUniform)             \
-  __macro(curandGenerateUniformDouble)       \
-  __macro(curandDestroyGenerator)
-// clang-format on
+#define CURAND_RAND_ROUTINE_EACH(__macro)      \
+  __macro(curandCreateGenerator);              \
+  __macro(curandSetStream);                    \
+  __macro(curandSetPseudoRandomGeneratorSeed); \
+  __macro(curandGenerateUniform);              \
+  __macro(curandGenerateUniformDouble);        \
+  __macro(curandDestroyGenerator);
 
-CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
+CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
 
-#undef CURAND_RAND_ROUTINE_EACH
-#undef DYNAMIC_LOAD_CURAND_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/dynamic_loader.cc b/paddle/platform/dynload/dynamic_loader.cc
index dd914e006d54c423ffea56ffaaafe7dcba416361..ae9a0a982c73de05821579d22b7f9ad99f24a92b 100644
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <string>
 #include "gflags/gflags.h"
 #include "glog/logging.h"
-#include "paddle/framework/enforce.h"
+#include "paddle/platform/enforce.h"
 
 DEFINE_string(cudnn_dir, "",
               "Specify path for loading libcudnn.so. For instance, "
diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a42c777d1c2ebbc22fdb77b1100cc6fcf7ff35
--- /dev/null
+++ b/paddle/platform/enforce.h
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <execinfo.h>
+#include <paddle/string/printf.h>
+#include <iomanip>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#ifndef PADDLE_ONLY_CPU
+
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
+
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+#endif  // PADDLE_ONLY_CPU
+
+namespace paddle {
+namespace platform {
+
+struct EnforceNotMet : public std::exception {
+  std::exception_ptr exp_;
+  std::string err_str_;
+  EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) {
+    static constexpr int TRACE_STACK_LIMIT = 100;
+    try {
+      std::rethrow_exception(exp_);
+    } catch (const std::exception& exp) {
+      std::ostringstream sout;
+      sout << string::Sprintf("%s at [%s:%d]", exp.what(), f, l) << std::endl;
+      sout << "Call Stacks: " << std::endl;
+      void* call_stack[TRACE_STACK_LIMIT];
+      int sz = backtrace(call_stack, TRACE_STACK_LIMIT);
+      auto line = backtrace_symbols(call_stack, sz);
+      for (int i = 0; i < sz; ++i) {
+        sout << line[i] << std::endl;
+      }
+      free(line);
+      err_str_ = sout.str();
+    }
+  }
+
+  const char* what() const noexcept { return err_str_.c_str(); }
+};
+
+// Because most enforce conditions would evaluate to true, we can use
+// __builtin_expect to instruct the C++ compiler to generate code that
+// always forces branch prediction of true.
+// This generates faster binary code. __builtin_expect is since C++11.
+// For more details, please check https://stackoverflow.com/a/43870188/724872.
+#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    int stat, const Args&... args) {
+  if (UNLIKELY(!(stat))) {
+    throw std::runtime_error(string::Sprintf(args...));
+  }
+}
+
+#ifndef PADDLE_ONLY_CPU
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudaError_t e, const Args&... args) {
+  if (UNLIKELY(e)) {
+    throw thrust::system_error(e, thrust::cuda_category(),
+                               string::Sprintf(args...));
+  }
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    curandStatus_t stat, const Args&... args) {
+  if (stat != CURAND_STATUS_SUCCESS) {
+    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+                               string::Sprintf(args...));
+  }
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudnnStatus_t stat, const Args&... args) {
+  if (stat == CUDNN_STATUS_SUCCESS) {
+    return;
+  } else {
+    throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) +
+                             string::Sprintf(args...));
+  }
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cublasStatus_t stat, const Args&... args) {
+  std::string err;
+  if (stat == CUBLAS_STATUS_SUCCESS) {
+    return;
+  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
+    err = "CUBLAS: not initialized, ";
+  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
+    err = "CUBLAS: alloc failed, ";
+  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
+    err = "CUBLAS: invalid value, ";
+  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
+    err = "CUBLAS: arch mismatch, ";
+  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
+    err = "CUBLAS: mapping error, ";
+  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
+    err = "CUBLAS: execution failed, ";
+  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
+    err = "CUBLAS: internal error, ";
+  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
+    err = "CUBLAS: not supported, ";
+  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
+    err = "CUBLAS: license error, ";
+  }
+  throw std::runtime_error(err + string::Sprintf(args...));
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+template <typename T>
+inline void throw_on_error(T e) {
+  throw_on_error(e, "");
+}
+
+#define PADDLE_THROW(...)                                              \
+  do {                                                                 \
+    throw ::paddle::platform::EnforceNotMet(                           \
+        std::make_exception_ptr(                                       \
+            std::runtime_error(paddle::string::Sprintf(__VA_ARGS__))), \
+        __FILE__, __LINE__);                                           \
+  } while (0)
+
+#define PADDLE_ENFORCE(...)                                             \
+  do {                                                                  \
+    try {                                                               \
+      ::paddle::platform::throw_on_error(__VA_ARGS__);                  \
+    } catch (...) {                                                     \
+      throw ::paddle::platform::EnforceNotMet(std::current_exception(), \
+                                              __FILE__, __LINE__);      \
+    }                                                                   \
+  } while (0)
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/framework/enforce_test.cc b/paddle/platform/enforce_test.cc
similarity index 85%
rename from paddle/framework/enforce_test.cc
rename to paddle/platform/enforce_test.cc
index f8da1a192f63a54324d80725c9d2f156fb11a481..2ac31812a80d8dd57ce82234cb5835e029a46067 100644
--- a/paddle/framework/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -9,8 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <gtest/gtest.h>
-#include <paddle/framework/enforce.h>
+#include "paddle/platform/enforce.h"
+#include "gtest/gtest.h"
 
 TEST(ENFORCE, OK) {
   PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
@@ -23,13 +23,14 @@ TEST(ENFORCE, FAILED) {
   bool in_catch = false;
   try {
     PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
-  } catch (paddle::framework::EnforceNotMet err) {
+  } catch (paddle::platform::EnforceNotMet error) {
+    // your error handling code here
     in_catch = true;
     std::string msg = "Enforce is not ok 123 at all";
-    const char* what = err.what();
+    const char* what = error.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
     }
   }
   ASSERT_TRUE(in_catch);
-}
\ No newline at end of file
+}
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..edeb3ecd7bf8b87333813eee5b40f71030f6609f
--- /dev/null
+++ b/paddle/platform/gpu_info.cc
@@ -0,0 +1,109 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/gpu_info.h"
+#include "gflags/gflags.h"
+#include "paddle/platform/enforce.h"
+
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
+              "Default use 95% of GPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+int GetDeviceCount() {
+  int count;
+  PADDLE_ENFORCE(
+      cudaGetDeviceCount(&count),
+      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
+  return count;
+}
+
+int GetCurrentDeviceId() {
+  int device_id;
+  PADDLE_ENFORCE(
+      cudaGetDevice(&device_id),
+      "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId");
+  return device_id;
+}
+
+void SetDeviceId(int id) {
+  PADDLE_ENFORCE(cudaSetDevice(id),
+                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
+}
+
+void GpuMemoryUsage(size_t &available, size_t &total) {
+  PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
+                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
+}
+
+size_t GpuMaxAllocSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserve the rest for page tables, etc.
+  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+size_t GpuMaxChunkSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserving the rest memory for page tables, etc.
+  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+
+  // If available less than minimum chunk size, no usable memory exists.
+  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+
+  // If available less than reserving, no usable memory exists.
+  size_t usable = std::max(available, reserving) - reserving;
+
+  return usable;
+}
+
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  //       block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(
+      cudaStreamSynchronize(0),
+      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
+}
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/gpu_info.h b/paddle/platform/gpu_info.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3a5f5f13fdd3dd59eb43465da4a64b0d8d95e5b
--- /dev/null
+++ b/paddle/platform/gpu_info.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <cuda_runtime.h>
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the total number of GPU devices in system.
+int GetDeviceCount();
+
+//! Get the current GPU device id in system.
+int GetCurrentDeviceId();
+
+//! Set the GPU device id for next execution.
+void SetDeviceId(int device_id);
+
+//！Get the memory usage of current GPU device.
+void GpuMemoryUsage(size_t &available, size_t &total);
+
+//! Get the maximum allocation size of current GPU device.
+size_t GpuMaxAllocSize();
+
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
+
+//! Get the maximum chunk size for GPU buddy allocator.
+size_t GpuMaxChunkSize();
+
+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
+}  // namespace platform
+}  // namespace paddle
+
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc
index 0704820aa05079401eb56814d689d6e280311edb..b31515e1f028acac885a506ff1c20479407a05e3 100644
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
@@ -1,3 +1,17 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/platform/place.h"
 
 namespace paddle {
@@ -7,7 +21,7 @@ namespace detail {
 
 class PlacePrinter : public boost::static_visitor<> {
  public:
-  PlacePrinter(std::ostream &os) : os_(os) {}
+  explicit PlacePrinter(std::ostream &os) : os_(os) {}
   void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
   void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }
 
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..29dd0ded0ac75893da7e244d92725cd5e285efce
--- /dev/null
+++ b/paddle/pybind/CMakeLists.txt
@@ -0,0 +1,9 @@
+cc_library(paddle_pybind SHARED
+    SRCS pybind.cc
+    DEPS pybind python backward
+	fc_op
+	sgd_op
+	add_op
+	mean_op
+	cross_entropy_op
+	recurrent_op)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
new file mode 100644
index 0000000000000000000000000000000000000000..40ff164497f627c0b562b6d33bfb4bec590e4c85
--- /dev/null
+++ b/paddle/pybind/pybind.cc
@@ -0,0 +1,232 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <Python.h>
+#include <fstream>
+#include <vector>
+
+#include "paddle/framework/backward.h"
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/scope.h"
+#include "paddle/platform/enforce.h"
+#include "paddle/platform/place.h"
+#include "paddle/pybind/tensor_bind.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
+namespace py = pybind11;
+namespace pd = paddle::framework;
+
+USE_OP(add_two);
+USE_OP(onehot_cross_entropy);
+USE_OP_WITHOUT_KERNEL(fc);
+USE_OP(sgd);
+USE_OP(mul);
+USE_OP(mean);
+USE_OP(sigmoid);
+USE_OP(softmax);
+USE_OP(rowwise_add);
+USE_OP_WITHOUT_KERNEL(recurrent_op);
+
+template <typename ClassType>
+void ExposeOperator(ClassType& m) {
+  m.def("infer_shape", &ClassType::type::InferShape)
+      .def("run", &ClassType::type::Run)
+      .def("type",
+           [](const typename ClassType::type& op) -> std::string {
+             return op.type_;
+           })
+      .def("outputs",
+           [](const typename ClassType::type& op) -> std::vector<std::string> {
+             return op.outputs_;
+           })
+      .def("__str__", &ClassType::type::DebugString);
+}
+
+static size_t UniqueIntegerGenerator() {
+  static std::atomic<size_t> generator;
+  return generator.fetch_add(1);
+}
+
+bool IsCompileGPU() {
+#ifdef PADDLE_ONLY_CPU
+  return false;
+#else
+  return true;
+#endif
+}
+
+PYBIND11_PLUGIN(core) {
+  py::module m("core", "C++ core of PaddlePaddle");
+
+  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
+        return paddle::pybind::CastToPyBuffer(self);
+      })
+      .def("get_dims",
+           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
+      .def("set_dims",
+           [](pd::Tensor& self, const std::vector<int>& dim) {
+             self.Resize(pd::make_ddim(dim));
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<float>(place);
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self, paddle::platform::CPUPlace& place) {
+             self.mutable_data<int>(place);
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self, paddle::platform::GPUPlace& place) {
+             self.mutable_data<int>(place);
+           })
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCPUTensorSetFromArray<int>)
+#ifndef PADDLE_ONLY_CPU
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyCUDATensorSetFromArray<int>)
+#endif
+      .def("shape",
+           [](pd::Tensor& self) { return pd::vectorize(self.dims()); });
+
+  py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.
+
+All parameter, weight, gradient are variables in Paddle.
+)DOC")
+      .def("is_int", [](const pd::Variable& var) { return var.IsType<int>(); })
+      .def("set_int",
+           [](pd::Variable& var, int val) -> void {
+             *var.GetMutable<int>() = val;
+           })
+      .def("get_int",
+           [](const pd::Variable& var) -> int { return var.Get<int>(); })
+      .def("get_tensor",
+           [](pd::Variable& self) -> pd::Tensor* {
+             return self.GetMutable<pd::Tensor>();
+           },
+           py::return_value_policy::reference)
+      .def("get_net",
+           [](pd::Variable& self) -> pd::NetOp* {
+             return self.GetMutable<pd::NetOp>();
+           },
+           py::return_value_policy::reference);
+
+  py::class_<pd::Scope>(m, "Scope", "")
+      .def("new_var",
+           [](pd::Scope& self, const std::string& name) -> pd::Variable* {
+             return self.NewVar(name);
+           },
+           py::return_value_policy::reference)
+      .def("find_var", &pd::Scope::FindVar, py::return_value_policy::reference)
+      .def(py::init<>())
+      .def("new_scope",
+           [](pd::Scope& self) -> pd::Scope* { return &self.NewScope(); },
+           py::return_value_policy::reference)
+      .def("drop_kids", &pd::Scope::DropKids);
+
+  //! @note: Be careful! PyBind will return std::string as an unicode, not
+  //! Python str. If you want a str object, you should cast them in Python.
+  m.def("get_all_op_protos", []() -> std::vector<py::bytes> {
+    auto& protos = pd::OpRegistry::protos();
+    std::vector<py::bytes> ret_values;
+    for (auto it = protos.begin(); it != protos.end(); ++it) {
+      PADDLE_ENFORCE(it->second.IsInitialized(),
+                     "OpProto must all be initialized");
+      std::string str;
+      PADDLE_ENFORCE(it->second.SerializeToString(&str),
+                     "Serialize OpProto Error. This could be a bug of Paddle.");
+      ret_values.push_back(py::bytes(str));
+    }
+    return ret_values;
+  });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable name in Paddle")
+      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+  // clang-format off
+  py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
+      .def_static("create",
+                  [](paddle::platform::CPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+                    return new paddle::platform::CPUDeviceContext();
+                  })
+      .def_static("create",
+                  [](paddle::platform::GPUPlace& place)
+                      -> paddle::platform::DeviceContext* {
+#ifdef PADDLE_ONLY_CPU
+                    PADDLE_THROW("GPUPlace is not supported in CPU device.");
+#else
+                    return new paddle::platform::CUDADeviceContext(place);
+#endif
+                  });
+  // clang-format on
+
+  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
+
+  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
+      m, "Operator");
+
+  operator_base.def_static("create", [](py::bytes protobin) {
+    pd::OpDesc desc;
+    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                   "Cannot parse user input to OpDesc");
+    PADDLE_ENFORCE(desc.IsInitialized(),
+                   "User OpDesc is not initialized, reason %s",
+                   desc.InitializationErrorString());
+    return pd::OpRegistry::CreateOp(desc);
+  });
+
+  operator_base.def("backward",
+                    [](const pd::OperatorBase& forwardOp,
+                       const std::unordered_set<std::string>& no_grad_vars) {
+                      return pd::Backward(forwardOp, no_grad_vars);
+                    });
+
+  ExposeOperator(operator_base);
+
+  py::class_<pd::NetOp, std::shared_ptr<pd::NetOp>> net(m, "Net");
+
+  net.def_static("create",
+                 []() -> std::shared_ptr<pd::NetOp> {
+                   auto retv = std::make_shared<pd::NetOp>();
+                   retv->type_ = "plain_net";
+                   return retv;
+                 })
+      .def("add_op", &pd::NetOp::AddOp)
+      .def("add_op",
+           [](pd::NetOp& self, const std::shared_ptr<pd::NetOp>& net) -> void {
+             self.AddOp(std::static_pointer_cast<pd::OperatorBase>(net));
+           })
+      .def("complete_add_op", &pd::NetOp::CompleteAddOp)
+      .def("complete_add_op",
+           [](std::shared_ptr<pd::NetOp>& self) { self->CompleteAddOp(); });
+  ExposeOperator(net);
+
+  m.def("unique_integer", UniqueIntegerGenerator);
+
+  m.def("is_compile_gpu", IsCompileGPU);
+
+  return m.ptr();
+}
diff --git a/paddle/pybind/tensor_bind.h b/paddle/pybind/tensor_bind.h
new file mode 100644
index 0000000000000000000000000000000000000000..def37219ccefd5435f1212c4e4daac5a351d76f4
--- /dev/null
+++ b/paddle/pybind/tensor_bind.h
@@ -0,0 +1,119 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <string>
+#include "paddle/framework/tensor.h"
+#include "paddle/memory/memcpy.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+
+namespace py = pybind11;
+
+namespace paddle {
+
+namespace pybind {
+
+namespace details {
+
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<false, I, ARGS...> {
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    PADDLE_THROW("This type of tensor cannot be expose to Python");
+    return py::buffer_info();
+  }
+};
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<true, I, ARGS...> {
+  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+      auto dim_vec = framework::vectorize(tensor.dims());
+      std::vector<size_t> dims_outside;
+      std::vector<size_t> strides;
+      dims_outside.resize(dim_vec.size());
+      strides.resize(dim_vec.size());
+
+      size_t prod = 1;
+      for (size_t i = dim_vec.size(); i != 0; --i) {
+        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
+        strides[i - 1] = sizeof(CUR_TYPE) * prod;
+        prod *= dims_outside[i - 1];
+      }
+      framework::Tensor dst_tensor;
+      if (paddle::platform::is_gpu_place(tensor.holder_->place())) {
+        dst_tensor.CopyFrom<CUR_TYPE>(tensor, platform::CPUPlace());
+      } else if (paddle::platform::is_cpu_place(tensor.holder_->place())) {
+        dst_tensor = tensor;
+      }
+      return py::buffer_info(
+          dst_tensor.mutable_data<CUR_TYPE>(dst_tensor.holder_->place()),
+          sizeof(CUR_TYPE),
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(dst_tensor.dims()),
+          dims_outside,
+          strides);
+    } else {
+      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
+      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
+    }
+  }
+};
+}  // namespace details
+inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+  auto buffer_info = details::CastToPyBufferImpl<true, 0, float, int>()(tensor);
+  return buffer_info;
+}
+
+template <typename T>
+void PyCPUTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::CPUPlace &place) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+  std::memcpy(dst, array.data(), sizeof(T) * array.size());
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <typename T>
+void PyCUDATensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array,
+    paddle::platform::GPUPlace &place) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.Resize(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(place);
+  paddle::platform::GpuMemcpySync(
+      dst, array.data(), sizeof(T) * array.size(), cudaMemcpyHostToDevice);
+}
+#endif
+
+}  // namespace pybind
+}  // namespace paddle
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ab60f1a38dd4cd1d9799c0019dccae5f1c7d4310..69ae0ea2d72c199a8e17c0595693e5e0b2f79ee1 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -148,14 +148,15 @@ cat >> /paddle/build/Dockerfile <<EOF
 ADD *.deb /
 # run paddle version to install python packages first
 RUN apt-get update &&\
-    apt-get install -y python-pip && pip install -U pip && \
+    apt-get install -y wget python-pip && pip install -U pip && \
     dpkg -i /*.deb ; apt-get install -f -y && \
     apt-get clean -y && \
     rm -f /*.deb && \
     paddle version
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
-
+ADD go/cmd/pserver/pserver /usr/bin/
+ADD go/cmd/master/master /usr/bin/
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh
index bfa10c91553563bddac8c1b41bf21490fb89d3cf..56d290be4ab04a9f6974023159aa8571d27f8dd5 100644
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,9 +2,9 @@
 
 set -xe
 
-mkdir -p /paddle/build
-cd /paddle/build
-rm -f /paddle/install 2>/dev/null || true
+mkdir -p /paddle/build_android
+cd /paddle/build_android
+rm -rf /paddle/install 2>/dev/null || true
 cmake -DCMAKE_SYSTEM_NAME=Android \
       -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
       -DANDROID_ABI=armeabi-v7a \
@@ -21,6 +21,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
       ..
 make -j `nproc`
 make install
-
-export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH
-paddle version
diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh
new file mode 100755
index 0000000000000000000000000000000000000000..004067a8f55351509caaf2bbf6d5c349a4698a79
--- /dev/null
+++ b/paddle/scripts/travis/build_android.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+set -e
+
+ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc
+TMP_DIR=$HOME/$JOB/tmp
+mkdir -p $TMP_DIR
+cd $TMP_DIR
+wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip
+unzip -q android-ndk-r14b-linux-x86_64.zip
+chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh
+$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN
+cd $HOME
+rm -rf $TMP_DIR
+
+# Create the build directory for CMake.
+mkdir -p $TRAVIS_BUILD_DIR/build_android
+cd $TRAVIS_BUILD_DIR/build_android
+
+# Compile paddle binaries
+cmake -DCMAKE_SYSTEM_NAME=Android \
+      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
+      -DANDROID_ABI=armeabi-v7a \
+      -DANDROID_ARM_NEON=ON \
+      -DANDROID_ARM_MODE=ON \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_STYLE_CHECK=OFF \
+      ..
+
+make -j `nproc`
diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh
index 4754bdd4c80de9700d92b0e33ecfdfc582f42813..ec499a839ac6593bac788f4cca5e33afbed73010 100755
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 function abort(){
     echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to reformat your code and git push again." 1>&2
+    echo "Please use pre-commit to check what is wrong." 1>&2
     exit 1
 }
 
@@ -13,8 +13,14 @@ export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
 
+# set up go environment for running gometalinter
+mkdir -p $GOPATH/src/github.com/PaddlePaddle/
+ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
+cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
+
 if ! pre-commit run -a ; then
-  git diff  --exit-code
+    git diff
+    exit 1
 fi
 
 trap : 0
diff --git a/paddle/string/piece.h b/paddle/string/piece.h
index db7c3e69804a6a8f0510ba376432fe560ae74442..0272529d1c9b2cb6000a26f1d4d80276d06bf27b 100644
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -35,7 +35,7 @@ public:
 
   // We provide non-explicit singleton constructors so users can
   // pass in a "const char*" or a "string" wherever a "Piece"
-  // is expected.  These contructors ensure that if data_ is NULL,
+  // is expected.  These constructors ensure that if data_ is NULL,
   // size_ is 0.
   Piece();
   Piece(const char* d, size_t n);
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
index f25ce2f7f06f6da0feab27da61b8e49689cbe213..e1558e3fdfbcf296be0ee64202132f53bf901be9 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -22,11 +22,23 @@ DECLARE_string(save_dir);
 namespace paddle {
 NewRemoteParameterUpdater::NewRemoteParameterUpdater(
     const OptimizationConfig &config, const std::string pserverSpec)
-    : parameterClient_(-1),
+    : trainerConfig_(config),
+      parameterClient_(-1),
       newParameters_(nullptr),
       newGradients_(nullptr),
       pserverSpec_(pserverSpec) {}
 
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config,
+    const std::string pserverSpec,
+    const bool useEtcd)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec),
+      useEtcd_(useEtcd) {}
+
 void NewRemoteParameterUpdater::init(
     const std::vector<ParameterPtr> &parameters) {
   ParameterUpdater::init(parameters);
@@ -37,8 +49,13 @@ void NewRemoteParameterUpdater::init(
   }
 
   // create parameter server client.
-  parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
-                                               FLAGS_trainer_id == 0);
+  if (useEtcd_) {
+    parameterClient_ = paddle_new_etcd_pserver_client(
+        (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0);
+  } else {
+    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                                 FLAGS_trainer_id == 0);
+  }
 
   // init new parameter and gradient.
   newParameters_ = initNewParameter(PARAMETER_VALUE);
@@ -51,7 +68,26 @@ void NewRemoteParameterUpdater::init(
     LOG(INFO) << "paddle_begin_init_params start";
     for (int i = 0; i < parameterSize(); ++i) {
       auto paramConfig = parameters_[i]->getConfig();
-      std::string bytes = paramConfig.SerializeAsString();
+      LOG(INFO) << "old param config: " << paramConfig.DebugString();
+      // FIXME(typhoonzero): convert old paramConfig to optimizerConfig
+      OptimizerConfig optimizeConfigV2;
+      auto sgdConfigV2 = optimizeConfigV2.mutable_sgd();
+      sgdConfigV2->set_momentum(paramConfig.momentum());
+      sgdConfigV2->set_decay(paramConfig.decay_rate());
+      optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      auto constlr = optimizeConfigV2.mutable_const_lr();
+      if (paramConfig.has_learning_rate()) {
+        constlr->set_learning_rate(paramConfig.learning_rate());
+      } else {
+        constlr->set_learning_rate(trainerConfig_.learning_rate());
+      }
+      if (trainerConfig_.algorithm() == "sgd") {
+        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+        // FIXME: config all algorithms
+      } else {
+        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+      }
+      std::string bytes = optimizeConfigV2.SerializeAsString();
       const char *array = bytes.data();
       int size = (int)bytes.size();
       paddle_init_param(
@@ -83,4 +119,4 @@ void NewRemoteParameterUpdater::finishBatch(real cost) {
 void NewRemoteParameterUpdater::startPass() {}
 
 bool NewRemoteParameterUpdater::finishPass() { return true; }
-}
+}  // namespace paddle
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
index f735185f62b3491a63e34cfc4a2ef73dae12243e..6223ba427c9b94494c2bee8f0847442f1b0574c9 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <functional>
 #include <thread>
+#include "OptimizerConfig.pb.h"
 #include "ParameterUpdater.h"
 #include "libpaddle_pserver_cclient.h"
 #include "paddle/pserver/ParameterClient2.h"
@@ -31,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater {
 public:
   NewRemoteParameterUpdater(const OptimizationConfig& config,
                             const std::string pserverSpec);
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec,
+                            const bool useEtcd);
   ~NewRemoteParameterUpdater() {
     releaseNewParameter(newParameters_);
     releaseNewParameter(newGradients_);
@@ -101,6 +105,7 @@ private:
   }
 
 protected:
+  const OptimizationConfig& trainerConfig_;
   /// internal parameter client object for exchanging data with pserver
   paddle_pserver_client parameterClient_;
   /// the parameters for new pserver client
@@ -109,6 +114,8 @@ protected:
   paddle_parameter** newGradients_;
   /// the specification of parameter server "host1:port,host1:port"
   std::string pserverSpec_;
+  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
+  bool useEtcd_;
 };
 
 }  // namespace paddle
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index 60ac8459a12db801321da4a9d9c1d48ac8bd6d16..133e2be104c6fbfddefd8698d2b6aa8315c56c70 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -62,11 +62,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
   m->conf = config;
 }
 
-TrainerConfigHelper::~TrainerConfigHelper() {
-  if (m) {
-    delete m;
-  }
-}
+TrainerConfigHelper::~TrainerConfigHelper() { delete m; }
 
 const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }
 
diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h
index 9b5ad21724afd7176f958619e7e10d12dc08fa49..2e5ff76a06152b6a12818f06baaeaa6a69726ba8 100644
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef DYNAMIC_LOAD_H_
-#define DYNAMIC_LOAD_H_
+#pragma once
 
 #include <dlfcn.h>
 #include <memory>
@@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle);
  *
  */
 void GetLapackDsoHandle(void** dso_handle);
-
-#endif  // DYNAMIC_LOAD_H_
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index 27ddaab3f003110a2684a871a2de17afb473d660..7cde98306026ca1de76089749aaea265d151da33 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -126,9 +126,11 @@ public:
   }
 
   /**
-   * @brief operator bool, return True if there is something error.
+   * @brief check this status by glog.
+   * @note It is a temp method used during cleaning Paddle code. It will be
+   *       removed later.
    */
-  operator bool() const { return !this->isOK(); }
+  void check() const { CHECK(this->isOK()) << msg(); }
 
   /**
    * @brief isOK return True if there is no error.
@@ -136,13 +138,6 @@ public:
    */
   bool isOK() const { return msg_ == nullptr; }
 
-  /**
-   * @brief check this status by glog.
-   * @note It is a temp method used during cleaning Paddle code. It will be
-   *       removed later.
-   */
-  void check() const { CHECK(this->isOK()) << msg(); }
-
 private:
   std::shared_ptr<std::string> msg_;
 };
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index b5e2862546212041a774599ec664b95e56224a07..0a27b8b97b83a9066af23039a317c437ea56777a 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -51,7 +51,7 @@ template <class T>
 class ThreadLocal {
 public:
   ThreadLocal() {
-    CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
+    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
   }
   ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
 
@@ -65,7 +65,7 @@ public:
     if (!p && createLocal) {
       p = new T();
       int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK(ret == 0);
+      CHECK_EQ(ret, 0);
     }
     return p;
   }
@@ -79,7 +79,7 @@ public:
     if (T* q = get(false)) {
       dataDestructor(q);
     }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
   }
 
   /**
@@ -112,7 +112,7 @@ private:
 template <class T>
 class ThreadLocalD {
 public:
-  ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
+  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
   ~ThreadLocalD() {
     pthread_key_delete(threadSpecificKey_);
     for (auto t : threadMap_) {
@@ -127,7 +127,7 @@ public:
     T* p = (T*)pthread_getspecific(threadSpecificKey_);
     if (!p) {
       p = new T();
-      CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
       updateMap(p);
     }
     return p;
@@ -141,7 +141,7 @@ public:
     if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
       dataDestructor(q);
     }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
     updateMap(p);
   }
 
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
index fdf326b17a1c8baa87e2a17fafae253565d1e699..6f311fa6b80191de1e11ce1f63c31b64fe2eeb80 100644
--- a/paddle/utils/tests/test_Error.cpp
+++ b/paddle/utils/tests/test_Error.cpp
@@ -18,17 +18,17 @@ limitations under the License. */
 
 TEST(Error, testAll) {
   paddle::Error error;
-  ASSERT_FALSE(error);
+  ASSERT_TRUE(error.isOK());
   error = paddle::Error("I'm the error");
-  ASSERT_TRUE(error);
+  ASSERT_FALSE(error.isOK());
   ASSERT_STREQ("I'm the error", error.msg());
 
   error = paddle::Error("error2");
-  ASSERT_TRUE(error);
+  ASSERT_FALSE(error.isOK());
   ASSERT_STREQ("error2", error.msg());
 
   int i = 3;
   auto error3 = paddle::Error("error%d", i);
-  ASSERT_TRUE(error3);
+  ASSERT_FALSE(error3.isOK());
   ASSERT_STREQ("error3", error3.msg());
 }
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 37cd16c79890738f6d8966579e15686c653d4df3..b50b73c7e169f3e8ae75322d9a0a3cad5072a9c7 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -198,6 +198,11 @@ message RowConvConfig {
   required uint32 context_length = 1;
 }
 
+message SliceConfig {
+  required uint32 start = 1;
+  required uint32 end = 2;
+}
+
 message ProjectionConfig {
   required string type = 1;
   required string name = 2;
@@ -218,6 +223,10 @@ message ProjectionConfig {
 
   // For pool
   optional PoolConfig pool_conf = 12;
+
+  // For slice
+  // Each slice output is the input[start, end)
+  repeated SliceConfig slices = 13;
 }
 
 message OperatorConfig {
@@ -289,6 +298,11 @@ message DetectionOutputConfig {
   optional uint32 width = 9 [default = 1];
 }
 
+message ClipConfig {
+  required double min = 1;
+  required double max = 2;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -309,6 +323,7 @@ message LayerInputConfig {
   optional RowConvConfig row_conv_conf = 15;
   optional MultiBoxLossConfig multibox_loss_conf = 16;
   optional DetectionOutputConfig detection_output_conf = 17;
+  optional ClipConfig clip_conf = 18;
 }
 
 message LayerConfig {
@@ -472,10 +487,16 @@ message LayerConfig {
   // blank label used in ctc loss
   optional uint32 blank = 52 [default = 0];
 
-  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which 
+  // stride parameter for seqlastins layer, AverageLayer, MaxLayer, which
   // controls the scope of pooling operation. can be set > 0.
   // leave empty or set to -1 to disable this stride pooling.
   optional int32 seq_pool_stride = 53 [default = -1];
+
+  // for crop layer
+  optional int32 axis = 54 [default = 2];
+  repeated uint32 offset = 55;
+  repeated uint32 shape = 56;
+
 }
 
 message EvaluatorConfig {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 13a1802ee3790b1255fc11f5b2053e3342c61914..0171f9d8ccd6045cb876d57684269a2a49e77f96 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -26,10 +26,17 @@ endif(WITH_GOLANG)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
+
+add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+        DEPENDS paddle_pybind)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so)
+
+
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 826ba2834a820d11e69feec5569ef3537194e3c3..9ea69fc5e57636c22fb20d5d97de760b9cc3bcde 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -565,6 +565,35 @@ class IdentityOffsetProjection(Projection):
         return []
 
 
+@config_class
+class SliceProjection(Projection):
+    type = 'slice'
+
+    def __init__(self, input_layer_name, slices, **xargs):
+        super(SliceProjection, self).__init__(input_layer_name, **xargs)
+        input = g_layer_map[input_layer_name]
+        if input.type in ["exconv", "cudnn_conv"]:
+            # the slice operator is for the channel dimension
+            assert input.num_filters is not None
+            channels = input.num_filters
+            image_size = input.size / channels
+            assert slices[len(slices) - 1][1] <= channels
+            for i in xrange(len(slices)):
+                slice = self.proj_conf.slices.add()
+                slice.start = slices[i][0] * image_size
+                slice.end = slices[i][1] * image_size
+                self.size += slice.end - slice.start
+        else:
+            config_assert(False,
+                          'Currently the input should be convolution layer')
+
+    def calc_parameter_size(self, input_size, output_size):
+        return 0
+
+    def calc_parameter_dims(self, input_size, output_size):
+        return []
+
+
 # DotMulProjection performs element-wise multiplication with weight
 @config_class
 class DotMulProjection(Projection):
@@ -1575,7 +1604,13 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 
 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1592,6 +1627,8 @@ class FCLayer(LayerBase):
             self.create_input_parameter(input_index, psize, dims, sparse,
                                         format)
         self.create_bias_parameter(bias, self.config.size)
+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
 
 
 @config_layer('selective_fc')
@@ -1990,6 +2027,23 @@ class PadLayer(LayerBase):
         self.config.size = out_ch * out_h * out_w
 
 
+@config_layer('crop')
+class CropLayer(LayerBase):
+    def __init__(self, name, inputs, axis, offset, shape, **xargs):
+        super(CropLayer, self).__init__(name, 'crop', 0, inputs=inputs, **xargs)
+        self.config.axis = axis
+        self.config.offset.extend(offset)
+        self.config.shape.extend(shape)
+
+        # get channel, width and height from input_0 layer
+        input_layer = self.get_input_layer(0)
+        image_conf = self.config.inputs[0].image_conf
+        image_conf.img_size = input_layer.width
+        image_conf.img_size_y = input_layer.height
+        image_conf.channels = input_layer.size / (input_layer.width *
+                                                  input_layer.height)
+
+
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
     layer_type = 'batch_norm'
@@ -2030,8 +2084,7 @@ class BatchNormLayer(LayerBase):
         # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU.
         # Also based on cudnn version.
         use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \
-            ((not parallel_nn) or self.config.device > -1) and \
-            cudnn_version >= 4007
+                ((not parallel_nn) or self.config.device > -1)
         self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm"
         super(BatchNormLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **xargs)
@@ -2145,6 +2198,20 @@ class RowConvLayer(LayerBase):
         self.create_input_parameter(0, psize, dims)
 
 
+@config_layer('clip')
+class ClipLayer(LayerBase):
+    def __init__(self, name, inputs, min, max, **xargs):
+        super(ClipLayer, self).__init__(name, 'clip', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'ClipLayer must have one and only one input.')
+        config_assert(min < max, 'min must be less than max.')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+        self.config.inputs[0].clip_conf.min = min
+        self.config.inputs[0].clip_conf.max = max
+
+
 # key: cost type
 # value: cost class
 g_cost_map = {}
@@ -2701,6 +2768,16 @@ class SumToOneNormLayer(LayerBase):
         self.set_layer_size(input_layer0.size)
 
 
+@config_layer('row_l2_norm')
+class RowL2NormLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(RowL2NormLayer, self).__init__(
+            name, 'row_l2_norm', 0, inputs=inputs, **xargs)
+        config_assert(len(self.inputs) == 1, 'RowL2NormLayer must have 1 input')
+        input_layer = self.get_input_layer(0)
+        self.set_layer_size(input_layer.size)
+
+
 @config_layer('cos_vm')
 class CosSimVecMatLayer(LayerBase):
     def __init__(self, name, size, inputs, cos_scale=1.0, device=None):
@@ -3194,6 +3271,10 @@ def ParameterHook(type, **kwargs):
         if sparsity_ratio is not None:
             hook.sparsity_ratio = sparsity_ratio
         return hook
+    elif type == 'dpruning':
+        hook = ParameterUpdaterHookConfig()
+        hook.type = type
+        return hook
     else:
         return None
 
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 9b9f979bb615f37ec1dc9baa154d28741b1400d5..ecba87191045cff6c05014010e60575741238f8d 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -272,7 +272,7 @@ class ExtraLayerAttribute(object):
         for key in self.attr:
             if not hasattr(self, 'can_%s' % key) or \
                     not getattr(self, 'can_%s' % key):
-                raise NotImplementedError("Layer %s cannot support %s" %
+                raise NotImplementedError("Layer %s does not support %s" %
                                           (layer_name, key))
 
     @staticmethod
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index b0524a507bacec6768424045e58bf91305de2d08..ea5fdcc50f6abbc67fb61b7fd56c100d9f9811d0 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -76,6 +76,7 @@ __all__ = [
     'trans_layer',
     'rotate_layer',
     'sum_to_one_norm_layer',
+    'row_l2_norm_layer',
     'get_output_layer',
     'LayerType',
     'context_projection',
@@ -126,6 +127,10 @@ __all__ = [
     'row_conv_layer',
     'dropout_layer',
     'prelu_layer',
+    'gated_unit_layer',
+    'crop_layer',
+    'clip_layer',
+    'slice_projection',
 ]
 
 
@@ -157,6 +162,7 @@ class LayerType(object):
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
     SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
+    ROW_L2_NORM_LAYER = 'row_l2_norm'
     ADDTO_LAYER = 'addto'
 
     CONCAT_LAYER = 'concat'
@@ -217,6 +223,8 @@ class LayerType(object):
     SMOOTH_L1 = 'smooth_l1'
 
     PRELU = 'prelu'
+    CROP_LAYER = 'crop'
+    CLIP_LAYER = 'clip'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -533,6 +541,45 @@ def identity_projection(input, offset=None, size=None):
     return proj
 
 
+def slice_projection(input, slices):
+    """
+    slice_projection can slice the input value into multiple parts,
+    and then select some of them to merge into a new output.
+
+    .. math::
+       output = [input.slices()]
+
+    The example usage is:
+
+    .. code-block:: python
+
+       proj = slice_projection(input=layer, slices=[(0, 10), (20, 30)])
+
+    Note that slice_projection should not have any parameter.
+
+    :param input: Input Layer.
+    :type input: LayerOutput
+    :param slices: An array of slice parameters.
+                   Each slice contains the start and end offsets based
+                   on the input.
+    :type slices: pair of int
+    :return: A SliceProjection object
+    :rtype: SliceProjection
+    """
+    assert len(slices) >= 1
+    start = 0
+    for i in xrange(len(slices)):
+        assert len(slices[i]) == 2
+        # The start position of the next slice needs to be greater than
+        # or equal to the end position of the previous slice.
+        assert slices[i][0] >= start
+        assert slices[i][1] >= slices[i][0]
+        start = slices[i][1]
+    proj = SliceProjection(input_layer_name=input.name, slices=slices)
+    proj.origin = input
+    return proj
+
+
 @wrap_param_attr_default()
 def scaling_projection(input, param_attr=None):
     """
@@ -862,7 +909,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
 
 @wrap_name_default("embedding")
 @wrap_param_attr_default()
-@layer_support(ERROR_CLIPPING)
+@layer_support(ERROR_CLIPPING, DROPOUT)
 def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None):
     """
     Define a embedding Layer.
@@ -1317,7 +1364,7 @@ def pooling_layer(input,
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation())
 @wrap_name_default("lstmemory")
-@layer_support(DROPOUT)
+@layer_support()
 def lstmemory(input,
               name=None,
               size=None,
@@ -1426,7 +1473,7 @@ def lstmemory(input,
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(param_names=["act"], act=TanhActivation())
 @wrap_name_default("gru")
-@layer_support(DROPOUT)
+@layer_support()
 def grumemory(input,
               size=None,
               name=None,
@@ -1790,7 +1837,7 @@ def repeat_layer(input,
 @wrap_name_default("seqreshape")
 @wrap_act_default(act=IdentityActivation())
 @wrap_bias_attr_default(has_bias=False)
-@layer_support()
+@layer_support(ERROR_CLIPPING, DROPOUT)
 def seq_reshape_layer(input,
                       reshape_size,
                       act=None,
@@ -2700,7 +2747,7 @@ def img_cmrnorm_layer(input,
     default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.))
 @wrap_act_default(act=ReluActivation())
 @wrap_name_default("batch_norm")
-@layer_support(DROPOUT)
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def batch_norm_layer(input,
                      act=None,
                      name=None,
@@ -2780,15 +2827,6 @@ def batch_norm_layer(input,
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
-    if not isinstance(act, ReluActivation):
-        logger.log(logging.WARN,
-                   "%s is not recommend for batch normalization's activation, "
-                   "maybe the relu is better" % act.name)
-
-    if not isinstance(input.activation, LinearActivation):
-        logger.log(logging.WARN,
-                   "The activation should be inside batch normalization, the "
-                   "previous layer's activation may be Linear")
 
     if num_channels is None:
         if input.num_filters is not None:
@@ -2855,10 +2893,46 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None):
         name, LayerType.SUM_TO_ONE_NORM_LAYER, parents=[input], size=input.size)
 
 
+@wrap_name_default()
+@layer_support()
+def row_l2_norm_layer(input, name=None, layer_attr=None):
+    """
+    A layer for L2-normalization in each row.
+
+    .. math::
+       out[i] = \frac{in[i]}{\sqrt{\sum_{k=1}^N in[k]^{2}}}
+
+    where the size of :math:`in` is (batchSize x dataDim) ,
+    and the size of :math:`out` is a (batchSize x dataDim) .
+
+    The example usage is:
+
+    .. code-block:: python
+
+       row_l2_norm_layer = row_l2_norm_layer(input=layer)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.ROW_L2_NORM_LAYER,
+        inputs=[input.name],
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_L2_NORM_LAYER, parents=[input], size=input.size)
+
+
 @wrap_name_default("addto")
 @wrap_act_default(act=LinearActivation())
 @wrap_bias_attr_default(has_bias=False)
-@layer_support(DROPOUT)
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
     """
     AddtoLayer.
@@ -2937,7 +3011,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None):
 
 @wrap_act_default(act=IdentityActivation())
 @wrap_name_default("concat")
-@layer_support()
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
     """
     Concat all input vector into one huge vector.
@@ -3021,7 +3095,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
 @wrap_name_default("seqconcat")
 @wrap_act_default(act=IdentityActivation())
 @wrap_bias_attr_default(has_bias=False)
-@layer_support()
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
                      bias_attr=None):
     """
@@ -3170,8 +3244,8 @@ def memory(name,
 
 
 @wrap_bias_attr_default()
-@wrap_act_default(
-    param_names=['gate_act', 'state_act'], act=SigmoidActivation())
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(param_names=['state_act'], act=TanhActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('lstm_step')
 @layer_support()
@@ -3528,12 +3602,7 @@ def SubsequenceInput(input):
 
 
 @wrap_name_default("recurrent_group")
-def recurrent_group(step,
-                    input,
-                    reverse=False,
-                    name=None,
-                    targetInlink=None,
-                    is_generating=False):
+def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
     """
     Recurrent layer group is an extremely flexible recurrent unit in
     PaddlePaddle. As long as the user defines the calculation done within a
@@ -3599,21 +3668,12 @@ def recurrent_group(step,
 
     :type targetInlink: LayerOutput|SubsequenceInput
 
-    :param is_generating: If is generating, none of input type should be LayerOutput;
-                          else, for training or testing, one of the input type must
-                          be LayerOutput.
-
-    :type is_generating: bool
-
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
     model_type('recurrent_nn')
 
-    def is_single_input(x):
-        return isinstance(x, LayerOutput) or isinstance(x, StaticInput)
-
-    if is_single_input(input):
+    if isinstance(input, LayerOutput) or isinstance(input, StaticInput):
         input = [input]
     assert isinstance(input, collections.Sequence)
 
@@ -3627,13 +3687,8 @@ def recurrent_group(step,
         in_links=map(lambda x: x.name, in_links),
         seq_reversed=reverse)
     in_args = []
-    has_LayerOutput = False
     for each_input in input:
-        assert is_single_input(each_input)
-        if isinstance(each_input, LayerOutput):
-            in_args.append(each_input)
-            has_LayerOutput = True
-        else:  # StaticInput
+        if isinstance(each_input, StaticInput):  # StaticInput
             mem_name = "__%s_memory__" % each_input.input.name
             mem = memory(
                 name=None,
@@ -3641,24 +3696,26 @@ def recurrent_group(step,
                 boot_layer=each_input.input)
             mem.set_input(mem)
             in_args.append(mem)
-
-    assert (is_generating != has_LayerOutput)
+        else:
+            in_args.append(each_input)
 
     layer_outs = step(*in_args)
 
     if isinstance(layer_outs, LayerOutput):
         layer_outs = [layer_outs]
 
-    for ot in layer_outs:
-        assert isinstance(ot, LayerOutput)
-        ot.reverse = reverse
-        RecurrentLayerGroupSetOutLink(ot.name)
+    for layer_out in layer_outs:
+        assert isinstance(
+            layer_out, LayerOutput
+        ), "Type of step function's return value must be LayerOutput."
+        layer_out.reverse = reverse
+        RecurrentLayerGroupSetOutLink(layer_out.name)
 
     RecurrentLayerGroupEnd(name=name)
 
     for layer_out in layer_outs:
-        # Thee previous full_name is the name is the rnn group
-        # We need a full_name outside the rnn group
+        # The previous full_name is the name inside the recurrent group.
+        # We need a full_name outside the recurrent group.
         layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
 
     if len(layer_outs) == 1:
@@ -3681,7 +3738,20 @@ class BaseGeneratedInput(object):
 
 class GeneratedInput(BaseGeneratedInput):
     def after_real_step(self, input):
-        return maxid_layer(input=input, name='__beam_search_predict__')
+        if isinstance(input, LayerOutput):
+            input = [input]
+        elif isinstance(input, collections.Sequence):
+            input = list(input)
+            if len(input) > 1:
+                logger.info(
+                    ("More than one layers inside the recurrent_group "
+                     "are returned as outputs of the entire recurrent_group "
+                     "PLEASE garantee the first output is probability of "
+                     "the predicted next word."))
+
+        return [maxid_layer(
+            input=input[0], name='__beam_search_predict__')] + (
+                input[1:] if len(input) > 1 else [])
 
     def before_real_step(self):
         predict_id = memory(
@@ -3868,6 +3938,7 @@ def beam_search(step,
     :type step: callable
     :param input: Input data for the recurrent unit, which should include the
                   previously generated words as a GeneratedInput object.
+                  In beam_search, none of the input's type should be LayerOutput.
     :type input: list
     :param bos_id: Index of the start symbol in the dictionary. The start symbol
                    is a special token for NLP task, which indicates the
@@ -3909,15 +3980,18 @@ def beam_search(step,
 
     real_input = []
     for i, each_input in enumerate(input):
-        assert isinstance(each_input, StaticInput) or isinstance(
-            each_input, BaseGeneratedInput)
+        assert not isinstance(each_input, LayerOutput), (
+            "in beam_search, "
+            "none of the input should has a type of LayerOutput.")
         if isinstance(each_input, BaseGeneratedInput):
-            assert generated_input_index == -1
+            assert generated_input_index == -1, ("recurrent_group accepts "
+                                                 "only one GeneratedInput.")
             generated_input_index = i
+
         else:
             real_input.append(each_input)
 
-    assert generated_input_index != -1
+    assert generated_input_index != -1, "No GeneratedInput is given."
 
     gipt = input[generated_input_index]
 
@@ -3938,17 +4012,11 @@ def beam_search(step,
 
         predict = gipt.after_real_step(step(*args))
 
-        eos_layer(input=predict, eos_id=eos_id, name=eos_name)
+        eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
         return predict
 
-    tmp = recurrent_group(
-        step=__real_step__,
-        input=real_input,
-        reverse=False,
-        name=name,
-        is_generating=True)
-
-    return tmp
+    return recurrent_group(
+        step=__real_step__, input=real_input, reverse=False, name=name)
 
 
 def __cost_input__(input, label, weight=None):
@@ -4483,7 +4551,7 @@ def tensor_layer(a,
 @wrap_param_attr_default()
 @wrap_bias_attr_default()
 @wrap_act_default()
-@layer_support()
+@layer_support(DROPOUT, ERROR_CLIPPING)
 def selective_fc_layer(input,
                        size,
                        select=None,
@@ -5862,7 +5930,7 @@ def prelu_layer(input,
     :rtype: LayerOutput
     """
 
-    assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input'
+    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
     assert isinstance(param_attr, ParameterAttribute)
 
     l = Layer(
@@ -5876,3 +5944,178 @@ def prelu_layer(input,
         layer_type=LayerType.PRELU,
         parents=input,
         size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support(ERROR_CLIPPING, DROPOUT)
+@wrap_act_default(act=LinearActivation())
+def gated_unit_layer(input,
+                     size,
+                     act=None,
+                     name=None,
+                     gate_attr=None,
+                     gate_param_attr=None,
+                     gate_bias_attr=True,
+                     inproj_attr=None,
+                     inproj_param_attr=None,
+                     inproj_bias_attr=True,
+                     layer_attr=None):
+    """
+    The gated unit layer implements a simple gating mechanism over the input.
+    The input :math:`X` is first projected into a new space :math:`X'`, and
+    it is also used to produce a gate weight :math:`\sigma`. Element-wise
+    prodict between :match:`X'` and :math:`\sigma` is finally returned.
+
+    Reference:
+        Language Modeling with Gated Convolutional Networks
+        https://arxiv.org/abs/1612.08083
+
+    .. math::
+       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
+
+    The example usage is:
+
+    .. code-block:: python
+        gated_unit = gated_unit_layer(size=128, input=input_layer))
+
+    :param input: input for this layer.
+    :type input: LayerOutput
+    :param size: output size of the gated unit.
+    :type size: int
+    :param act: activation type of the projected input.
+    :type act: BaseActivation
+    :param name: name of this layer.
+    :type name: basestring
+    :param gate_attr: Attributes to tune the gate output, for example, error
+        clipping threshold, dropout and so on. See ExtraLayerAttribute for
+        more details.
+    :type gate_attr: ExtraLayerAttribute|None
+    :param gate_param_attr: Attributes to tune the learnable projected matrix
+        parameter of the gate.
+    :type gate_param_attr: ParameterAttribute|None
+    :param gate_bias_attr: Attributes to tune the learnable bias of the gate.
+    :type gate_bias_attr: ParameterAttribute|None
+    :param inproj_attr: Attributes to the tune the projected input, for
+        example, error clipping threshold, dropout and so on. See
+        ExtraLayerAttribute for more details.
+    :type inproj_attr: ExtraLayerAttribute|None
+    :param inproj_param_attr: Attributes to tune the learnable parameter of
+        the projection of input.
+    :type inproj_param_attr: ParameterAttribute|None
+    :param inproj_bias_attr: Attributes to tune the learnable bias of
+        projection of the input.
+    :type inproj_bias_attr: ParameterAttribute|None
+    :param layer_attr: Attributes to tune the final output of the gated unit,
+        for example, error clipping threshold, dropout and so on. See
+        ExtraLayerAttribute for more details.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(
+        input, LayerOutput), 'The gated linear unit accepts only one input.'
+
+    input_proj = fc_layer(
+        input=input,
+        name="%s_input_proj" % name,
+        size=size,
+        act=act,
+        layer_attr=inproj_attr,
+        param_attr=inproj_param_attr,
+        bias_attr=inproj_bias_attr)
+
+    gate = fc_layer(
+        size=size,
+        name="%s_gate" % name,
+        act=SigmoidActivation(),
+        input=input,
+        layer_attr=gate_attr,
+        param_attr=gate_param_attr,
+        bias_attr=gate_bias_attr)
+    return mixed_layer(
+        name="%s_gated_act" % name,
+        input=dotmul_operator(input_proj, gate),
+        layer_attr=layer_attr)
+
+
+@wrap_name_default()
+@layer_support()
+def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
+    """
+    The crop layer crops images by offset and shape. User can set crop shape by
+    args 'shape' explicitly or by reference input layer.
+
+    The example usage is:
+
+    .. code-block:: python
+    crop = crop_layer(input=[image_input, reference_input], axis=2, offset=[2, 3])
+
+    :param input: The input layer.If two inputs were setted,
+                    the second input will be regarded as reference input
+    :type input: LayerOutput or Sequence
+    :param offset: The crop offset
+    :type offset: Sequence
+    :param axis: start axis to be cropped. To image input layer:
+        - 0: batch size
+        - 1: channels
+        - 2: height
+        - 3: width
+    :type partial_sum: int
+    :param shape: The shape to be cropped. Default is None.
+    :type shape: Sequence | None
+    :param name: Name of this layer.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if isinstance(input, LayerOutput):
+        input = [input]
+    else:
+        assert isinstance(input, collections.Sequence)
+    l = Layer(
+        inputs=[x.name for x in input],
+        axis=axis,
+        offset=offset,
+        shape=shape,
+        name=name,
+        type=LayerType.CROP_LAYER,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.CROP_LAYER,
+        parents=input,
+        size=l.config.size)
+
+
+@wrap_name_default("clip")
+def clip_layer(input, min, max, name=None):
+    """
+    A layer for clipping the input value by the threshold.
+
+    .. math::
+
+        out[i] = \min\left(\max\left(in[i],p_{1}\right),p_{2}\right)
+
+    .. code-block:: python
+
+        clip = clip_layer(input=input_layer, min=-10, max=10)
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput.
+    :param min: The lower threshold for clipping.
+    :type min: double
+    :param max: The upper threshold for clipping.
+    :type max: double
+    :return: LayerOutput
+    """
+    Layer(
+        name=name,
+        type=LayerType.CLIP_LAYER,
+        inputs=[input.name],
+        min=min,
+        max=max)
+    return LayerOutput(
+        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index b77932ce5f09470329a97cc0a6273942a9155c6a..34be203ee254584027c79cf93fe54f404b7235db 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -340,24 +340,40 @@ def img_conv_group(input,
                    conv_with_batchnorm=False,
                    conv_batchnorm_drop_rate=0,
                    pool_stride=1,
-                   pool_type=None):
+                   pool_type=None,
+                   param_attr=None):
     """
     Image Convolution Group, Used for vgg net.
 
-    TODO(yuyang18): Complete docs
-
-    :param conv_batchnorm_drop_rate:
-    :param input:
-    :param conv_num_filter:
-    :param pool_size:
-    :param num_channels:
-    :param conv_padding:
-    :param conv_filter_size:
-    :param conv_act:
-    :param conv_with_batchnorm:
-    :param pool_stride:
-    :param pool_type:
-    :return:
+    :param conv_batchnorm_drop_rate: if conv_with_batchnorm[i] is true,
+        conv_batchnorm_drop_rate[i] represents the drop rate of each batch norm.
+    :type conv_batchnorm_drop_rate: list
+    :param input: layer's input.
+    :type input: LayerOutput
+    :param conv_num_filter: output channels num.
+    :type conv_num_filter: int
+    :param pool_size: pooling filter size.
+    :type pool_size: int
+    :param num_channels: input channels num.
+    :type num_channels: int
+    :param conv_padding: convolution padding size.
+    :type conv_padding: int
+    :param conv_filter_size: convolution filter size.
+    :type conv_filter_size: int
+    :param conv_act: activation funciton after convolution.
+    :type conv_act: BaseActivation
+    :param conv_with_batchnorm: conv_with_batchnorm[i] represents
+        if there is a batch normalization after each convolution.
+    :type conv_with_batchnorm: list
+    :param pool_stride: pooling stride size.
+    :type pool_stride: int
+    :param pool_type: pooling type.
+    :type pool_type: BasePoolingType
+    :param param_attr: Convolution param attribute.
+        None means default attribute.
+    :type param_attr: ParameterAttribute
+    :return: Layer's output
+    :type: LayerOutput
     """
     tmp = input
 
@@ -397,6 +413,7 @@ def img_conv_group(input,
             padding=conv_padding[i],
             filter_size=conv_filter_size[i],
             num_filters=conv_num_filter[i],
+            param_attr=param_attr,
             **extra_kwargs)
 
         # logger.debug("tmp.num_filters = %d" % tmp.num_filters)
@@ -614,18 +631,17 @@ def simple_lstm(input,
 
 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
-                   memory_boot=None,
+                   out_memory=None,
                    name=None,
                    size=None,
                    param_attr=None,
                    act=None,
                    gate_act=None,
                    state_act=None,
-                   mixed_bias_attr=None,
+                   input_proj_bias_attr=None,
+                   input_proj_layer_attr=None,
                    lstm_bias_attr=None,
-                   mixed_layer_attr=None,
-                   lstm_layer_attr=None,
-                   get_output_layer_attr=None):
+                   lstm_layer_attr=None):
     """
     Define calculations that a LSTM unit performs during a single time step.
     This function itself is not a recurrent layer, so it can not be
@@ -662,8 +678,8 @@ def lstmemory_unit(input,
 
     :param input: input layer name.
     :type input: LayerOutput
-    :param memory_boot: the initialization state of the LSTM cell.
-    :type memory_boot: LayerOutput | None
+    :param out_memory: output of previous time step
+    :type out_memory: LayerOutput | None
     :param name: lstmemory unit name.
     :type name: basestring
     :param size: lstmemory unit size.
@@ -676,33 +692,35 @@ def lstmemory_unit(input,
     :type gate_act: BaseActivation
     :param state_act: lstm state activiation type.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
-                            False means no bias, None means default bias.
-    :type mixed_bias_attr: ParameterAttribute|False
+    :param input_proj_bias_attr: bias attribute for input-to-hidden projection.
+                False means no bias, None means default bias.
+    :type input_proj_bias_attr: ParameterAttribute|False|None
+    :param input_proj_layer_attr: extra layer attribute for input to hidden
+                projection of the LSTM unit, such as dropout, error clipping.
+    :type input_proj_layer_attr: ExtraLayerAttribute
     :param lstm_bias_attr: bias parameter attribute of lstm layer.
-                           False means no bias, None means default bias.
+                False means no bias, None means default bias.
     :type lstm_bias_attr: ParameterAttribute|False
-    :param mixed_layer_attr: mixed layer's extra attribute.
-    :type mixed_layer_attr: ExtraLayerAttribute
     :param lstm_layer_attr: lstm layer's extra attribute.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :param get_output_layer_attr: get output layer's extra attribute.
-    :type get_output_layer_attr: ExtraLayerAttribute
     :return: lstmemory unit name.
     :rtype: LayerOutput
     """
     if size is None:
         assert input.size % 4 == 0
         size = input.size / 4
-    out_mem = memory(name=name, size=size)
-    state_mem = memory(
-        name="%s_state" % name, size=size, boot_layer=memory_boot)
+    if out_memory is None:
+        out_mem = memory(name=name, size=size)
+    else:
+        out_mem = out_memory
+
+    state_mem = memory(name="%s_state" % name, size=size)
 
     with mixed_layer(
             name="%s_input_recurrent" % name,
             size=size * 4,
-            bias_attr=mixed_bias_attr,
-            layer_attr=mixed_layer_attr,
+            bias_attr=input_proj_bias_attr,
+            layer_attr=input_proj_layer_attr,
             act=IdentityActivation()) as m:
         m += identity_projection(input=input)
         m += full_matrix_projection(input=out_mem, param_attr=param_attr)
@@ -717,11 +735,7 @@ def lstmemory_unit(input,
         gate_act=gate_act,
         state_act=state_act,
         layer_attr=lstm_layer_attr)
-    get_output_layer(
-        name='%s_state' % name,
-        input=lstm_out,
-        arg_name='state',
-        layer_attr=get_output_layer_attr)
+    get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')
 
     return lstm_out
 
@@ -730,17 +744,16 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                     size=None,
                     name=None,
-                    memory_boot=None,
+                    out_memory=None,
                     reverse=False,
                     param_attr=None,
                     act=None,
                     gate_act=None,
                     state_act=None,
-                    mixed_bias_attr=None,
+                    input_proj_bias_attr=None,
+                    input_proj_layer_attr=None,
                     lstm_bias_attr=None,
-                    mixed_layer_attr=None,
-                    lstm_layer_attr=None,
-                    get_output_layer_attr=None):
+                    lstm_layer_attr=None):
     """
     lstm_group is a recurrent_group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
@@ -774,8 +787,8 @@ def lstmemory_group(input,
     :type size: int
     :param name: name of the lstmemory group.
     :type name: basestring
-    :param memory_boot: the initialization state of LSTM cell.
-    :type memory_boot: LayerOutput | None
+    :param out_memory: output of previous time step
+    :type out_memory: LayerOutput | None
     :param reverse: is lstm reversed
     :type reverse: bool
     :param param_attr: Parameter config, None if use default.
@@ -786,18 +799,17 @@ def lstmemory_group(input,
     :type gate_act: BaseActivation
     :param state_act: lstm state activiation type.
     :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
-                            False means no bias, None means default bias.
-    :type mixed_bias_attr: ParameterAttribute|False
     :param lstm_bias_attr: bias parameter attribute of lstm layer.
                            False means no bias, None means default bias.
     :type lstm_bias_attr: ParameterAttribute|False
-    :param mixed_layer_attr: mixed layer's extra attribute.
-    :type mixed_layer_attr: ExtraLayerAttribute
+    :param input_proj_bias_attr: bias attribute for input-to-hidden projection.
+                False means no bias, None means default bias.
+    :type input_proj_bias_attr: ParameterAttribute|False|None
+    :param input_proj_layer_attr: extra layer attribute for input to hidden
+                projection of the LSTM unit, such as dropout, error clipping.
+    :type input_proj_layer_attr: ExtraLayerAttribute
     :param lstm_layer_attr: lstm layer's extra attribute.
     :type lstm_layer_attr: ExtraLayerAttribute
-    :param get_output_layer_attr: get output layer's extra attribute.
-    :type get_output_layer_attr: ExtraLayerAttribute
     :return: the lstmemory group.
     :rtype: LayerOutput
     """
@@ -805,18 +817,17 @@ def lstmemory_group(input,
     def __lstm_step__(ipt):
         return lstmemory_unit(
             input=ipt,
-            memory_boot=memory_boot,
             name=name,
             size=size,
-            mixed_bias_attr=mixed_bias_attr,
-            mixed_layer_attr=mixed_layer_attr,
-            param_attr=param_attr,
-            lstm_bias_attr=lstm_bias_attr,
             act=act,
             gate_act=gate_act,
             state_act=state_act,
+            out_memory=out_memory,
+            input_proj_bias_attr=input_proj_bias_attr,
+            input_proj_layer_attr=input_proj_layer_attr,
+            param_attr=param_attr,
             lstm_layer_attr=lstm_layer_attr,
-            get_output_layer_attr=get_output_layer_attr)
+            lstm_bias_attr=lstm_bias_attr)
 
     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -1395,7 +1406,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1408,6 +1419,8 @@ def outputs(layers, *args):
     :return:
     """
 
+    traveled = set()
+
     def __dfs_travel__(layer,
                        predicate=lambda x: x.layer_type == LayerType.DATA):
         """
@@ -1419,6 +1432,11 @@ def outputs(layers, *args):
         :type layer: LayerOutput
         :return:
         """
+        if layer in traveled:
+            return []
+        else:
+            traveled.add(layer)
+
         assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
         retv = []
         if layer.parents is not None:
@@ -1438,7 +1456,7 @@ def outputs(layers, *args):
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.
 
     if len(layers) != 1:
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index a939c41ad01922e421f7bcd93851df7447a6799f..0ffa58bc1e2088f75e7cd25c7ecdffbe270825a4 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -6,6 +6,7 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
+test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
+test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
index 7f2aa5a0fea1f4628e4effca5ce9af896f6e6c2c..75cf2312032e187dafc66199e933d3ad0fa33050 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
@@ -104,7 +104,7 @@ layers {
   }
   bias_parameter_name: "lstm_bias"
   active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
   name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
@@ -183,7 +183,7 @@ layers {
   }
   bias_parameter_name: "lstm_bias"
   active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
   name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..4b9578a0c050ef74f186485fec3f6c1f7a0f0814
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_clip_layer.protostr
@@ -0,0 +1,31 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__clip_0__"
+  type: "clip"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    clip_conf {
+      min: -10
+      max: 10
+    }
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__clip_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__clip_0__"
+  input_layer_names: "input"
+  output_layer_names: "__clip_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index b7d74f85ab4ca3f434dfa45516dfee7227b6ceee..96fb1d4ebde08b1bca2ffd09e8db0895842cbfd3 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -131,6 +131,7 @@ input_layer_names: "weight"
 input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
 output_layer_names: "__mse_cost_0__"
+output_layer_names: "__nce_layer_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -154,6 +155,7 @@ sub_models {
   input_layer_names: "multi_class_label"
   output_layer_names: "__cost_0__"
   output_layer_names: "__mse_cost_0__"
+  output_layer_names: "__nce_layer_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..f1e4d894a5fb0040f48bdb5a751c3f0d956c23bb
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
@@ -0,0 +1,106 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 256
+  active_type: ""
+}
+layers {
+  name: "__gated_unit_layer_0___input_proj"
+  type: "fc"
+  size: 512
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___gated_unit_layer_0___input_proj.w0"
+  }
+  bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias"
+  error_clipping_threshold: 100.0
+}
+layers {
+  name: "__gated_unit_layer_0___gate"
+  type: "fc"
+  size: 512
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___gated_unit_layer_0___gate.w0"
+  }
+  bias_parameter_name: "___gated_unit_layer_0___gate.wbias"
+  error_clipping_threshold: 100.0
+}
+layers {
+  name: "__gated_unit_layer_0___gated_act"
+  type: "mixed"
+  size: 512
+  active_type: ""
+  inputs {
+    input_layer_name: "__gated_unit_layer_0___input_proj"
+  }
+  inputs {
+    input_layer_name: "__gated_unit_layer_0___gate"
+  }
+  error_clipping_threshold: 100.0
+  operator_confs {
+    type: "dot_mul"
+    input_indices: 0
+    input_indices: 1
+    input_sizes: 512
+    input_sizes: 512
+    output_size: 512
+    dotmul_scale: 1
+  }
+}
+parameters {
+  name: "___gated_unit_layer_0___input_proj.w0"
+  size: 131072
+  initial_mean: 0.0
+  initial_std: 0.0001
+  dims: 256
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___input_proj.wbias"
+  size: 512
+  initial_mean: 0.0
+  initial_std: 1
+  dims: 1
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___gate.w0"
+  size: 131072
+  initial_mean: 0.0
+  initial_std: 0.0001
+  dims: 256
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___gate.wbias"
+  size: 512
+  initial_mean: 0.0
+  initial_std: 1
+  dims: 1
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "input"
+output_layer_names: "__gated_unit_layer_0___gated_act"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__gated_unit_layer_0___input_proj"
+  layer_names: "__gated_unit_layer_0___gate"
+  layer_names: "__gated_unit_layer_0___gated_act"
+  input_layer_names: "input"
+  output_layer_names: "__gated_unit_layer_0___gated_act"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..8133aa9c8d3e7c6843d1b27b70e87d394a1e0e47
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
@@ -0,0 +1,497 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__addto_0__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  inputs {
+    input_layer_name: "data"
+  }
+}
+layers {
+  name: "__addto_1__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+}
+layers {
+  name: "__addto_2__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+}
+layers {
+  name: "__addto_3__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+}
+layers {
+  name: "__addto_4__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+}
+layers {
+  name: "__addto_5__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+}
+layers {
+  name: "__addto_6__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+}
+layers {
+  name: "__addto_7__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+}
+layers {
+  name: "__addto_8__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+}
+layers {
+  name: "__addto_9__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+}
+layers {
+  name: "__addto_10__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+}
+layers {
+  name: "__addto_11__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+}
+layers {
+  name: "__addto_12__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+}
+layers {
+  name: "__addto_13__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+}
+layers {
+  name: "__addto_14__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+}
+layers {
+  name: "__addto_15__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+}
+layers {
+  name: "__addto_16__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+}
+layers {
+  name: "__addto_17__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+}
+layers {
+  name: "__addto_18__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+}
+layers {
+  name: "__addto_19__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+}
+layers {
+  name: "__addto_20__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+}
+layers {
+  name: "__addto_21__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+}
+layers {
+  name: "__addto_22__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+}
+layers {
+  name: "__addto_23__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+}
+layers {
+  name: "__addto_24__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+}
+layers {
+  name: "__addto_25__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+}
+layers {
+  name: "__addto_26__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+}
+layers {
+  name: "__addto_27__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+}
+layers {
+  name: "__addto_28__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+}
+layers {
+  name: "__addto_29__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+}
+layers {
+  name: "__addto_30__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+}
+layers {
+  name: "__addto_31__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 32
+  active_type: "relu"
+  inputs {
+    input_layer_name: "__addto_31__"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__fc_layer_1__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___fc_layer_1__.w0"
+  }
+  bias_parameter_name: "___fc_layer_1__.wbias"
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 3200
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 32
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 32
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___fc_layer_1__.w0"
+  size: 320
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_1__.wbias"
+  size: 10
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 10
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__fc_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__addto_0__"
+  layer_names: "__addto_1__"
+  layer_names: "__addto_2__"
+  layer_names: "__addto_3__"
+  layer_names: "__addto_4__"
+  layer_names: "__addto_5__"
+  layer_names: "__addto_6__"
+  layer_names: "__addto_7__"
+  layer_names: "__addto_8__"
+  layer_names: "__addto_9__"
+  layer_names: "__addto_10__"
+  layer_names: "__addto_11__"
+  layer_names: "__addto_12__"
+  layer_names: "__addto_13__"
+  layer_names: "__addto_14__"
+  layer_names: "__addto_15__"
+  layer_names: "__addto_16__"
+  layer_names: "__addto_17__"
+  layer_names: "__addto_18__"
+  layer_names: "__addto_19__"
+  layer_names: "__addto_20__"
+  layer_names: "__addto_21__"
+  layer_names: "__addto_22__"
+  layer_names: "__addto_23__"
+  layer_names: "__addto_24__"
+  layer_names: "__addto_25__"
+  layer_names: "__addto_26__"
+  layer_names: "__addto_27__"
+  layer_names: "__addto_28__"
+  layer_names: "__addto_29__"
+  layer_names: "__addto_30__"
+  layer_names: "__addto_31__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__fc_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__fc_layer_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
index af1b63c5dfbf0984a20eda02d608f76a454613c6..711785be37dbe7f2decc161d1b8e1ead62927b20 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -258,7 +258,7 @@ layers {
   }
   bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
   active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
   name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..c2786ff55c7023d856d739face5e747cc5fee870
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_l2_norm_layer.protostr
@@ -0,0 +1,27 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__row_l2_norm_layer_0__"
+  type: "row_l2_norm"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+}
+input_layer_names: "input"
+output_layer_names: "__row_l2_norm_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__row_l2_norm_layer_0__"
+  input_layer_names: "input"
+  output_layer_names: "__row_l2_norm_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
index 05810597b3154c3b287441465db16ee6e24b0ca2..565e281a6e1deff18aa48f97eb2f0e39ca79752f 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
@@ -20,12 +20,13 @@ lstm1 = lstmemory_group(
     input=m1,
     param_attr=lstm_param,
     lstm_bias_attr=lstm_bias,
-    mixed_bias_attr=False)
+    input_proj_bias_attr=False)
+
 lstm2 = lstmemory_group(
     input=m2,
     param_attr=lstm_param,
     lstm_bias_attr=lstm_bias,
-    mixed_bias_attr=False)
+    input_proj_bias_attr=False)
 
 softmax_param = ParamAttr(name='softmax_param')
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f066fe1fb30877bf40bb6299d35546f7427989a5
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_clip_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+clip = clip_layer(input=data, min=-10, max=10)
+
+outputs(clip)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
new file mode 100644
index 0000000000000000000000000000000000000000..8314a7e9a5586647c70ff010156817110919c72b
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
@@ -0,0 +1,21 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2016, height=48, width=42)
+refernce_data = data_layer(name='data', size=768, height=16, width=16)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=1,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
+
+crop = crop_layer(input=[pool, refernce_data], axis=2)
+
+outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9dab45519c65b0ca686558ec7fe2064bb9ad8824
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
@@ -0,0 +1,16 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=256)
+glu = gated_unit_layer(
+    size=512,
+    input=data,
+    act=TanhActivation(),
+    gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
+    gate_param_attr=ParamAttr(initial_std=1e-4),
+    gate_bias_attr=ParamAttr(initial_std=1),
+    inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
+    inproj_param_attr=ParamAttr(initial_std=1e-4),
+    inproj_bias_attr=ParamAttr(initial_std=1),
+    layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0))
+
+outputs(glu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a693f8dff06dec6e71eeb488da9c807c35e4c9b
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
@@ -0,0 +1,16 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din = data_layer(name='data', size=100)
+
+enc = din
+for i in range(32):
+    enc = addto_layer([enc, enc])
+
+pred = fc_layer(
+    input=fc_layer(
+        input=enc, size=32, act=ReluActivation()),
+    size=10,
+    act=SoftmaxActivation())
+outputs(pred)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac8badb26a40e96e75225e6f61aa536cd28e9098
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_l2_norm_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+row_l2_norm = row_l2_norm_layer(input=data)
+
+outputs(row_l2_norm)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 3ba5c31871807027e452df5d889b3b403e1c6414..5bea980611904b37a4a5d4e2cbbee13503a61ff0 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -20,7 +20,6 @@ import trainer
 import event
 import data_type
 import topology
-import data_feeder
 import networks
 import evaluator
 from . import dataset
@@ -31,10 +30,11 @@ import op
 import pooling
 import inference
 import networks
-import py_paddle.swig_paddle as api
 import minibatch
 import plot
 import image
+import model
+import paddle.trainer.config_parser as cp
 
 __all__ = [
     'optimizer',
@@ -47,7 +47,6 @@ __all__ = [
     'data_type',
     'attr',
     'pooling',
-    'data_feeder',
     'dataset',
     'reader',
     'topology',
@@ -57,10 +56,14 @@ __all__ = [
     'evaluator',
     'image',
     'master',
+    'model',
 ]
 
+cp.begin_parse()
+
 
 def init(**kwargs):
+    import py_paddle.swig_paddle as api
     args = []
     args_dict = {}
     # NOTE: append arguments if they are in ENV
@@ -73,6 +76,11 @@ def init(**kwargs):
     for key in args_dict.keys():
         args.append('--%s=%s' % (key, str(args_dict[key])))
 
+    if 'use_gpu' in kwargs:
+        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
+    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
+                                         "supported in v2 APIs.")
+
     api.initPaddle(*args)
 
 
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
index 2698251b9e15046eb14f71c3f5b0546ecbb4a5dd..98dfb85a0ea57050bf8dd8d46fca9574801d8eb3 100644
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from py_paddle import DataProviderConverter
 import collections
 import paddle.trainer.PyDataProvider2 as pydp2
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
index 2e4beb6882789249db09705f3f4d6c5c19e492cd..90830515c1e8e6f5260cfca631e02a3a52cedbe5 100644
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -26,8 +26,9 @@ import sentiment
 import wmt14
 import mq2007
 import flowers
+import voc2012
 
 __all__ = [
     'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
-    'uci_housing', 'wmt14', 'mq2007', 'flowers'
+    'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc2012'
 ]
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 4a2eb59c340f5d0d3818170e56d730330e0bab29..111496618dfa997246d0a067b0cd4c7dad74f9dc 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -22,6 +22,8 @@ import importlib
 import paddle.v2.dataset
 import cPickle
 import glob
+import cPickle as pickle
+import random
 
 __all__ = [
     'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader',
@@ -164,55 +166,37 @@ def cluster_files_reader(files_pattern,
     return reader
 
 
-def convert(output_path,
-            reader,
-            num_shards,
-            name_prefix,
-            max_lines_to_shuffle=1000):
+def convert(output_path, reader, line_count, name_prefix):
     import recordio
-    import cPickle as pickle
-    import random
     """
     Convert data from reader to recordio format files.
 
     :param output_path: directory in which output files will be saved.
     :param reader: a data reader, from which the convert program will read data instances.
-    :param num_shards: the number of shards that the dataset will be partitioned into.
     :param name_prefix: the name prefix of generated files.
     :param max_lines_to_shuffle: the max lines numbers to shuffle before writing.
     """
 
-    assert num_shards >= 1
-    assert max_lines_to_shuffle >= 1
-
-    def open_writers():
-        w = []
-        for i in range(0, num_shards):
-            n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i,
-                                        num_shards - 1)
-            w.append(recordio.writer(n))
-
-        return w
-
-    def close_writers(w):
-        for i in range(0, num_shards):
-            w[i].close()
+    assert line_count >= 1
+    indx_f = 0
 
-    def write_data(w, lines):
+    def write_data(indx_f, lines):
         random.shuffle(lines)
-        for i, d in enumerate(lines):
-            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
-            w[i % num_shards].write(d)
+        filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f)
+        writer = recordio.writer(filename)
+        for l in lines:
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            writer.write(cPickle.dumps(l))
+        writer.close()
 
-    w = open_writers()
     lines = []
-
     for i, d in enumerate(reader()):
         lines.append(d)
-        if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle:
-            write_data(w, lines)
+        if i % line_count == 0 and i >= line_count:
+            write_data(indx_f, lines)
             lines = []
+            indx_f += 1
             continue
 
-    write_data(w, lines)
-    close_writers(w)
+    write_data(indx_f, lines)
diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py
index e2a21e6e3e04e79fdfc225ce1b4496b6b69d1e89..634388094c804827657dc83d5c205e680625b156 100644
--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -116,7 +116,7 @@ def reader_creator(data_file,
             data = batch['data']
             labels = batch['label']
             for sample, label in itertools.izip(data, batch['label']):
-                yield sample, int(label)
+                yield sample, int(label) - 1
 
     if use_xmap:
         return xmap_readers(mapper, reader, cpu_count(), buffered_size)
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
index fd71b341662ca6f540ce44a86348e782561a97d7..b705c9109b2b6769c9fafa9241db5d81c682f9e3 100644
--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"):
         for j in range(i + 1, len(querylist)):
             query_right = querylist[j]
             if query_left.relevance_score > query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                 docpairs.append([
                     np.array(query_left.feature_vector),
                     np.array(query_right.feature_vector)
                 ])
             elif query_left.relevance_score < query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                 docpairs.append([
                     np.array(query_right.feature_vector),
                     np.array(query_left.feature_vector)
                 ])
     for label, pair in zip(labels, docpairs):
-        yield label, pair[0], pair[1]
+        yield np.array(label), pair[0], pair[1]
 
 
 def gen_list(querylist):
@@ -242,9 +242,9 @@ def gen_list(querylist):
     if not isinstance(querylist, QueryList):
         querylist = QueryList(querylist)
     querylist._correct_ranking_()
-    relevance_score_list = [query.relevance_score for query in querylist]
+    relevance_score_list = [[query.relevance_score] for query in querylist]
     feature_vector_list = [query.feature_vector for query in querylist]
-    yield np.array(relevance_score_list).T, np.array(feature_vector_list)
+    yield np.array(relevance_score_list), np.array(feature_vector_list)
 
 
 def query_filter(querylists):
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31e72ebf5eac0508d12783f9ceaa6eef0fa6d353
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc_seg.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc_seg.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc_seg.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py
new file mode 100644
index 0000000000000000000000000000000000000000..617e212d67fbe37f9d9663e9c83c62045411fa77
--- /dev/null
+++ b/python/paddle/v2/dataset/voc2012.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Image dataset for segmentation.
+The 2012 dataset contains images from 2008-2011 for which additional
+segmentations have been prepared. As in previous years the assignment
+to training/test sets has been maintained. The total number of images
+with segmentation has been increased from 7,062 to 9,993.
+"""
+
+import tarfile
+import io
+import numpy as np
+from paddle.v2.dataset.common import download
+from paddle.v2.image import *
+from PIL import Image
+
+__all__ = ['train', 'test', 'val']
+
+VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\
+VOCtrainval_11-May-2012.tar'
+
+VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd'
+SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt'
+DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg'
+LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png'
+
+CACHE_DIR = 'voc2012'
+
+
+def reader_creator(filename, sub_name):
+
+    tarobject = tarfile.open(filename)
+    name2mem = {}
+    for ele in tarobject.getmembers():
+        name2mem[ele.name] = ele
+
+    def reader():
+        set_file = SET_FILE.format(sub_name)
+        sets = tarobject.extractfile(name2mem[set_file])
+        for line in sets:
+            line = line.strip()
+            data_file = DATA_FILE.format(line)
+            label_file = LABEL_FILE.format(line)
+            data = tarobject.extractfile(name2mem[data_file]).read()
+            label = tarobject.extractfile(name2mem[label_file]).read()
+            data = Image.open(io.BytesIO(data))
+            label = Image.open(io.BytesIO(label))
+            data = np.array(data)
+            label = np.array(label)
+            yield data, label
+
+    return reader
+
+
+def train():
+    """
+    Create a train dataset reader containing 2913 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval')
+
+
+def test():
+    """
+    Create a test dataset reader containing 1464 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train')
+
+
+def val():
+    """
+    Create a val dataset reader containing 1449 images in HWC order.
+    """
+    return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val')
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index e1dc4f4c30051202e8fd077087679c4fd6cbd7a0..2a631c365f27a6039021a56268a62017638c2739 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -32,9 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# this is the pretrained model, whose bleu = 26.92
+# BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
-MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 
 START = "<s>"
 END = "<e>"
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
index fd6050fa339d280ad54e40128ea6bae25132c873..7589cc9917f26375d595e200245d5ba099bc38d7 100644
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -9,8 +9,6 @@ There are:
 * BeginPass
 * EndPass
 """
-import py_paddle.swig_paddle as api
-
 __all__ = [
     'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
 ]
@@ -18,6 +16,7 @@ __all__ = [
 
 class WithMetric(object):
     def __init__(self, evaluator):
+        import py_paddle.swig_paddle as api
         if not isinstance(evaluator, api.Evaluator):
             raise TypeError("Evaluator should be api.Evaluator type")
         self.__evaluator__ = evaluator
diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..b034efffb69030cb09e09ea545e9bff6f1744671
--- /dev/null
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
@@ -0,0 +1,253 @@
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import cStringIO
+
+
+def get_all_op_protos():
+    """
+    Get all registered op proto from Paddle C++
+    :return: list of OpProto
+    """
+    protostrs = core.get_all_op_protos()
+    ret_values = []
+    for pbstr in protostrs:
+        op_proto = op_proto_pb2.OpProto.FromString(str(pbstr))
+        ret_values.append(op_proto)
+    return ret_values
+
+
+class OpDescCreationMethod(object):
+    """
+    A Functor object to convert user input(use key word args) to OpDesc based on
+    OpProto.
+    
+    :param op_proto: The OpProto object.
+    :type op_proto: op_proto_pb2.OpProto
+    """
+
+    def __init__(self, op_proto):
+        if not isinstance(op_proto, op_proto_pb2.OpProto):
+            raise TypeError("Argument should be OpProto")
+        self.__op_proto__ = op_proto
+
+    def __call__(self, *args, **kwargs):
+        """
+        Convert user input to OpDesc. Only key-word args are supported. 
+        :return: OpDesc based on user input
+        :rtype: op_desc_pb2.OpDesc
+        """
+        if len(args) != 0:
+            raise ValueError("Only keyword arguments is supported by Paddle")
+        op_desc = op_desc_pb2.OpDesc()
+
+        # Inputs
+        ipts, ipt_format, _ = OpDescCreationMethod.extract_input_or_output(
+            "input", kwargs, self.__op_proto__.inputs)
+        op_desc.inputs.extend(ipts)
+        if ipt_format is not None:
+            op_desc.attrs.extend([ipt_format])
+
+        # Outputs
+        outs, out_format, tmp_index = OpDescCreationMethod.extract_input_or_output(
+            "output", kwargs, self.__op_proto__.outputs)
+        op_desc.outputs.extend(outs)
+        if out_format is not None:
+            op_desc.attrs.extend([out_format])
+        if len(tmp_index) != 0:
+            tmp_index_attr = op_desc.attrs.add()
+            tmp_index_attr.type = attr_type_pb2.INTS
+            tmp_index_attr.name = "temporary_index"
+            tmp_index_attr.ints.extend(tmp_index)
+
+        # Types
+        op_desc.type = self.__op_proto__.type
+
+        # Attrs
+        for attr in self.__op_proto__.attrs:
+            if attr.generated:
+                continue
+            user_defined_attr = kwargs.get(attr.name, None)
+            if user_defined_attr is not None:
+                new_attr = op_desc.attrs.add()
+                new_attr.name = attr.name
+                new_attr.type = attr.type
+                if attr.type == attr_type_pb2.INT:
+                    new_attr.i = user_defined_attr
+                elif attr.type == attr_type_pb2.FLOAT:
+                    new_attr.f = user_defined_attr
+                elif attr.type == attr_type_pb2.STRING:
+                    new_attr.s = user_defined_attr
+                elif attr.type == attr_type_pb2.INTS:
+                    new_attr.ints.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.FLOATS:
+                    new_attr.floats.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.STRINGS:
+                    new_attr.strings.extend(user_defined_attr)
+                else:
+                    raise NotImplementedError("Not support attribute type " +
+                                              attr.type)
+
+        return op_desc
+
+    @staticmethod
+    def extract_input_or_output(in_out, kwargs, meta):
+        """
+        Extract input variable names or output variable names from key-word 
+        arguments, which base on VarProtos.
+        
+        :param in_out: "input" or "output"
+        :param kwargs: key-word arguments that user inputted.
+        :param meta: a list of VarProto
+        :return: The three object will be return. The variable names. The 
+        input_format or output_format attribute(None if the input or output is 
+        not multiple). The temporary variable index list.
+        """
+        multiple = OpDescCreationMethod.any_is_true((m.multiple for m in meta))
+        tmp_index = []
+        retv = []
+        if multiple:
+            var_format = op_desc_pb2.AttrDesc()
+            var_format.type = attr_type_pb2.INTS
+            var_format.name = "%s_format" % in_out
+            var_format.ints.append(0)
+
+            for var in meta:
+                var_name = var.name
+
+                if var.temporary:
+                    var_name = [core.var_names.temp()]
+                    tmp_index.append(len(retv))
+                else:
+                    var_name = kwargs.get(var_name, [])
+                if not isinstance(var_name, list):
+                    var_name = [var_name]
+                retv.extend(var_name)
+                var_format.ints.append(len(var_name) + var_format.ints[-1])
+            return retv, var_format, tmp_index
+        else:
+            for var in meta:
+                if var.temporary:
+                    retv.append(kwargs.get(var.name, core.var_names.temp()))
+                    tmp_index.append(len(retv))
+                else:
+                    retv.append(kwargs.get(var.name, core.var_names.empty()))
+            return retv, None, tmp_index
+
+    @staticmethod
+    def any_is_true(generator):
+        """
+        Reduce a bool array to one. If any of them is True, then return True.
+        """
+        for flag in generator:
+            if flag:
+                return True
+        return False
+
+
+def get_docstring_from_op_proto(op_proto):
+    """
+    Generate docstring from a OpProto
+    :param op_proto: a OpProto instance.
+    :type op_proto: op_proto_pb2.OpProto
+    :return: docstring
+    """
+    if not isinstance(op_proto, op_proto_pb2.OpProto):
+        raise TypeError("Input must be OpProto")
+    f = cStringIO.StringIO()
+    f.write(op_proto.comment)
+    f.write("\n")
+
+    def __append_param__(name, comment, type):
+        # Maybe replace the following line with template engine is better.
+        f.write(":param ")
+        f.write(name)
+        f.write(": ")
+        f.write(comment)
+        f.write("\n")
+        f.write(":type ")
+        f.write(name)
+        f.write(": ")
+        f.write(type)
+        f.write("\n")
+
+    for ipt in op_proto.inputs:
+        __append_param__(ipt.name, ipt.comment, "list | basestr"
+                         if ipt.multiple else "basestr")
+
+    temp_var_prefix = \
+        "This is a temporary variable. It does not have to set by user. "
+    for opt in op_proto.outputs:
+        __append_param__(opt.name, opt.comment if not opt.temporary else
+                         temp_var_prefix + opt.comment, "list | basestr"
+                         if opt.multiple else "basestr")
+
+    for attr in op_proto.attrs:
+        attr_type = None
+        if attr.type == attr_type_pb2.INT:
+            attr_type = "int"
+        elif attr.type == attr_type_pb2.FLOAT:
+            attr_type = "float"
+        elif attr.type == attr_type_pb2.STRING:
+            attr_type = "basestr"
+        elif attr.type == attr_type_pb2.INTS:
+            attr_type = "list of int"
+        elif attr.type == attr_type_pb2.FLOATS:
+            attr_type = "list of float"
+        elif attr.type == attr_type_pb2.STRINGS:
+            attr_type = "list of basestr"
+
+        if attr_type is None:
+            raise RuntimeError("Not supported attribute type " + attr.type)
+
+        __append_param__(attr.name, attr.comment, attr_type)
+
+    return f.getvalue()
+
+
+def create_op_creation_method(op_proto):
+    """
+    Generate op creation method for an OpProto
+    """
+    method = OpDescCreationMethod(op_proto)
+
+    def __impl__(*args, **kwargs):
+        opdesc = method(*args, **kwargs)
+        return core.Operator.create(opdesc.SerializeToString())
+
+    __impl__.__doc__ = get_docstring_from_op_proto(op_proto)
+    __impl__.all_input_args = [var.name for var in op_proto.inputs]
+    __impl__.all_output_args = [var.name for var in op_proto.outputs]
+    __impl__.all_attr_args = [attr.name for attr in op_proto.attrs]
+    __impl__.all_not_temp_output_args = [
+        var.name for var in op_proto.outputs if not var.temporary
+    ]
+
+    return __impl__
+
+
+class OpCreationsHolder(object):
+    """
+    A object will holds all op creation methods.
+    
+    Use `op_creations.xxx_op` to access them.
+    """
+    pass
+
+
+op_creations = OpCreationsHolder()
+
+
+def __bootstrap__():
+    """
+    Bootstrap function for this module. It will dynamic create all op creation
+    methods in runtime.
+    """
+    for op_proto in get_all_op_protos():
+        func = create_op_creation_method(op_proto)
+        func.__name__ = str(op_proto.type)
+        setattr(op_creations, func.__name__, func)
+
+
+__bootstrap__()
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..1b5580c8b30f69016f187b1d8710a57b5f7cfa9f
--- /dev/null
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -0,0 +1,84 @@
+"""
+Default scope function.
+
+`Paddle` manages Scope as programming language's scope.  It just a 
+thread-local stack of Scope. Top of that stack is current scope, the bottom 
+of that stack is all scopes' parent. 
+
+Invoking `new_var/find_var`  can `new/find` variable in current scope. 
+Invoking `enter_local_scope/leave_local_scope` can create or destroy local 
+scope. 
+
+A `scoped_function` will take a `function` as input. That function will be 
+invoked in a new local scope. 
+"""
+
+import paddle.v2.framework.core
+import threading
+
+__tl_scope__ = threading.local()
+
+__all__ = [
+    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'new_var',
+    'find_var', 'scoped_function'
+]
+
+
+def get_cur_scope():
+    """
+    Get current scope.
+    :rtype: paddle.v2.framework.core.Scope
+    """
+    cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
+    if cur_scope_stack is None:
+        __tl_scope__.cur_scope = list()
+    if len(__tl_scope__.cur_scope) == 0:
+        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope())
+    return __tl_scope__.cur_scope[-1]
+
+
+def enter_local_scope():
+    """
+    Enter a new local scope
+    """
+    cur_scope = get_cur_scope()
+    new_scope = cur_scope.new_scope()
+    __tl_scope__.cur_scope.append(new_scope)
+
+
+def leave_local_scope():
+    """
+    Leave local scope
+    """
+    __tl_scope__.cur_scope.pop()
+    get_cur_scope().drop_kids()
+
+
+def new_var(name):
+    """
+    create variable in current scope.
+    """
+    return get_cur_scope().new_var(name)
+
+
+def find_var(name):
+    """
+    get variable in current scope.
+    """
+    return get_cur_scope().find_var(name)
+
+
+def scoped_function(func):
+    """
+    invoke `func` in new scope.
+    
+    :param func: a callable function that will be run in new scope.
+    :type func: callable
+    """
+    enter_local_scope()
+    try:
+        func()
+    except:
+        raise
+    finally:
+        leave_local_scope()
diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfeb0e3dec0fd2c6ad4d2d2501f97932495fdd41
--- /dev/null
+++ b/python/paddle/v2/framework/network.py
@@ -0,0 +1,131 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.create_op_creation_methods import op_creations
+from default_scope_funcs import new_var, find_var, get_cur_scope
+
+__all__ = ['Network']  # Only expose Network
+
+
+class NetworkFunctor(object):
+    """
+    Network Op Creation Function. Used internally in this module.
+    It convert string input to Variable. If it is not created before, just 
+    create in scope.
+    
+    It is a functor object. means the instances are callable.
+    
+    :param func: The op creation function which generated in Python.
+    :param net: The Network instance.
+    """
+
+    def __init__(self, func, net):
+        self.func = func
+        self.net = net
+
+    def __call__(self, *args, **kwargs):
+        if len(args) != 0:
+            raise ValueError("Paddle must use keyword argument")
+        inputs = self.func.all_input_args
+        for ipt in inputs:
+            if ipt in kwargs:
+                var = kwargs[ipt]
+                if isinstance(var, basestring):
+                    tmp = new_var(var)
+                    self.net.var_names[tmp] = var
+                    var = tmp
+
+                if not isinstance(var, core.Variable):
+                    raise TypeError(
+                        "Input of op creation must be string or variable")
+
+                kwargs[ipt] = self.net.var_names[var]
+
+        notemp_outputs = self.func.all_not_temp_output_args
+
+        for name in notemp_outputs:
+            if name not in kwargs:
+                kwargs[
+                    name] = self.func.__name__ + "@OUT@%d" % core.unique_integer(
+                    )
+
+        outputs = self.func.all_output_args
+        for opt in outputs:
+            if opt in kwargs:
+                var = kwargs[opt]
+                if isinstance(var, basestring):
+                    tmp = new_var(var)
+                    self.net.var_names[tmp] = var
+                    var = tmp
+
+                if not isinstance(var, core.Variable):
+                    raise TypeError(
+                        "Output of op creation must be string or variable")
+                kwargs[opt] = self.net.var_names[var]
+
+        op = self.func(**kwargs)
+
+        self.net.net.add_op(op)
+
+        lst = [find_var(kwargs[opt]) for opt in notemp_outputs]
+        if len(lst) == 1:
+            return lst[0]
+        elif len(lst) == 0:
+            return None
+        else:
+            return lst
+
+
+class Network(object):
+    """
+    The network concept. It avoid user to manually create operator, create 
+    variable, and combine them into a Net. Just use Network.xxx can create the
+    operator, create variables in default scope, and add them into `self.net`.
+    
+    For example:
+    
+    ..  code-block: python
+    
+        net = Network()
+        out = net.add_two(X="a", Y="b")
+        fc_out = net.fc(X="out", W="fc.w")
+        
+        net.run(...)
+    """
+
+    def __init__(self):
+        self.net = core.Net.create()
+        funcs = (func_name for func_name in dir(op_creations)
+                 if not func_name.startswith("__"))
+        self.var_names = dict()
+
+        # TODO(yuyang18): This code can work, but do not generate a good
+        # docstring, try to give a better way generate function in runtime
+        # later.
+        for func_name in funcs:
+            func = getattr(op_creations, func_name)
+            impl = NetworkFunctor(func, self)
+            setattr(self, func_name, impl.__call__)
+        self.__complete_add_op__ = False
+
+    def infer_shape(self):
+        self.complete_add_op()
+        self.net.infer_shape(get_cur_scope())
+
+    def run(self, device_context):
+        self.complete_add_op()
+        self.net.run(get_cur_scope(), device_context)
+
+    def __str__(self):
+        return str(self.net)
+
+    def complete_add_op(self):
+        if not self.__complete_add_op__:
+            self.net.complete_add_op()
+            self.__complete_add_op__ = True
+
+
+if __name__ == '__main__':
+    net = Network()
+    out = net.add_two(X="a", Y="b")
+    fc_out = net.fc(X=out, W="fc.w", b="fc.b", activation="softmax")
+    net.complete_add_op()
+    print net
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 8cb0c5c3765a00b45177117925e320e61a1b609a..4619b0edc3dd7e253e01f7fee5e6a8641340d291 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -1 +1,16 @@
-add_python_test(test_framework test_protobuf.py)
+add_python_test(test_framework
+    test_protobuf.py
+    test_scope.py
+    test_default_scope_funcs.py
+    test_op_creation_methods.py
+    test_net.py
+    test_tensor.py
+    test_fc_op.py
+    test_add_two_op.py
+    test_sgd_op.py
+    test_mul_op.py
+    test_mean_op.py
+    test_sigmoid_op.py
+    test_softmax_op.py
+    test_rowwise_add_op.py
+    test_network.py)
diff --git a/python/paddle/v2/framework/tests/op_test_util.py b/python/paddle/v2/framework/tests/op_test_util.py
new file mode 100644
index 0000000000000000000000000000000000000000..98fae1b975ad6243b20e5c19ec6ff68d5536cd74
--- /dev/null
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -0,0 +1,70 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy
+import paddle.v2.framework.create_op_creation_methods as creation
+
+
+class OpTestMeta(type):
+    """
+    Operator Test ClassMeta.
+    
+    It injects `test_all` method into user's OperatorTest class, to make Python 
+    unittest module run that method.
+    
+    The `test_all` read what value is stored in `self`. It use self's values to
+    create and run a operator, and check whether that op is OK or not.
+    
+    See `test_add_two_op` for example usage.
+    """
+
+    def __new__(cls, name, bases, attrs):
+        obj = super(OpTestMeta, cls).__new__(cls, name, bases, attrs)
+
+        def test_all(self):
+            func = getattr(creation.op_creations, self.type, None)
+            self.assertIsNotNone(func)
+
+            scope = core.Scope()
+            kwargs = dict()
+            places = []
+            places.append(core.CPUPlace())
+            if core.is_compile_gpu():
+                places.append(core.GPUPlace(0))
+
+            for place in places:
+                for in_name in func.all_input_args:
+                    if hasattr(self, in_name):
+                        kwargs[in_name] = in_name
+                        var = scope.new_var(in_name).get_tensor()
+                        arr = getattr(self, in_name)
+                        var.set_dims(arr.shape)
+                        var.set(arr, place)
+                    else:
+                        kwargs[in_name] = "@EMPTY@"
+
+                for out_name in func.all_output_args:
+                    if hasattr(self, out_name):
+                        kwargs[out_name] = out_name
+                        scope.new_var(out_name).get_tensor()
+
+                for attr_name in func.all_attr_args:
+                    if hasattr(self, attr_name):
+                        kwargs[attr_name] = getattr(self, attr_name)
+
+                op = func(**kwargs)
+
+                op.infer_shape(scope)
+
+                ctx = core.DeviceContext.create(place)
+                op.run(scope, ctx)
+
+                for out_name in func.all_output_args:
+                    actual = numpy.array(scope.find_var(out_name).get_tensor())
+                    expect = getattr(self, out_name)
+                    # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
+                    # has some diff, and could not pass unittest. So I set decimal 3 here.
+                    # And I will check this in future.
+                    numpy.testing.assert_almost_equal(actual, expect, decimal=3)
+
+        obj.test_all = test_all
+        return obj
diff --git a/python/paddle/v2/framework/tests/test_add_two_op.py b/python/paddle/v2/framework/tests/test_add_two_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e6643201bf361fce1bad7de10b2562f0525e00a
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
@@ -0,0 +1,30 @@
+import unittest
+
+import numpy
+import paddle.v2.framework.core as core
+import paddle.v2.framework.create_op_creation_methods as creation
+
+from op_test_util import OpTestMeta
+
+
+class TestAddOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "add_two"
+        self.X = numpy.random.random((102, 105)).astype("float32")
+        self.Y = numpy.random.random((102, 105)).astype("float32")
+        self.Out = self.X + self.Y
+
+
+class TestAddGradOp(unittest.TestCase):
+    def test_add_grad(self):
+        op = creation.op_creations.add_two(X="X", Y="Y", Out="Out")
+        backward_op = core.Operator.backward(op, set())
+        self.assertEqual(backward_op.type(), "add_two_grad")
+        expected = '''Op(add_two_grad), inputs:(X, Y, Out, Out@GRAD), outputs:(X@GRAD, Y@GRAD).'''
+        self.assertEqual(expected, str(backward_op))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_cross_entropy_op.py b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..609c56535ef0365dda728cba334d8b4d96312192
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
@@ -0,0 +1,22 @@
+import unittest
+import numpy
+from op_test_util import OpTestMeta
+
+
+class TestSGD(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "onehot_cross_entropy"
+        batch_size = 100
+        class_num = 10
+        self.X = numpy.random.random((batch_size, class_num)).astype("float32")
+        self.label = 5 * numpy.ones(batch_size).astype("int32")
+        Y = []
+        for i in range(0, batch_size):
+            Y.append(-numpy.log(self.X[i][self.label[i]]))
+        self.Y = numpy.array(Y).astype("float32")
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..495863c4562b5a2d6755fb02e21a6b0c845fd7b6
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
@@ -0,0 +1,33 @@
+from paddle.v2.framework.default_scope_funcs import *
+import unittest
+
+
+class TestDefaultScopeFuncs(unittest.TestCase):
+    def test_cur_scope(self):
+        self.assertIsNotNone(get_cur_scope())
+
+    def test_none_variable(self):
+        self.assertIsNone(find_var("test"))
+
+    def test_create_var_get_var(self):
+        var_a = new_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
+        enter_local_scope()
+        self.assertIsNotNone(get_cur_scope().find_var('var_a'))
+        leave_local_scope()
+
+    def test_var_get_int(self):
+        def __new_scope__():
+            i = new_var("var_i")
+            self.assertFalse(i.is_int())
+            i.set_int(10)
+            self.assertTrue(i.is_int())
+            self.assertEqual(10, i.get_int())
+
+        for _ in xrange(10):
+            scoped_function(__new_scope__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..00dc4399aaf59e6382692c3a4356f89a7e79a0c5
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
@@ -0,0 +1,45 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy
+import paddle.v2.framework.create_op_creation_methods as creation
+
+
+class TestFc(unittest.TestCase):
+    def test_fc(self):
+        scope = core.Scope()
+        place = core.CPUPlace()
+        x = scope.new_var("X")
+
+        x_tensor = x.get_tensor()
+        x_tensor.set_dims([1000, 784])
+        x_tensor.alloc_float(place)
+
+        w = scope.new_var("W")
+        w_tensor = w.get_tensor()
+        w_tensor.set_dims([784, 100])
+        w_tensor.alloc_float(place)
+
+        w_tensor.set(numpy.random.random((784, 100)).astype("float32"), place)
+
+        # Set a real numpy array here.
+        # x_tensor.set(numpy.array([]))
+
+        op = creation.op_creations.fc(X="X", Y="Y", W="W")
+
+        for out in op.outputs():
+            if scope.find_var(out) is None:
+                scope.new_var(out).get_tensor()
+
+        tensor = scope.find_var("Y").get_tensor()
+        op.infer_shape(scope)
+        self.assertEqual([1000, 100], tensor.shape())
+
+        ctx = core.DeviceContext.create(place)
+
+        op.run(scope, ctx)
+
+        # After complete all ops, check Y is expect or not.
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_mean_op.py b/python/paddle/v2/framework/tests/test_mean_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..78fff1eeff998109a51ea662f963a102eff49d3a
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_mean_op.py
@@ -0,0 +1,16 @@
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestMeanOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "mean"
+        self.X = np.random.random((32, 784)).astype("float32")
+        self.Out = np.mean(self.X)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_mul_op.py b/python/paddle/v2/framework/tests/test_mul_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e1ac66d3a4d23d617f7c5a4d97d070b2660954c8
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
@@ -0,0 +1,17 @@
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestMulOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "mul"
+        self.X = np.random.random((32, 84)).astype("float32")
+        self.Y = np.random.random((84, 100)).astype("float32")
+        self.Out = np.dot(self.X, self.Y)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
new file mode 100644
index 0000000000000000000000000000000000000000..db776d6b643dc4014da9f5dded8219180af639e3
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -0,0 +1,30 @@
+import paddle.v2.framework.core as core
+from paddle.v2.framework.create_op_creation_methods import op_creations
+import unittest
+
+
+class TestNet(unittest.TestCase):
+    def test_net_all(self):
+        net = core.Net.create()
+        op1 = op_creations.add_two(X="X", Y="Y", Out="Out")
+        net.add_op(op1)
+
+        net2 = core.Net.create()
+        net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out"))
+        net2.complete_add_op(True)
+        net.add_op(net2)
+        net.complete_add_op(True)
+
+        expected = '''
+Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out).
+    Op(add_two), inputs:(X, Y), outputs:(Out).
+    Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out).
+        Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0).
+            Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0).
+            Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out).
+'''
+        self.assertEqual(expected, "\n" + str(net))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_network.py b/python/paddle/v2/framework/tests/test_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d53e233e959bd39b558ac97cdca381135505f8d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_network.py
@@ -0,0 +1,32 @@
+from paddle.v2.framework.network import Network
+import paddle.v2.framework.core as core
+import unittest
+
+
+class TestNet(unittest.TestCase):
+    def test_net_all(self):
+        net = Network()
+        out = net.add_two(X="X", Y="Y")
+        fc_out = net.fc(X=out, W="w")
+        net.complete_add_op()
+        self.assertTrue(isinstance(fc_out, core.Variable))
+        self.assertEqual(
+            '''Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, add_two@OUT@0, fc@OUT@1).
+    Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@0).
+    Op(fc), inputs:(add_two@OUT@0, w, @EMPTY@), outputs:(fc@OUT@1, @TEMP@fc@0).
+        Op(mul), inputs:(add_two@OUT@0, w), outputs:(@TEMP@fc@0).
+        Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc@OUT@1).
+''', str(net))
+
+        net2 = Network()
+        tmp = net2.add_two(X="X", Y="Y")
+        self.assertTrue(isinstance(tmp, core.Variable))
+        net2.complete_add_op()
+        self.assertEqual(
+            '''Op(plain_net), inputs:(X, Y), outputs:(add_two@OUT@2).
+    Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@2).
+''', str(net2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..41db7c0d535aa920b34d6cc346090a8c15bfb110
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
@@ -0,0 +1,254 @@
+import unittest
+import paddle.v2.framework.create_op_creation_methods as creation
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+
+
+class TestGetAllProtos(unittest.TestCase):
+    def test_all(self):
+        all_protos = creation.get_all_op_protos()
+        self.assertNotEqual(0, len(all_protos))
+
+        for each in all_protos:
+            self.assertTrue(each.IsInitialized())
+
+
+class TestOpDescCreationMethod(unittest.TestCase):
+    def test_plain_input_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = "not matter"
+
+        ipt = op.inputs.add()
+        ipt.name = "Y"
+        ipt.comment = "not matter"
+
+        opt = op.outputs.add()
+        opt.name = "Z"
+        opt.comment = "not matter"
+
+        op.comment = "not matter"
+
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+        output = method(X="a", Y="b", Z="c")
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(["a", "b"])
+        expected.outputs.append("c")
+        self.assertEqual(expected, output)
+
+    def test_multiple_input_plain_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "fc"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+
+        out = op.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = op_desc_pb2.OpDesc()
+        expected1.inputs.extend(['x', 'w', 'b'])
+        expected1.outputs.extend(['y'])
+        expected1.type = 'fc'
+        attr = expected1.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3])
+        self.assertEqual(expected1, generated1)
+
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = op_desc_pb2.OpDesc()
+        expected2.inputs.extend(['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'])
+        expected2.outputs.extend(['y'])
+        expected2.type = 'fc'
+        attr = expected2.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 3, 6, 7])
+        self.assertEqual(expected2, generated2)
+
+    def test_attrs(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+
+        def __add_attr__(name, type):
+            attr = op.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+
+        __add_attr__("int_attr", attr_type_pb2.INT)
+        __add_attr__("float_attr", attr_type_pb2.FLOAT)
+        __add_attr__("string_attr", attr_type_pb2.STRING)
+        __add_attr__("ints_attr", attr_type_pb2.INTS)
+        __add_attr__("floats_attr", attr_type_pb2.FLOATS)
+        __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+
+        generated = method(
+            X="a",
+            int_attr=10,
+            float_attr=3.2,
+            string_attr="test_str",
+            ints_attr=[0, 1, 2, 3, 4],
+            floats_attr=[0.2, 3.2, 4.5],
+            strings_attr=["a", "b", "c"])
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(['a'])
+        attr = expected.attrs.add()
+        attr.name = "int_attr"
+        attr.type = attr_type_pb2.INT
+        attr.i = 10
+
+        attr = expected.attrs.add()
+        attr.name = "float_attr"
+        attr.type = attr_type_pb2.FLOAT
+        attr.f = 3.2
+
+        attr = expected.attrs.add()
+        attr.name = "string_attr"
+        attr.type = attr_type_pb2.STRING
+        attr.s = "test_str"
+
+        attr = expected.attrs.add()
+        attr.name = "ints_attr"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3, 4])
+
+        attr = expected.attrs.add()
+        attr.name = 'floats_attr'
+        attr.type = attr_type_pb2.FLOATS
+        attr.floats.extend([0.2, 3.2, 4.5])
+
+        attr = expected.attrs.add()
+        attr.name = 'strings_attr'
+        attr.type = attr_type_pb2.STRINGS
+        attr.strings.extend(['a', 'b', 'c'])
+
+        self.assertEqual(expected, generated)
+
+    def test_input_temporary_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        out = op.outputs.add()
+        out.name = "OUT"
+        out.comment = ""
+
+        out = op.outputs.add()
+        out.name = "TMP"
+        out.comment = ""
+        out.temporary = True
+
+        out = op.outputs.add()
+        out.name = "OUT2"
+        out.comment = ""
+        op.comment = ""
+
+        method = creation.OpDescCreationMethod(op)
+        generated = method(OUT="a", OUT2="b")
+        desc = op_desc_pb2.OpDesc()
+        desc.outputs.extend(["a", core.var_names.temp(), "b"])
+        desc.type = "test"
+        attr = desc.attrs.add()
+        attr.name = "temporary_index"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.append(2)
+        self.assertEqual(generated, desc)
+
+
+class TestOpCreationDocStr(unittest.TestCase):
+    def test_all(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        op.comment = """Test Op.
+
+This op is used for unit test, not a real op.
+"""
+        a = op.inputs.add()
+        a.name = "a"
+        a.comment = "Input a for test op"
+        a.multiple = True
+
+        b = op.inputs.add()
+        b.name = "b"
+        b.comment = "Input b for test op"
+        self.assertTrue(op.IsInitialized())
+
+        o1 = op.outputs.add()
+        o1.name = "output"
+        o1.comment = "The output of test op"
+
+        o2 = op.outputs.add()
+        o2.name = "temp output"
+        o2.comment = "The temporary output of test op"
+        o2.temporary = True
+
+        test_str = op.attrs.add()
+        test_str.name = "str_attr"
+        test_str.type = attr_type_pb2.STRING
+        test_str.comment = "A string attribute for test op"
+
+        actual = creation.get_docstring_from_op_proto(op)
+        expected_docstring = '''Test Op.
+
+This op is used for unit test, not a real op.
+
+:param a: Input a for test op
+:type a: list | basestr
+:param b: Input b for test op
+:type b: basestr
+:param output: The output of test op
+:type output: basestr
+:param temp output: This is a temporary variable. It does not have to set by user. The temporary output of test op
+:type temp output: basestr
+:param str_attr: A string attribute for test op
+:type str_attr: basestr
+'''
+        self.assertEqual(expected_docstring, actual)
+
+
+class TestOpCreations(unittest.TestCase):
+    def test_all(self):
+        add_op = creation.op_creations.add_two(X="a", Y="b", Out="z")
+        self.assertIsNotNone(add_op)
+        # Invoke C++ DebugString()
+        self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).',
+                         str(add_op))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py
index f0e60191991e2f24b0a1972afb0f6cbd3aaa4008..b8702477e64203e735bff05b115eafbb2a52172d 100644
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
@@ -24,3 +24,7 @@ class TestFrameworkProto(unittest.TestCase):
         attr.type = attr_type_lib.FLOAT
         op_proto.type = "cos"
         self.assertTrue(op_proto.IsInitialized())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0457e3f16a709140180ce433c1d56d146f0b6974
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -0,0 +1,92 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+import paddle.v2.framework.create_op_creation_methods as creation
+
+ops = creation.op_creations
+
+
+def create_tensor(scope, name, shape):
+    tensor = scope.create_var(name).get_tensor()
+    tensor.set_dims(shape)
+    tensor.alloc_float()
+    tensor.set(np.random.random(shape))
+    return tensor
+
+
+class TestRNN(unittest.TestCase):
+    '''
+    Test RNNOp
+
+    equation:
+        h_t = \sigma (W x_t + U h_{t-1})
+    weights:
+        - W
+        - U
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    def init(self):
+        input_dim = 30
+        batch_size = 50
+        weight_dim = 15
+
+        self.scope = core.Scope(None)
+
+        # create vars
+        create_tensor(self.scope, "x", [batch_size, input_dim])
+        create_tensor(self.scope, "W", [input_dim, weight_dim])
+        create_tensor(self.scope, "U", [weight_dim, weight_dim])
+        create_tensor(self.scope, "h_boot", [batch_size, weight_dim])
+
+        x_alias = "x@alias"
+        y_alias = "y@alias"
+        memory = "h@alias"
+        prememory = "h@pre"
+        output = "rnn_out"
+        output_alias = "rnn_out@alias"
+
+        # create step net
+        stepnet_var = self.scope.create_var("stepnet")
+        stepnet = stepnet_var.get_net()
+        # stepnet = core.Net.create()
+        x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx")
+        h_fc_op = ops.fc(X=prememory, W="U", Y="Uh")
+        sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
+        sig_op = ops.sigmoid(X="sum", Y=memory)
+        stepnet.add_op(x_fc_op)
+        stepnet.add_op(h_fc_op)
+        stepnet.add_op(sum_op)
+        stepnet.add_op(sig_op)
+        stepnet.complete_add_op(True)
+
+        # create RNNOp
+        rnnop = ops.recurrent_op(
+            # inputs
+            inlinks=["x"],
+            boot_memories=["h_boot"],
+            step_net="stepnet",
+            # outputs
+            outlinks=[output],
+            step_scopes="step_scopes",
+            # attributes
+            inlink_alias=["x@alias"],
+            outlink_alias=[output_alias],
+            pre_memories=[prememory],
+            memories=[memory])
+
+        ctx = core.DeviceContext.cpu_context()
+        rnnop.infer_shape(self.scope)
+        rnnop.run(self.scope, ctx)
+
+    def test_recurrent(self):
+        self.init()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_rowwise_add_op.py b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..04abc14ee198fe4e2307e009c696a2b40ec271b6
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
@@ -0,0 +1,17 @@
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestRowwiseAddOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "rowwise_add"
+        self.X = np.random.random((32, 84)).astype("float32")
+        self.b = np.random.random(84).astype("float32")
+        self.Out = np.add(self.X, self.b)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ce9454067f91f39f01d9eb4c912857464a3c1cb
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -0,0 +1,37 @@
+import paddle.v2.framework.core
+import unittest
+
+
+class TestScope(unittest.TestCase):
+    def test_create_destroy(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope()
+        self.assertIsNotNone(scope)
+        scope_with_parent = scope.new_scope()
+        self.assertIsNotNone(scope_with_parent)
+
+    def test_none_variable(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope()
+        self.assertIsNone(scope.find_var("test"))
+
+    def test_create_var_get_var(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope()
+        var_a = scope.new_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(scope.find_var('var_a'))
+        scope2 = scope.new_scope()
+        self.assertIsNotNone(scope2.find_var('var_a'))
+
+    def test_var_get_int(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope()
+        var = scope.new_var("test_int")
+        var.set_int(10)
+        self.assertTrue(var.is_int())
+        self.assertEqual(10, var.get_int())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sgd_op.py b/python/paddle/v2/framework/tests/test_sgd_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..ca03cc11abe2ceb31b33a87797aa752943dd2a7d
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
@@ -0,0 +1,18 @@
+import unittest
+import numpy
+from op_test_util import OpTestMeta
+
+
+class TestSGD(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "sgd"
+        self.param = numpy.random.random((102, 105)).astype("float32")
+        self.grad = numpy.random.random((102, 105)).astype("float32")
+        self.learning_rate = 0.1
+        self.param_out = self.param - self.learning_rate * self.grad
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_sigmoid_op.py b/python/paddle/v2/framework/tests/test_sigmoid_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..50044a122f1d66dd54a24f6cce76074a60ee2262
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
@@ -0,0 +1,16 @@
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestSigmoidOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "sigmoid"
+        self.X = np.random.random((32, 100)).astype("float32")
+        self.Y = 1 / (1 + np.exp(-self.X))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_softmax_op.py b/python/paddle/v2/framework/tests/test_softmax_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..191b698c1cdec9b86b4ded6b1f743586867ca62f
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
@@ -0,0 +1,23 @@
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    shiftx = x - np.max(x)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+
+
+class TestSoftmaxOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "softmax"
+        self.X = np.random.random((32, 100)).astype("float32")
+        self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..1af39818a305215b45219b8c5f0a10630fd64279
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -0,0 +1,48 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy
+
+
+class TestScope(unittest.TestCase):
+    def test_int_tensor(self):
+        scope = core.Scope()
+        var = scope.new_var("test_tensor")
+        place = core.CPUPlace()
+
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_int(place)
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1
+        tensor_array[19, 11] = 2
+        tensor.set(tensor_array, place)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertEqual(1.0, tensor_array_2[3, 9])
+        self.assertEqual(2.0, tensor_array_2[19, 11])
+
+    def test_float_tensor(self):
+        scope = core.Scope()
+        var = scope.new_var("test_tensor")
+        place = core.CPUPlace()
+
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_float(place)
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1.0
+        tensor_array[19, 11] = 2.0
+        tensor.set(tensor_array, place)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
+        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 34b7308601390a4ccb0c19ef10d2c7a60b3fa576..4dcc3ab57e7e6dfbe040ac61025e55b9e48b4415 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -1,9 +1,7 @@
 import numpy
-import py_paddle.swig_paddle as api
 import collections
 import topology
 import minibatch
-from data_feeder import DataFeeder
 
 __all__ = ['infer', 'Inference']
 
@@ -28,6 +26,7 @@ class Inference(object):
     """
 
     def __init__(self, output_layer, parameters):
+        import py_paddle.swig_paddle as api
         topo = topology.Topology(output_layer)
         gm = api.GradientMachine.createFromConfigProto(
             topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
@@ -36,10 +35,18 @@ class Inference(object):
             name = param.getName()
             assert isinstance(val, api.Vector)
             val.copyFromNumpyArray(parameters.get(name).flatten())
+            # the setValueUpdated function is called in randomize, zeroMem,
+            # load function in paddle/parameter/Parameter.cpp. But in the
+            # inference mode, the setValueUpdated is never called, it will
+            # cause the parameter will not be dispatched
+            # in MultiGradientMachine for multi-GPU. So setValueUpdated is
+            # called here, but it's better to call this function in one place.
+            param.setValueUpdated()
         self.__gradient_machine__ = gm
         self.__data_types__ = topo.data_type()
 
     def iter_infer(self, input, feeding=None):
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         batch_size = len(input)
 
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 4ade1c6f329ae39769279963af6809f938807bdd..6a2bb8d337b7667aa2b1e3ef0815bb80f6e38d6a 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -324,6 +324,3 @@ def parse_network(output_layers, extra_layers=None):
 
 def get_layer(name):
     return config_base.__layer_map__.get(name)
-
-
-cp.begin_parse()
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index 70f9e43c9683033233d48a750668771a4c7ba045..fc718f031e2267e737adbc340226e145bf614bf2 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -1,8 +1,15 @@
 import ctypes
 import os
 
-path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
-lib = ctypes.cdll.LoadLibrary(path)
+__lib__ = None
+
+
+def get_c_lib():
+    global __lib__
+    if __lib__ is None:
+        path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
+        __lib__ = ctypes.cdll.LoadLibrary(path)
+    return __lib__
 
 
 class client(object):
@@ -10,29 +17,53 @@ class client(object):
     client is a client to the master server.
     """
 
-    def __init__(self, addr, buf_size):
-        self.c = lib.paddle_new_master_client(addr, buf_size)
+    def __init__(self, etcd_endpoints, timeout_sec, buf_size=0):
+        self.c = get_c_lib().paddle_new_etcd_master_client(
+            etcd_endpoints, timeout_sec, buf_size)
+
+    def request_save_model(self, trainer_id, block_ms):
+        """request to save model
+
+        Conventionally the 0-th trainer will save model. But in
+        distributed training, any trainer could be killed. This
+        function asks the master server if the trainer should proceed
+        with saving model.
+
+        :param trainer_id: trainer id.
+        :param block_ms: number of millisecond that other save model
+        will be blocked if this save model request succeeded.
 
-    def close(self):
-        lib.paddle_release_master_client(self.c)
+        Returns:
+            int: 1 if the save the model request is approved, 0 if
+            does the request is rejected because other trainer is
+            saving the model, -1 if error happened.
+
+        """
+        return get_c_lib().paddle_request_save_model(self.c, trainer_id,
+                                                     block_ms)
+
+    def release(self):
+        get_c_lib().paddle_release_master_client(self.c)
         self.c = None
 
     def set_dataset(self, paths):
         holder_type = ctypes.c_char_p * len(paths)
         holder = holder_type()
-        print paths
         for idx, path in enumerate(paths):
             c_ptr = ctypes.c_char_p(path)
             holder[idx] = c_ptr
-        lib.paddle_set_dataset(self.c, holder, len(paths))
+        get_c_lib().paddle_set_dataset(self.c, holder, len(paths))
 
-    # return format: (record, errno)
-    # errno =  0: ok
-    #       <  0: error
     def next_record(self):
+        """gets next record for training
+
+        Returns:
+            string: the record.
+            int: error code, 0 if successful, < 0 otherwise.
+        """
         p = ctypes.c_char_p()
         ret = ctypes.pointer(p)
-        size = lib.paddle_next_record(self.c, ret)
+        size = get_c_lib().paddle_next_record(self.c, ret)
         if size < 0:
             # Error
             return None, size
@@ -43,5 +74,8 @@ class client(object):
 
         record = ret.contents.value[:size]
         # Memory created from C should be freed.
-        lib.mem_free(ret.contents)
+        get_c_lib().mem_free(ret.contents)
         return record, 0
+
+    def paddle_start_get_records(self, pass_id):
+        get_c_lib().paddle_start_get_records(self.c, pass_id)
diff --git a/python/paddle/v2/model.py b/python/paddle/v2/model.py
new file mode 100644
index 0000000000000000000000000000000000000000..20c3282098785aaa5df86196c7c68f43d8c5d275
--- /dev/null
+++ b/python/paddle/v2/model.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import errno
+import uuid
+
+import paddle.v2.master
+
+__all__ = ["save_model", "load_model"]
+
+trainer_id = str(uuid.uuid4())
+
+
+def mkdir_p(path):
+    try:
+        os.makedirs(path)
+    except OSError as exc:
+        if exc.errno == errno.EEXIST and os.path.isdir(path):
+            pass
+        else:
+            raise
+
+
+def save_model(parameters, path):
+    need_request = "KUBERNETES_SERVICE_HOST" in os.environ.keys()
+
+    if need_request:
+        # TODO(helin): figure out how MPI trains, since MPI only save
+        # model when trainer_id == "0", we can consolidate the logic
+        # here.
+
+        # TODO(helin): change this environment variable name from
+        # MASTER_IP to ETCD_IP
+        etcd_name = "MASTER_IP"
+        if etcd_name not in os.environ.keys():
+            raise Exception('not find ' + etcd_name +
+                            ' in environment variable.')
+
+        etcd_ip = os.environ.get(etcd_name)
+        client = master.client("http://" + etcd_ip + ":2379", 5, 0)
+        r = client.request_save_model(trainer_id, 5000)
+        if r == 0:
+            # do not need to save
+            return
+        elif r < 0:
+            # error
+            return
+        else:
+            # save model
+            path = os.path.join(path, trainer_id)
+            path = os.path.join(path, "model.tar")
+
+    mkdir_p(path)
+
+    with open(path, 'wb') as f:
+        parameters.to_tar(f)
+
+
+def load_model(parameters, path):
+    with open(path, 'rb') as f:
+        parameters.from_tar(f)
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 8124e219ba499333ecdf4b34ff5352e281aaa016..ba581980334fec6226a537af2cf53b3465d32c1e 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -1,5 +1,3 @@
-import py_paddle.swig_paddle as swig_api
-
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
 """
@@ -18,6 +16,7 @@ __all__ = [
 
 class Optimizer(object):
     def __init__(self, **kwargs):
+        import py_paddle.swig_paddle as swig_api
         if 'batch_size' in kwargs:
             del kwargs['batch_size']  # not important for python library.
 
@@ -36,23 +35,27 @@ class Optimizer(object):
         For each optimizer(SGD, Adam), GradientMachine should enable different
         buffers.
         """
+        import py_paddle.swig_paddle as swig_api
         tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
         assert isinstance(tmp, swig_api.ParameterOptimizer)
         return tmp.getParameterTypes()
 
     def __create_local_updater__(self):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
 
     def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createRemoteUpdater(
             self.__opt_conf__, pass_num, use_sparse_updater)
 
-    def __create_new_remote_updater__(self, pserver_spec):
+    def __create_new_remote_updater__(self, pserver_spec, use_etcd):
+        import py_paddle.swig_paddle as swig_api
         return swig_api.ParameterUpdater.createNewRemoteUpdater(
-            self.__opt_conf__, pserver_spec)
+            self.__opt_conf__, pserver_spec, use_etcd)
 
     def create_updater(self, is_local, num_passes, use_sparse_updater,
-                       pserver_spec):
+                       pserver_spec, use_etcd):
         """
         create proper parameter_updater by configuration.
         :param is_local: create local or remote parameter updater
@@ -66,6 +69,8 @@ class Optimizer(object):
             if use_sparse_remote_updater:
                         gradient_machine.prefetch(in_args)
                         parameter_updater.getParametersRemote()
+
+        :param pserver_spec: pserver location, eg: localhost:3000
         :return: parameter_updater
         """
         if is_local:
@@ -76,7 +81,7 @@ class Optimizer(object):
                     num_passes, use_sparse_updater)
             else:
                 parameter_updater = self.__create_new_remote_updater__(
-                    pserver_spec)
+                    pserver_spec, use_etcd)
         return parameter_updater
 
 
@@ -266,6 +271,7 @@ ModelAverage = v1_optimizers.ModelAverage
 L2Regularization = v1_optimizers.L2Regularization
 
 if __name__ == '__main__':
+    import py_paddle.swig_paddle as swig_api
     swig_api.initPaddle('--use_gpu=false')
     for opt in [
             Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index bbaf8bfa979fbbf460561ebf7077b75b9c41a11a..a9cba8ca0b1efd4149463f6c7bf2dcdfbea350c9 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,5 +1,4 @@
 import numpy as np
-import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import paddle.trainer.config_parser as cp
 import struct
@@ -124,6 +123,7 @@ class Parameters(object):
         :return: parameter value
         :rtype: np.ndarray
         """
+        import py_paddle.swig_paddle as api
         shape = self.get_shape(key)
 
         if len(self.__gradient_machines__) == 0:
@@ -223,7 +223,7 @@ class Parameters(object):
         :type gradient_machine: api.GradientMachine
         :return:
         """
-
+        import py_paddle.swig_paddle as api
         if not isinstance(gradient_machine, api.GradientMachine):
             raise ValueError("gradient_machine should be api.GradientMachine")
 
@@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
     :return:
     :rtype: api.Parameter
     """
+    import py_paddle.swig_paddle as api
     param = __get_parameter_in_gradient_machine__(gradient_machine, name)
     vec = param.getBuf(api.PARAMETER_VALUE)
     assert isinstance(vec, api.Vector)
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
index 61b5cc134fba875955bdbfddc2bb1e083241940d..d0f18e4b6611fa56654e7f2a0144758339cb9e19 100644
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -12,11 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Creator package contains some simple reader creator, which could be used in user
-program.
+Creator package contains some simple reader creator, which could
+be used in user program.
 """
 
-__all__ = ['np_array', 'text_file', "recordio"]
+__all__ = ['np_array', 'text_file', "cloud_reader"]
 
 
 def np_array(x):
@@ -59,7 +59,7 @@ def text_file(path):
 
 def recordio_local(paths, buf_size=100):
     """
-    Creates a data reader from given RecordIO file paths separated by ",", 
+    Creates a data reader from given RecordIO file paths separated by ",",
         glob pattern is supported.
     :path: path of recordio files.
     :returns: data reader of recordio files.
@@ -81,35 +81,41 @@ def recordio_local(paths, buf_size=100):
     return dec.buffered(reader, buf_size)
 
 
-def recordio(paths, buf_size=100):
+pass_num = 0
+
+
+def cloud_reader(paths, etcd_endpoints, timeout_sec=5, buf_size=64):
     """
-    Creates a data reader that outputs record one one by one 
-        from given local or cloud recordio path.
+    Create a data reader that yield a record one by one from
+        the paths:
     :path: path of recordio files.
+    :etcd_endpoints: the endpoints for etcd cluster
     :returns: data reader of recordio files.
+
+    ..  code-block:: python
+        from paddle.v2.reader.creator import cloud_reader
+        etcd_endpoints = "http://127.0.0.1:2379"
+        trainer.train.(
+            reader=cloud_reader(["/work/dataset/uci_housing/uci_housing*"], etcd_endpoints),
+        )
     """
     import os
-    import paddle.v2.master.client as cloud
-
-    if "KUBERNETES_SERVICE_HOST" not in os.environ.keys():
-        return recordio_local(paths)
-
-    host_name = "MASTER_SERVICE_HOST"
-    if host_name not in os.environ.keys():
-        raise Exception('not find ' + host_name + ' in environ.')
-
-    addr = os.environ(host)
+    import cPickle as pickle
+    import paddle.v2.master as master
+    c = master.client(etcd_endpoints, timeout_sec, buf_size)
+    c.set_dataset(paths)
 
     def reader():
-        c = cloud(addr, buf_size)
-        c.set_dataset(paths)
+        global pass_num
+        c.paddle_start_get_records(pass_num)
+        pass_num += 1
 
         while True:
-            r, err = client.next_record()
-            if err < 0:
+            r, e = c.next_record()
+            if not r:
+                if e != -2:
+                    print "get record error: ", e
                 break
-            yield r
-
-        c.close()
+            yield pickle.loads(r)
 
     return reader
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
index b42d273ecfe6c4bc5706ec52617960b83496d70d..359f3eeefbe8efeb343cc875c707c9251a7087fb 100644
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -34,14 +34,5 @@ class TestTextFile(unittest.TestCase):
             self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
 
 
-class TestRecordIO(unittest.TestCase):
-    def test_recordio(self):
-        path = os.path.join(
-            os.path.dirname(__file__), "test_recordio_creator.dat")
-        reader = paddle.v2.reader.creator.recordio([path])
-        for idx, r in enumerate(reader()):
-            self.assertSequenceEqual(r, str(idx))
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index f9658a8c5df9562073c8a187074a6cb3459ac5d9..76bae0bb12b6c33f88530386f9cc19ae9b59f457 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,12 +2,6 @@
 Module Trainer
 """
 import collections
-import gzip
-import os
-
-import py_paddle.swig_paddle as api
-
-from data_feeder import DataFeeder
 from topology import Topology
 from . import event as v2_event
 from . import optimizer as v2_optimizer
@@ -41,6 +35,7 @@ class SGD(object):
     :type parameters: paddle.v2.parameters.Parameters
     :param extra_layers: Some layers in the neural network graph are not
                          in the path of cost layer.
+    :param pserver_spec: pserver location, eg: localhost:3000
     :type extra_layers: paddle.v2.config_base.Layer
     """
 
@@ -50,7 +45,8 @@ class SGD(object):
                  update_equation,
                  extra_layers=None,
                  is_local=True,
-                 pserver_spec=None):
+                 pserver_spec=None,
+                 use_etcd=True):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -58,6 +54,7 @@ class SGD(object):
         if not isinstance(update_equation, v2_optimizer.Optimizer):
             raise TypeError("update equation parameter must be "
                             "paddle.v2.optimizer.Optimizer")
+        import py_paddle.swig_paddle as api
         topology = Topology(cost, extra_layers=extra_layers)
         self.__optimizer__ = update_equation
         self.__topology__ = topology
@@ -65,6 +62,7 @@ class SGD(object):
         self.__topology_in_proto__ = topology.proto()
         self.__is_local__ = is_local
         self.__pserver_spec__ = pserver_spec
+        self.__use_etcd__ = use_etcd
 
         self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
         # # In local mode, disable sparse_remote_update.
@@ -123,13 +121,15 @@ class SGD(object):
         :type feeding: dict|list
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         if event_handler is None:
             event_handler = default_event_handler
         __check_train_args__(**locals())
 
         self.__parameter_updater__ = self.__optimizer__.create_updater(
             self.__is_local__, num_passes, self.__use_sparse_updater__,
-            self.__pserver_spec__)
+            self.__pserver_spec__, self.__use_etcd__)
         self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
@@ -186,6 +186,8 @@ class SGD(object):
         :type feeding: dict
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)
diff --git a/python/setup.py.in b/python/setup.py.in
index a422b3832f4c9c60bc5406277f9ada7032f85f51..65a26940d4d703ea4fbb5022523a90716982ec10 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -19,7 +19,9 @@ setup_requires=["requests",
                 "recordio",
                 "matplotlib",
                 "rarfile",
-                "scipy>=0.19.0"]
+                "scipy>=0.19.0",
+                "Pillow",
+                "nltk"]
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=["opencv-python"]
@@ -29,7 +31,9 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
-      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'],
+            'paddle.v2.framework': ['core.so']
+      },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.framework.proto will be generated while compiling.