diff --git a/.gitignore b/.gitignore
index 275173b9677bffe028152fe8eadb3384329aeb5a..c84b2fc8c79d6e2c9c83e2b830ab176295846fd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,9 @@ third_party/
 # clion workspace.
 cmake-build-*
+
+# generated while compiling
+python/paddle/v2/framework/core.so
+CMakeFiles
+cmake_install.cmake
+
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 61b989dc698798eca932516e558c63f107ef2754..efb4dcb2dfbc63bb6905961b054cdef860cf4573 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,10 +21,10 @@
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
     hooks:
     -   id: clang-formater
--   repo: https://github.com/dnephin/pre-commit-golang
-    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
+    sha: 16398aeccf263adaf53b2495eed0406347d76281
     hooks:
     -   id: go-fmt
-        files: (.*\.go)
-    -   id: go-lint
-        files: (.*\.go)
+        types: [go]
+    -   id: gometalinter
+        types: [go]
diff --git a/.travis.yml b/.travis.yml
index 498674469b27f585af798b95f30b74ebed99e32c..376c693602b56fe719decfeb41c217497e143e12 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ cache:
     - $HOME/.ccache
     - $HOME/.cache/pip
     - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
@@ -11,6 +12,7 @@ os:
 env:
   - JOB=build_doc
   - JOB=check_style
+  - JOB=build_android
 addons:
   apt:
     packages:
@@ -39,6 +41,8 @@ before_install:
   - pip install rarfile
   - curl https://glide.sh/get | bash
   - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
   - |
     function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index fdc62b31511c424b2944d05be46d029a6d4bfc8b..9a852248432e51012274273d9f01e0eef90ef41f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -28,7 +28,9 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-find_package(Boost QUIET)
+if(NOT ANDROID)
+    find_package(Boost QUIET)
+endif()
 
 include(simd)
 
@@ -97,6 +99,7 @@ include(external/swig)    # download, build, install swig
 include(external/warpctc) # download, build, install warpctc
 include(external/any)     # download libn::any
 include(external/eigen)   # download eigen3
+include(external/pybind11)# download pybind11
 
 include(cudnn)            # set cudnn libraries, must before configure
 include(configure)        # add paddle env configuration
@@ -134,11 +137,16 @@ if(WITH_GPU)
 endif(WITH_GPU)
 
 if(USE_NNPACK)
-    list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 
 add_subdirectory(proto)
+
+# "add_subdirectory(go)" should be placed after the following line,
+# because it depends on paddle/optimizer.
+add_subdirectory(paddle/optimizer)
+
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
 if(WITH_GOLANG)
@@ -146,7 +154,9 @@ endif(WITH_GOLANG)
 
 add_subdirectory(paddle)
-add_subdirectory(python)
+if(WITH_PYTHON)
+  add_subdirectory(python)
+endif()
 if(WITH_DOC)
   add_subdirectory(doc)
 endif()
diff --git a/Dockerfile b/Dockerfile
index ed5910d93b41dba8d50b2ba01c59c635797edd29..8cfb16928c95dcbfac08383d32562ff67933d873 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
     apt-get install -y \
     git python-pip python-dev openssh-server bison \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
     curl sed grep graphviz libjpeg-dev zlib1g-dev \
     python-numpy python-matplotlib gcc g++ \
     automake locales clang-format-3.8 swig doxygen cmake \
diff --git a/Dockerfile.android b/Dockerfile.android
index fa24f6f06c4e76444c83bcf13fe312afdcb6c348..c0fa58c384f9ebcae60477ffce49ea4ffa929db9 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -14,6 +14,17 @@ RUN apt-get update && \
     wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
     apt-get clean -y
 
+# Install Go and glide
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# Must not be on the same line as the GOROOT definition; otherwise docker build cannot find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
 # git credential to skip password typing
 RUN git config --global credential.helper store
 
diff --git a/README.md b/README.md
index fa16cc3cf2ef9c1200a19e03192c94c65fc08679..2a6beeb342b34f8e91ef509d7d41f286a666480c 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -61,35 +61,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org/develop/doc/) and
-[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
+[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
 
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
 
   You might want to start from the this online interactive book that can run in Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
 
   You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
 
   Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
 
   We appreciate your contributions!
 
+
 ## Ask Questions
 
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index a4f98ec7d4af652d0dd0650f4906696ff3a4efb9..7afab5d5344b704a9329e313a81379032ba0cc97 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -102,12 +102,19 @@ if(WITH_GOLANG)
     message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
   endif()
 
-  add_custom_target(go_vendor)
-  add_custom_command(TARGET go_vendor
+  # This command only runs when the file it depends on is missing
+  # or has changed, or when the output is missing.
+  add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
     COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+    COMMAND touch ${CMAKE_BINARY_DIR}/glide
+    DEPENDS ${PROJ_ROOT}/go/glide.lock
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
-    )
-  add_dependencies(go_vendor go_path)
+  )
+
+  # go_vendor depends on the custom command which outputs
+  # ${CMAKE_BINARY_DIR}/glide, so the custom command does not need to
+  # run every time this target is built.
+  add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
   endif()
 endif(WITH_GOLANG)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 6bbcd730e1b5ac49415cac676352e6df00eb6eb5..656e1a0803c6e389d70f37f592c3aa2e95a2bcd4 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -27,7 +27,8 @@ set(IGNORE_PATTERN
     .*cblas\\.h.*
     .*\\.pb\\.txt
     .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*MultiDataProvider.*
+    .*pb.*)
 
 # add_style_check_target
 #
@@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
         endif()
       endforeach()
       if(LINT MATCHES ON)
+        # cpplint code style
        get_filename_component(base_filename ${filename} NAME)
         set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-        add_custom_command(OUTPUT ${CUR_GEN}
-                           PRE_BUILD
-                           COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                   "--filter=${STYLE_FILTER}"
-                                   "--write-success=${CUR_GEN}" ${filename}
-                           DEPENDS ${filename}
+        add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                           COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                                   "--filter=${STYLE_FILTER}"
+                                   "--write-success=${CUR_GEN}" ${filename}
                            WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
       endif()
     endforeach()
diff --git a/cmake/cross_compiling/android.cmake b/cmake/cross_compiling/android.cmake
index 9724c16122ab2e6be55864c8716698c9b9d7c3f0..5e3e437a8da9624df35a5c754fe77be73f20361d 100644
--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -106,6 +106,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
             SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
         ENDIF()
     ENDIF()
+    IF(ANDROID_ABI STREQUAL "arm64-v8a")
+        SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+        SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+    ENDIF()
     SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
 ENDIF()
 
@@ -162,6 +166,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
         ENDIF()
     ENDIF()
 
+    IF(ANDROID_ABI STREQUAL "arm64-v8a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+    ENDIF()
+
     STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
     STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")
 
@@ -186,6 +194,10 @@ ELSE()
         SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
     ENDIF()
     SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
-    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index bd401faa6eb8a583bce542db68852f8571681daf..8a594a825abdca6a0f989b94fa42f97d6df5e10a 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -52,6 +52,7 @@ ExternalProject_Add(
 
 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
-ADD_DEPENDENCIES(glog extern_glog)
+ADD_DEPENDENCIES(glog extern_glog gflags)
+LINK_LIBRARIES(glog gflags)
 
 LIST(APPEND external_project_dependencies glog)
diff --git a/paddle/function/nnpack/nnpack.cmake b/cmake/external/nnpack.cmake
similarity index 54%
rename from paddle/function/nnpack/nnpack.cmake
rename to cmake/external/nnpack.cmake
index 7182730ae8f133bdc4f73bfc46fa8acbe5f3b603..d42bcb0f329041462bd8b568052fbb8226d18e4e 100644
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/cmake/external/nnpack.cmake
@@ -7,10 +7,24 @@
 set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
 
 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
   set(NNPACK_FOUND ON)
   INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
   message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 5b9d9844ed21ceb507a8e01676c3533f4e3dd8fb..60a1041936437775e0994157b8ffcb7c52b7ab87 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND})
         # arm_soft_fp_abi branch of OpenBLAS to support softfp
         # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
         SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+        IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+            SET(TARGET "ARMV7")
+        ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(TARGET "ARMV8")
+        ENDIF()
+        SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
     ELSEIF(RPI)
         # use hardfp
         SET(OPENBLAS_COMMIT "v0.2.19")
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9391c285c7544669a5b1a078b7473d7a656c1bb4
--- /dev/null
+++ b/cmake/external/pybind11.cmake
@@ -0,0 +1,30 @@
+INCLUDE(ExternalProject)
+
+SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+
+ExternalProject_Add(
+    extern_pybind
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
+    GIT_TAG         "v2.1.1"
+    PREFIX          ${PYBIND_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND   ""
+    INSTALL_COMMAND ""
+    TEST_COMMAND    ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(pybind STATIC ${dummyfile})
+else()
+    add_library(pybind INTERFACE)
+endif()
+
+add_dependencies(pybind extern_pybind)
+
+LIST(APPEND external_project_dependencies pybind)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 6546b2c83bc8f81f89e4018a2216f191bbeb0d21..67a359d4b5f4cca8fc8e74eab4d4acb4cc12baed 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,6 +18,9 @@ INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
 IF(WITH_PYTHON)
     FIND_PACKAGE(PythonLibs 2.7)
+    # FIXME: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 ENDIF(WITH_PYTHON)
 
 SET(py_env "")
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7a996dea92b13bdac054a987a004a3d54ff02da2..c31e62fc08b531a38a851b71a033e14277eff015 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -109,7 +109,9 @@ set(COMMON_FLAGS
     -Wno-unused-function
     -Wno-error=literal-suffix
     -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=parentheses-equality # Warnings in Pybind11
+)
 
 set(GPU_COMMON_FLAGS
     -fPIC
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 725cf28037182bfd0a27d49491bc8a479a9b4440..e42e75c12ab1e5133f5ecbdb90ef26e3f8df5133 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -90,10 +90,11 @@
 # including binary directory for generated headers.
 include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
-if(NOT APPLE)
+if(NOT APPLE AND NOT ANDROID)
   find_package(Threads REQUIRED)
   link_libraries(${CMAKE_THREAD_LIBS_INIT})
-endif(NOT APPLE)
+  set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+endif(NOT APPLE AND NOT ANDROID)
 
 function(merge_static_libs TARGET_NAME)
   set(libs ${ARGN})
@@ -103,6 +104,7 @@ function(merge_static_libs TARGET_NAME)
   foreach(lib ${libs})
     list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
   endforeach()
+  list(REMOVE_DUPLICATES libs_deps)
 
   if(APPLE) # Use OSX's libtool to merge archives
     # To produce a library we need at least one source file.
@@ -126,7 +128,7 @@ function(merge_static_libs TARGET_NAME)
       # Get the file names of the libraries to be merged
      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
     endforeach()
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD 
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
       COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
       COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
   else() # general UNIX: use "ar" to extract objects and re-add to a common lib
@@ -144,11 +146,11 @@ function(merge_static_libs TARGET_NAME)
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-    # Empty dummy source file that goes into merged library
-    set(mergebase ${lib}.mergebase.c)
-    add_custom_command(OUTPUT ${mergebase}
-      COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-      DEPENDS ${objlistfile})
+      # Empty dummy source file that goes into merged library
+      set(mergebase ${lib}.mergebase.c)
+      add_custom_command(OUTPUT ${mergebase}
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+        DEPENDS ${objlistfile})
 
       list(APPEND mergebases "${mergebase}")
   endforeach()
@@ -183,6 +185,10 @@ function(cc_library TARGET_NAME)
       add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
       target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
     endif()
+
+    # cpplint code style
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+
   else(cc_library_SRCS)
     if (cc_library_DEPS)
       merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -300,7 +306,7 @@ function(go_library TARGET_NAME)
   file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
   string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
-  # FIXME: link path
+
   add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
     COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
     # Golang build source code
@@ -308,7 +314,7 @@ function(go_library TARGET_NAME)
     -o "${${TARGET_NAME}_LIB_PATH}"
     "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
     # must run under GOPATH
"${PADDLE_IN_GOPATH}/go") add_dependencies(${TARGET_NAME} go_vendor) endfunction(go_library) @@ -319,14 +325,11 @@ function(go_binary TARGET_NAME) cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) - # FIXME: link path add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} - GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}" WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") - # TODO: don't know what ${TARGET_NAME}_link does add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS}) install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin) endfunction(go_binary) @@ -334,15 +337,18 @@ endfunction(go_binary) function(go_test TARGET_NAME) set(options OPTIONAL) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs DEPS) cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - add_custom_command(OUTPUT ${TARGET_NAME}_timestamp - COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test + string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS}) + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}" - ${go_test_SRCS} + ".${CMAKE_CURRENT_SOURCE_REL_DIR}" + WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go") + add_test(NAME ${TARGET_NAME} + COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS}) - add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}) endfunction(go_test) function(proto_library TARGET_NAME) diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index 4f4a9187bcbe8ef902e923622552909808b121d6..daee55b7f9adfffdf709ed2b5b0d957c7ca1aea4 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -474,6 +474,11 @@ prelu .. autoclass:: paddle.v2.layer.prelu :noindex: +gated_unit +----------- +.. autoclass:: paddle.v2.layer.gated_unit + :noindex: + Detection output Layer ====================== diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst index 9489a921c70ad6ee5709f46445554f5d9640162c..75037e693b32f923ee7dc9dfec322495fe4ce10a 100644 --- a/doc/howto/dev/new_layer_cn.rst +++ b/doc/howto/dev/new_layer_cn.rst @@ -37,7 +37,7 @@ \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x} -假设 :math:`z = f(W^T x + b)` ,那么 +假设 :math:`z = W^T x + b` ,那么 .. math:: diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644 --- a/doc/howto/dev/new_layer_en.rst +++ b/doc/howto/dev/new_layer_en.rst @@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu. 
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It
 
 where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu.
 
-The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. 
+The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
 
 Suppose our loss function is :math:`c(y)`, then
 
@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then
 
     \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then
 
 .. math::
 
@@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.
 
 Then, for fully connected layer, we need to compute:
 
 .. math::
-  
+
   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
 
 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
@@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
                           /* weight */ true);
     }
 }
-  
+
 If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
 
 .. code-block:: bash
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index d536f53abc031e9d279ace0e231a381a2f1e81b6..36e5d420c986fc8d88eefee4aa221dba0a0480f2 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使
 
     python -c "import py_paddle"
 
-如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 `_ 。
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 `_ 。
 注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
 
 如果提示正确，可以执行以下命令编译生成文档，即
@@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程
 如何更新www.paddlepaddle.org文档
 ================================
 
-开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 `_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 `_ 和
-`英文文档 `_ 。
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 `_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 `_ 和
+`英文文档 `_ 。
diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt
index f00c70a0589a4f41a23164a95d505d4310d9157b..29ce909c6442014fa0b64c6ca018a61b92c840e9 100644
--- a/go/CMakeLists.txt
+++ b/go/CMakeLists.txt
@@ -17,3 +17,7 @@ add_subdirectory(pserver/client/c)
 add_subdirectory(cmd/pserver)
 add_subdirectory(cmd/master)
 add_subdirectory(master/c)
+add_subdirectory(master)
+add_subdirectory(pserver)
+add_subdirectory(pserver/client)
+add_subdirectory(utils/networkhelper)
diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt
index 1058ffa86b3f00b5e9525edca39a843da9b62db8..9e149967e71c9439bb00b973aa8723a809604aaf 100644
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-go_binary(master SRC master.go DEPS paddle_go_optimizer)
+go_binary(master SRC master.go)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 54fa254863156455f66fa87de9077042a45f9735..9eaf8c04ae01fe7eebc92c51803bfcf977995ee3 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -11,6 +11,7 @@ import (
 
     "github.com/namsral/flag"
     log "github.com/sirupsen/logrus"
+    "github.com/topicai/candy"
 
     "github.com/PaddlePaddle/Paddle/go/master"
     "github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +21,18 @@ func main() {
     port := flag.Int("port", 8080, "port of the master server.")
     ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
     endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-    taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
-    taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
-    chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+    taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timeout duration.")
+    taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timeout count for each task before it is declared a failed task.")
+    chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+    logLevel := flag.String("log-level", "info",
+        "log level, possible values: debug, info, warning, error, fatal, panic")
     flag.Parse()
 
+    level, e := log.ParseLevel(*logLevel)
+    candy.Must(e)
+
+    log.SetLevel(level)
+
     if *endpoints == "" {
         log.Warningln("-endpoints not set, fault tolerance not be enabled.")
     }
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index 0ecb1242c3c3d5246125c9ce946001ccf9cbec24..652d7ba315d72ff19931b82a4b0d1c30b2ff8f37 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -8,6 +8,7 @@ import (
     "time"
 
     "github.com/namsral/flag"
+    "github.com/topicai/candy"
 
     "github.com/PaddlePaddle/Paddle/go/pserver"
     log "github.com/sirupsen/logrus"
@@ -18,53 +19,47 @@ func main() {
     index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
     etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd")
-    etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
+    etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls")
     numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
     checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
-    checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
+    checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
     logLevel := flag.String("log-level", "info",
         "log level, possible values: debug, info, warning, error, fatal, panic")
     flag.Parse()
 
     level, err := log.ParseLevel(*logLevel)
-    if err != nil {
-        panic(err)
-    }
+    candy.Must(err)
+
     log.SetLevel(level)
 
     var idx int
+    var cp pserver.Checkpoint
     var e *pserver.EtcdClient
     if *index >= 0 {
         idx = *index
     } else {
-        timeout := time.Second * time.Duration((*etcdTimeout))
-        e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
-        idx, err = e.Register()
+        e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
+        idx, err = e.Register(*port)
+        candy.Must(err)
+
+        cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
         if err != nil {
-            panic(err)
+            log.Errorf("Fetch checkpoint failed, %s", err)
         }
     }
 
     s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
-    if err != nil {
-        panic(err)
-    }
+    candy.Must(err)
+
     err = rpc.Register(s)
-    if err != nil {
-        panic(err)
-    }
+    candy.Must(err)
 
     rpc.HandleHTTP()
     l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-    if err != nil {
-        panic(err)
-    }
+    candy.Must(err)
 
     log.Infof("start pserver at port %d", *port)
     err = http.Serve(l, nil)
-
-    if err != nil {
-        panic(err)
-    }
+    candy.Must(err)
 }
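candy.Must, used throughout both binaries above, simply panics on a non-nil error during process startup. A sketch of the pattern for reference (my own stand-in; see github.com/topicai/candy for the real helper):

```go
package main

import "errors"

// must mirrors how candy.Must is used in master.go and pserver.go:
// abort startup immediately on any non-nil error.
func must(err error) {
	if err != nil {
		panic(err)
	}
}

func main() {
	must(nil)                    // no-op on success
	defer func() { recover() }() // swallow the demo panic
	must(errors.New("boom"))     // panics
}
```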
diff --git a/go/glide.lock b/go/glide.lock
index 190a222338b24b7edac76c72d07df0b2cbd7d9be..f71ae643d68d29846611ec52d0ae7d67e4ced850 100644
--- a/go/glide.lock
+++ b/go/glide.lock
@@ -1,8 +1,8 @@
-hash: b8f18ce6784bd3fadd9fed0b8443e7b658234ea785ae1f220723ae2c1f652aa7
-updated: 2017-06-27T14:05:48.925262819+08:00
+hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855
+updated: 2017-07-11T10:04:40.786745417+08:00
 imports:
 - name: github.com/coreos/etcd
-  version: 61fc123e7a8b14a0a258aa3f5c4159861b1ec2e7
+  version: cb2a496c4ddd1c87a9f280e116649b599999ec79
   subpackages:
   - auth/authpb
   - clientv3
@@ -22,7 +22,9 @@ imports:
 - name: github.com/PaddlePaddle/recordio
   version: edfb82af0739c84f241c87390ec5649c7b28c129
 - name: github.com/sirupsen/logrus
-  version: 202f25545ea4cf9b191ff7f846df5d87c9382c2b
+  version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
 - name: golang.org/x/net
   version: c8c74377599bd978aee1cf3b9b63a8634051cec2
   subpackages:
@@ -34,11 +36,11 @@ imports:
   - lex/httplex
   - trace
 - name: golang.org/x/sys
-  version: f7928cfef4d09d1b080aa2b6fd3ca9ba1567c733
+  version: abf9c25f54453410d0c6668e519582a9e1115027
   subpackages:
   - unix
 - name: golang.org/x/text
-  version: 4e9ab9ee170f2a39bd66c92b3e0a47ff47a4bc77
+  version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa
   subpackages:
   - secure/bidirule
   - transform
diff --git a/go/glide.yaml b/go/glide.yaml
index 05c5d15ca22b6a3d85bee8e1f31d222034ce5314..ab472c7cda9755d0399bb8376b16589be8b53057 100644
--- a/go/glide.yaml
+++ b/go/glide.yaml
@@ -10,3 +10,4 @@ import:
     version: ^1.7.4-pre
 - package: github.com/sirupsen/logrus
   version: ^1.0.0
+- package: github.com/topicai/candy
diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30531e6469297be1624ea590ea71b1c996b58ed4
--- /dev/null
+++ b/go/master/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(master_test)
+endif()
diff --git a/go/master/c/client.go b/go/master/c/client.go
index 31f431197454c2ec6a25624d37b60876d99f0087..2cbe164c7b406b189f44ec850796203f24779205 100644
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -23,7 +23,6 @@ import (
     log "github.com/sirupsen/logrus"
 )
 
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -114,13 +113,13 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
     if err != nil {
         // Error
         // TODO: return the type of error?
-        *record = (*C.uchar)(nullPtr)
+        *record = (*C.uchar)(nil)
         return -1
     }
 
     if len(r) == 0 {
         // Empty record
-        *record = (*C.uchar)(nullPtr)
+        *record = (*C.uchar)(nil)
         return 0
     }
 
diff --git a/go/master/client.go b/go/master/client.go
index 05383f1bf40c0e2b1f974e802ee8fd6aac109b00..90b99470978d21480eb2d8097e7dec217b9524eb 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,6 +2,7 @@ package master
 
 import (
     "os"
+    "time"
 
     "github.com/PaddlePaddle/Paddle/go/connection"
     "github.com/PaddlePaddle/recordio"
@@ -36,9 +37,9 @@ func (c *Client) getRecords() {
     for {
         t, err := c.getTask()
         if err != nil {
-            // TODO(helin): wait before move on with next
             // getTask call.
-            log.Errorln(err)
+            log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
+            time.Sleep(3 * time.Second)
             continue
         }
 
@@ -68,7 +69,10 @@ func (c *Client) getRecords() {
         // We treat a task as finished whenever the last data
         // instance of the task is read. This is not exactly
         // correct, but a reasonable approximation.
-        c.taskFinished(t.ID)
+        err = c.taskFinished(t.Meta.ID)
+        if err != nil {
+            log.Errorln(err)
+        }
     }
 }
 
@@ -118,6 +122,11 @@ func (c *Client) taskFinished(taskID int) error {
     return c.conn.Call("Service.TaskFinished", taskID, nil)
 }
 
+// taskFailed tells the master server that the task failed.
+func (c *Client) taskFailed(meta TaskMeta) error {
+    return c.conn.Call("Service.TaskFailed", meta, nil)
+}
+
 // NextRecord returns next record in the dataset.
 //
 // NextRecord will block until the next record is available. It is
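getRecords now sleeps a fixed 3 seconds before retrying getTask. A standalone sketch of the same retry idea with exponential backoff, in case the fixed delay ever needs tuning (hypothetical helper, not part of the diff):

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// getTaskWithBackoff wraps a getTask-style call: on failure it sleeps
// and retries, doubling the delay up to a cap, instead of the fixed
// 3-second sleep used in getRecords.
func getTaskWithBackoff(getTask func() (int, error)) int {
	delay := time.Second
	const maxDelay = 30 * time.Second
	for {
		t, err := getTask()
		if err == nil {
			return t
		}
		fmt.Printf("get task failed, retrying in %v: %v\n", delay, err)
		time.Sleep(delay)
		if delay *= 2; delay > maxDelay {
			delay = maxDelay
		}
	}
}

func main() {
	calls := 0
	task := getTaskWithBackoff(func() (int, error) {
		if calls++; calls < 3 {
			return 0, errors.New("master not ready")
		}
		return 42, nil
	})
	fmt.Println("got task", task)
}
```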
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
index 364dce7b58cf6366af711bde9107559a762563a4..70dc09bf9461142ff6498355a5858ba9a1320510 100644
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) {
 
     for i := 0; i < totalTask*chunkPerTask; i++ {
         w := recordio.NewWriter(f, -1, -1)
-        w.Write(nil)
+        _, err = w.Write(nil)
+        if err != nil {
+            panic(err)
+        }
+
         // call Close to force RecordIO writing a chunk.
-        w.Close()
+        err = w.Close()
+        if err != nil {
+            panic(err)
+        }
+    }
+
+    err = f.Close()
+    if err != nil {
+        panic(err)
     }
-    f.Close()
 
     // Manually intialize client to avoid calling c.getRecords()
     c := &Client{}
@@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) {
     ch := make(chan string, 1)
     ch <- addr
     go c.monitorMaster(ch)
-    c.SetDataset([]string{path})
+    err = c.SetDataset([]string{path})
+    if err != nil {
+        panic(err)
+    }
+
     checkOnePass := func(i int) {
         var tasks []Task
         for idx := 0; idx < totalTask; idx++ {
@@ -95,10 +109,16 @@ func TestGetFinishTask(t *testing.T) {
             t.Fatalf("Should get error, pass: %d\n", i)
         }
 
-        err = c.taskFinished(tasks[0].ID)
+        err = c.taskFinished(tasks[0].Meta.ID)
         if err != nil {
             t.Fatalf("Error: %v, pass: %d\n", err, i)
         }
+
+        err = c.taskFailed(tasks[0].Meta)
+        if err != nil {
+            t.Fatalf("Error: %v, pass: %d\n", err, i)
+        }
+
         tasks = tasks[1:]
         task, err := c.getTask()
         if err != nil {
@@ -107,7 +127,7 @@ func TestGetFinishTask(t *testing.T) {
         tasks = append(tasks, task)
 
         for _, task := range tasks {
-            err = c.taskFinished(task.ID)
+            err = c.taskFinished(task.Meta.ID)
             if err != nil {
                 t.Fatalf("Error: %v, pass: %d\n", err, i)
             }
diff --git a/go/master/client_test.go b/go/master/client_test.go
index 6666d3860c412daa8711fbfa2d729a261b3fd887..bc92dc5ac973d62434b71e09705143ac8fbbd2fa 100644
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -57,14 +57,30 @@ func TestNextRecord(t *testing.T) {
 
     w := recordio.NewWriter(f, -1, -1)
     for i := 0; i < total; i++ {
-        w.Write([]byte{byte(i)})
+        _, err = w.Write([]byte{byte(i)})
+        if err != nil {
+            panic(err)
+        }
+    }
+
+    err = w.Close()
+    if err != nil {
+        panic(err)
+    }
+
+    err = f.Close()
+    if err != nil {
+        panic(err)
     }
-    w.Close()
-    f.Close()
+
     curAddr := make(chan string, 1)
     curAddr <- fmt.Sprintf(":%d", p)
     c := master.NewClient(curAddr, 10)
-    c.SetDataset([]string{path})
+
+    err = c.SetDataset([]string{path})
+    if err != nil {
+        panic(err)
+    }
+
     for pass := 0; pass < 50; pass++ {
         received := make(map[byte]bool)
         for i := 0; i < total; i++ {
diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go
index 04c1394e963d1eb541b80b91407fb55b0d1e1f2a..69dc6a8268748ad9a72eb10fc2789982f565d291 100644
--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -30,7 +30,7 @@ type EtcdClient struct {
 
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
     log.Debugf("Connecting to etcd at %v", endpoints)
-    // TODO(helin): gracefully shutdown etcd store. Becuase etcd
+    // TODO(helin): gracefully shutdown etcd store. Because etcd
     // store holds a etcd lock, even though the lock will expire
     // when the lease timeout, we need to implement graceful
     // shutdown to release the lock.
@@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
     }
     log.Debugf("Successfully acquired lock at %s.", lockPath)
 
-    put := clientv3.OpPut(addrPath, string(addr))
+    put := clientv3.OpPut(addrPath, addr)
     resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
     if err != nil {
         return nil, err
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
index bcd549b20e46381783bad11caa08cb7f4ba40add..57e75dc4e01b4bafa8153bcc7fbc82a9eb2b08f5 100644
--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -4,7 +4,7 @@ import "sync"
 
 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
     mu  sync.Mutex
     buf []byte
diff --git a/go/master/service.go b/go/master/service.go
index 58e68e744859933aa74cac231356d4ff9dfb8d7b..262735f421ad7ae04050e9264a177ee4c46e68d0 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -31,30 +31,36 @@ type Chunk struct {
     Index recordio.Index // chunk index
 }
 
+// TaskMeta is a struct which stores task's meta info.
+type TaskMeta struct {
+    ID    int
+    Epoch int
+}
+
 // Task is the basic unit of data instances assigned to trainers.
 type Task struct {
-    ID     int
+    Meta   TaskMeta
     Chunks []Chunk
 }
 
 type taskEntry struct {
-    Epoch      int
-    NumTimeout int
-    Task       Task
+    Task Task
+    // A task fails if it times out or the trainer reports that it exited abnormally.
+    NumFailure int
 }
 
 type taskQueues struct {
     Todo    []taskEntry
     Pending map[int]taskEntry // map from task ID to task entry
     Done    []taskEntry
-    Failed  []Task
+    Failed  []taskEntry
 }
 
 // Service is the master server service.
 type Service struct {
     chunksPerTask int
     timeoutDur    time.Duration
-    timeoutMax    int
+    failureMax    int
     ready         chan struct{}
     store         Store
@@ -73,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
     var cur taskEntry
     for i, c := range chunks {
         if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-            cur.Task.ID = id
+            cur.Task.Meta.ID = id
             id++
             result = append(result, cur)
             cur.Task.Chunks = nil
@@ -83,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
     }
 
     if len(cur.Task.Chunks) > 0 {
-        cur.Task.ID = id
+        cur.Task.Meta.ID = id
         result = append(result, cur)
     }
 
@@ -91,11 +97,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 
 // NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
     s := &Service{}
     s.chunksPerTask = chunksPerTask
     s.timeoutDur = timeoutDur
-    s.timeoutMax = timeoutMax
+    s.failureMax = failureMax
     s.taskQueues = taskQueues{}
     s.taskQueues.Pending = make(map[int]taskEntry)
     s.ready = make(chan struct{})
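The queues above drive the task lifecycle: Todo → Pending on dispatch, Pending → Done on success, and Pending → back to Todo or into Failed once NumFailure exceeds failureMax (see processFailedTask further down). A toy standalone sketch of that flow (simplified types, not the actual service):

```go
package main

import "fmt"

type entry struct{ id, numFailure int }

type queues struct {
	todo    []entry
	pending map[int]entry
	done    []entry
	failed  []entry
}

const failureMax = 3

// dispatch moves the head of todo into pending, as GetTask does.
func (q *queues) dispatch() entry {
	e := q.todo[0]
	q.todo = q.todo[1:]
	q.pending[e.id] = e
	return e
}

// fail mirrors processFailedTask: re-queue until failureMax is exceeded.
func (q *queues) fail(id int) {
	e := q.pending[id]
	delete(q.pending, id)
	e.numFailure++
	if e.numFailure > failureMax {
		q.failed = append(q.failed, e)
		return
	}
	q.todo = append(q.todo, e)
}

func main() {
	q := &queues{todo: []entry{{id: 0}}, pending: map[int]entry{}}
	for attempt := 1; len(q.todo) > 0; attempt++ {
		e := q.dispatch()
		q.fail(e.id)
		fmt.Printf("attempt %d: todo=%d failed=%d\n", attempt, len(q.todo), len(q.failed))
	}
}
```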
@@ -154,7 +160,7 @@ func (s *Service) recover() (bool, error) {
 
 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-    // TOOD(helin): etcd request has a size limit, so the snapshot
+    // TODO(helin): etcd request has a size limit, so the snapshot
     // size is limited by the max request size. We should either
     // divide the snapshot into smaller chunks and save under
     // different keys, or configure the request size to be big
@@ -209,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
         }
 
         count := index.NumChunks()
+        log.Infof("readChunks: file %s has %d chunks", path, count)
         for i := 0; i < count; i++ {
             chunk := Chunk{
                 Path:  path,
@@ -257,6 +264,33 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
     return nil
 }
 
+func (s *Service) processFailedTask(t taskEntry, epoch int) {
+    if t.Task.Meta.Epoch != epoch {
+        // new epoch, task launched after the
+        // schedule of this timeout check or failed status report.
+        return
+    }
+
+    defer func() {
+        err := s.snapshot()
+        if err != nil {
+            log.Errorln(err)
+        }
+    }()
+
+    delete(s.taskQueues.Pending, t.Task.Meta.ID)
+
+    t.NumFailure++
+    if t.NumFailure > s.failureMax {
+        log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+        s.taskQueues.Failed = append(s.taskQueues.Failed, t)
+        return
+    }
+
+    log.Warningf("Task %v failed %d times, retry.", t.Task, t.NumFailure)
+    s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+}
+
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
     return func() {
         s.mu.Lock()
@@ -267,30 +301,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
             return
         }
 
-        if t.Epoch != epoch {
-            // new epoch, task launched after the
-            // schedule of this timeout check.
-            return
-        }
-
-        defer func() {
-            err := s.snapshot()
-            if err != nil {
-                log.Errorln(err)
-            }
-        }()
-
-        delete(s.taskQueues.Pending, t.Task.ID)
-
-        t.NumTimeout++
-        if t.NumTimeout > s.timeoutMax {
-            log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
-            s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-            return
-        }
-
-        log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
-        s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+        s.processFailedTask(t, epoch)
     }
 }
 
@@ -339,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error {
     }
 
     t := s.taskQueues.Todo[0]
-    t.Epoch++
+    t.Task.Meta.Epoch++
     s.taskQueues.Todo = s.taskQueues.Todo[1:]
-    s.taskQueues.Pending[t.Task.ID] = t
+    s.taskQueues.Pending[t.Task.Meta.ID] = t
     err := s.snapshot()
     if err != nil {
         return err
     }
 
     *task = t.Task
-    log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+    log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)
 
-    time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
+    time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
     return nil
 }
 
@@ -365,13 +376,12 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 
     t, ok := s.taskQueues.Pending[taskID]
     if !ok {
-        err := errors.New("pending task not found")
         log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
-        return err
+        return nil
     }
 
     // task finished, reset timeout
-    t.NumTimeout = 0
+    t.NumFailure = 0
     s.taskQueues.Done = append(s.taskQueues.Done, t)
     delete(s.taskQueues.Pending, taskID)
 
@@ -389,3 +399,22 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
     }
     return err
 }
+
+// TaskFailed tells the service that a task has failed.
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+    select {
+    case <-s.ready:
+    }
+
+    s.mu.Lock()
+    defer s.mu.Unlock()
+
+    t, ok := s.taskQueues.Pending[meta.ID]
+    if !ok {
+        log.WithFields(s.logFields()).Warningf("TaskFailed: pending task #%v not found.", meta)
+        return nil
+    }
+
+    s.processFailedTask(t, meta.Epoch)
+    return nil
+}
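TaskFinished and the new TaskFailed are served via Go's net/rpc, which only exports methods of the form func (t *T) Name(args A, reply *R) error — hence the unused dummy *int argument. A minimal sketch of that contract (toy service, not the actual master):

```go
package main

import (
	"fmt"
	"net/rpc"
)

// TaskMeta mirrors the struct in service.go.
type TaskMeta struct{ ID, Epoch int }

type Toy struct{}

// net/rpc requires exactly two arguments, the second a pointer,
// and an error return; dummy exists only to satisfy that contract.
func (s *Toy) TaskFailed(meta TaskMeta, dummy *int) error {
	fmt.Printf("task %d (epoch %d) reported as failed\n", meta.ID, meta.Epoch)
	return nil
}

func main() {
	// Registration fails at runtime if the method shape is wrong.
	if err := rpc.Register(&Toy{}); err != nil {
		panic(err)
	}
}
```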
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
index bc435b505c014ca13ed5fc16b33a21ebb089a3b7..9c0d1d0a39fc8cb2b29fd0e3a4ba0c9b255f80fb 100644
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) {
     cs := make([]Chunk, 100)
     ts := partition(cs, 20)
     for i := range ts {
-        if ts[i].Task.ID != i {
+        if ts[i].Task.Meta.ID != i {
             t.Error(ts[i], i)
         }
     }
diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6267040a6eb421ef5006a83625cf24a8124f5320
--- /dev/null
+++ b/go/pserver/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(pserver_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0052bb460bbe3a8fc1e898cac8c3d42caec098a7
--- /dev/null
+++ b/go/pserver/client/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(pserver_client_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4bf05c85386dfcef83453a663dffc5d62efcbcc0
--- /dev/null
+++ b/go/pserver/client/c/.gitignore
@@ -0,0 +1 @@
+libpaddle_go_optimizer.a
diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt
index 93a0a27f858f8654e0a6114abe7e326b086b8bf9..c6333eab550c9a2b71bcaf20b69b2bc0a9b9c529 100644
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
@@ -1,5 +1,13 @@
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
 target_link_libraries(paddle_go_optimizer stdc++ m)
+
+# Copy library to the required place.
+# See: go/pserver/optimizer.go:
+# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+add_custom_command(TARGET paddle_go_optimizer POST_BUILD
+  COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}"
+  )
+
 go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
 if(WITH_TESTING)
   # FIXME: this test requires pserver which is not managed by the test
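The copy step above exists because cgo expands ${SRCDIR} to the directory containing the Go source file, so the archive must sit under client/c/ next to go/pserver/optimizer.go before the Go build links it. For reference, the annotated preamble (the directives are the ones in this diff; the annotations are mine):

```go
// The cgo preamble of go/pserver/optimizer.go after this change.
// cgo replaces ${SRCDIR} with the absolute path of the directory
// holding this file, which is why CMake copies
// libpaddle_go_optimizer.a into client/c/ before `go build` runs.
package pserver

// #cgo CFLAGS: -I ../../
// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
import "C"
```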
diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go
index 7ddaceb7ed33db32e19a191402100a0c0efa241a..718b4304c80791b4d8a8816f256c8fa93e0b1ead 100644
--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -34,7 +34,6 @@ import (
     log "github.com/sirupsen/logrus"
 )
 
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -63,7 +62,7 @@ func remove(client C.paddle_pserver_client) *client.Client {
 }
 
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-    if p == nullPtr {
+    if p == nil {
         return nil
     }
 
@@ -101,11 +100,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }
 
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
     // TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
-    addr := C.GoString(etcd_endpoints)
-    etcd_client := client.NewEtcd(addr)
-    c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
+    addr := C.GoString(etcdEndpoints)
+    etcdClient := client.NewEtcd(addr)
+    c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
     return add(c)
 }
 
@@ -124,20 +123,20 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 }
 
 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
     et := pserver.ElementType(param.element_type)
     name := C.GoString(param.name)
     content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
     pc := pserver.ParameterWithConfig{
         Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-        Config: cArrayToSlice(param_config, int(config_len)),
+        Config: cArrayToSlice(paramConfig, int(configLen)),
     }
     c := get(client)
     err := c.InitParam(pc)
 
     if err != nil {
         if err.Error() == pserver.AlreadyInitialized {
-            log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+            log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
             return C.PSERVER_OK
         }
         log.Errorln(err)
@@ -153,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
     err := c.FinishInitParams()
     if err != nil {
         if err.Error() == pserver.AlreadyInitialized {
-            log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+            log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
             return C.PSERVER_OK
         }
 
@@ -223,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
         p := ps[i]
         param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
 
-        if unsafe.Pointer(param) == nullPtr {
+        if unsafe.Pointer(param) == nil {
             log.Errorln("must pre-allocate parameter.")
             return C.PSERVER_ERROR
         }
 
-        if unsafe.Pointer(param.content) != nullPtr {
+        if unsafe.Pointer(param.content) != nil {
             if int(param.content_len) != len(p.Content) {
                 log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
                 return C.PSERVER_ERROR
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
index 68e1d9b269209b695e27f91a656dc2d8e527b4cd..e9264592b4f18fddf68b198d73bf907206e77a3f 100644
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -1,5 +1,23 @@
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoint = "http://" + etcd_ip + ":2379"
+
+
+def cloud_reader():
+    print "connecting to master, etcd endpoints: ", etcd_endpoint
+    master_client = master.client(etcd_endpoint, 5, 64)
+    master_client.set_dataset(
+        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
+    while 1:
+        r, e = master_client.next_record()
+        if not r:
+            break
+        yield pickle.loads(r)
 
 
 def main():
@@ -19,16 +37,16 @@ def main():
     # create parameters
     parameters = paddle.parameters.create(cost)
 
-    # create optimizer
+    # create optimizer of new remote updater to pserver
     optimizer = paddle.optimizer.Momentum(momentum=0)
 
-    #TODO(zhihong) : replace optimizer with new OptimizerConfig
-
+    print "etcd endpoint: ", etcd_endpoint
     trainer = paddle.trainer.SGD(cost=cost,
                                  parameters=parameters,
                                  update_equation=optimizer,
                                  is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoint,
+                                 use_etcd=True)
 
     # event_handler to print training and testing info
     def event_handler(event):
@@ -47,11 +65,11 @@ def main():
             print "Test %d, %.2f" % (event.pass_id, result.cost)
 
     # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
     trainer.train(
         reader=paddle.batch(
             paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
-            batch_size=2),
+                cloud_reader, buf_size=500), batch_size=2),
         feeding={'x': 0,
                  'y': 1},
         event_handler=event_handler,
diff --git a/go/pserver/client/client.go b/go/pserver/client/client.go
index aa8bfe30c26fcc0875ad479ecd562700ccefa5a3..b4a45e1c21056550ef9264746bcf58a8abb369a1 100644
--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -233,7 +233,7 @@ func (c *Client) Save(path string) error {
 
 func strHash(s string) uint32 {
     h := fnv.New32a()
-    h.Write([]byte(s))
+    _, _ = h.Write([]byte(s))
     return h.Sum32()
 }
 
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
index 29b400812c9dc3a5f44700eacbf7ba043248f2f2..5c89882a297323034be2875a6d4cb71d715eb0c2 100644
--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -3,11 +3,13 @@ package client_test
 import (
     "context"
     "io/ioutil"
+    "math/rand"
     "net"
     "net/http"
     "net/rpc"
     "strconv"
     "strings"
+    "sync"
     "testing"
     "time"
 
@@ -42,7 +44,8 @@ func initClient() [numPserver]int {
         ports[i] = p
 
         go func(l net.Listener) {
-            s, err := pserver.NewService(0)
+            var cp pserver.Checkpoint
+            s, err := pserver.NewService(0, 1, "", nil, cp)
             if err != nil {
                 panic(err)
             }
@@ -76,15 +79,33 @@ func initEtcdClient() {
         log.Errorf("err %v", err)
     }
     ctx, cancel := context.WithTimeout(context.Background(), timeout)
-    client.Delete(ctx, pserver.PsDesired)
-    client.Delete(ctx, pserver.PsPath)
-    client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+    _, err = client.Delete(ctx, pserver.PsDesired)
+    if err != nil {
+        panic(err)
+    }
+
+    _, err = client.Delete(ctx, pserver.PsPath)
+    if err != nil {
+        panic(err)
+    }
+
+    _, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+    if err != nil {
+        panic(err)
+    }
+
     ports := initClient()
     for i := 0; i < numPserver; i++ {
-        client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+        _, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+        if err != nil {
+            panic(err)
+        }
     }
     cancel()
-    client.Close()
+    err = client.Close()
+    if err != nil {
+        panic(err)
+    }
 }
 
 type selector bool
@@ -99,27 +120,34 @@ func (l lister) List() []client.Server {
     return l
 }
 
-func ClientTest(t *testing.T, c *client.Client) {
+func testClient(t *testing.T, c *client.Client) {
     selected := c.BeginInitParams()
     if !selected {
         t.Fatal("should be selected.")
     }
 
-    const numParameter = 100
+    const numParameter = 1000
     config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
     if err != nil {
         t.Fatalf("read optimizer proto failed")
     }
+
+    var wg sync.WaitGroup
     for i := 0; i < numParameter; i++ {
-        var p pserver.Parameter
-        p.Name = "p_" + strconv.Itoa(i)
-        p.ElementType = pserver.Float32
-        p.Content = make([]byte, (i+1)*100)
-        err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
-        if err != nil {
-            t.Fatal(err)
-        }
+        wg.Add(1)
+        go func(i int) {
+            var p pserver.Parameter
+            p.Name = "p_" + strconv.Itoa(i)
+            p.ElementType = pserver.Float32
+            p.Content = make([]byte, (i+1)*100)
+            err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
+            if err != nil {
+                t.Fatal(err)
+            }
+            wg.Done()
+        }(i)
     }
+    wg.Wait()
 
     err = c.FinishInitParams()
     if err != nil {
@@ -127,7 +155,7 @@ func testClient(t *testing.T, c *client.Client) {
     }
 
     var grads []pserver.Gradient
-    for i := 0; i < numParameter/2; i++ {
+    for i := 0; i < numParameter; i++ {
         var g pserver.Gradient
         g.Name = "p_" + strconv.Itoa(i)
         g.ElementType = pserver.Float32
@@ -135,9 +163,31 @@ func testClient(t *testing.T, c *client.Client) {
         grads = append(grads, g)
     }
 
-    err = c.SendGrads(grads)
-    if err != nil {
-        t.Fatal(err)
+    const paramPerGroup = 10
+    const numGroups = numParameter / paramPerGroup
+
+    // shuffle send grads order
+    for i := range grads {
+        j := rand.Intn(i + 1)
+        grads[i], grads[j] = grads[j], grads[i]
+    }
+
+    for i := 0; i < numGroups; i++ {
+        var gs []pserver.Gradient
+        if i == numGroups-1 {
+            gs = grads[i*paramPerGroup:]
+        } else {
+            gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
+        }
+
+        wg.Add(1)
+        go func(gs []pserver.Gradient) {
+            err := c.SendGrads(gs)
+            if err != nil {
+                t.Fatal(err)
+            }
+            wg.Done()
+        }(gs)
     }
 
     names := make([]string, numParameter)
@@ -145,20 +195,35 @@ func testClient(t *testing.T, c *client.Client) {
         names[i] = "p_" + strconv.Itoa(i)
     }
 
-    params, err := c.GetParams(names)
-    if err != nil {
-        t.Fatal(err)
-    }
+    for i := 0; i < numGroups; i++ {
+        var ns []string
+        if i == numGroups-1 {
+            ns = names[i*paramPerGroup:]
+        } else {
+            ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
+        }
 
-    if len(names) != len(params) {
-        t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-    }
+        wg.Add(1)
+        go func(ns []string) {
+            params, err := c.GetParams(ns)
+            if err != nil {
+                t.Fatal(err)
+            }
 
-    for i := range params {
-        if names[i] != params[i].Name {
required: parameter name: %s, required name: %s", names[i], params[i].Name) - } + if len(ns) != len(params) { + t.Fatalf("parameter count does not match, need: %d, have: %d", len(ns), len(params)) + } + + for i := range params { + if ns[i] != params[i].Name { + t.Fatalf("order of returned parameters does not match: parameter name: %s, required name: %s", ns[i], params[i].Name) + } + } + wg.Done() + }(ns) } + + wg.Wait() } func TestNativeClient(t *testing.T) { @@ -168,13 +233,14 @@ func TestNativeClient(t *testing.T) { servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])} } c1 := client.NewClient(lister(servers), len(servers), selector(true)) - ClientTest(t, c1) + testClient(t, c1) } -// TODO: tmperary disable etcdClient test for dependency of etcd) +// EtcdClient is a disabled test, since we have not embedded etcd into +// our tests. func EtcdClient(t *testing.T) { initEtcdClient() - etcd_client := client.NewEtcd(etcdEndpoints) - c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true)) - ClientTest(t, c2) + etcdClient := client.NewEtcd(etcdEndpoints) + c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true)) + testClient(t, c2) } diff --git a/go/pserver/client/etcd_client.go b/go/pserver/client/etcd_client.go index 1fd3479aa88ccbbe7c5067da1e9886b65352e847..953065b427ed52d39f1253ea94d485b765ea5dc2 100644 --- a/go/pserver/client/etcd_client.go +++ b/go/pserver/client/etcd_client.go @@ -12,7 +12,7 @@ import ( ) const ( - DefaultEtcdTimeout time.Duration = 5 * time.Second + defaultEtcdTimeout time.Duration = 5 * time.Second ) // EtcdClient is used by pserver client that is a part of trainer process. @@ -47,7 +47,7 @@ func (p *EtcdClient) Desired() int { psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { - log.Errorf("psDesired %s invalid %v", psDesired, err) + log.Errorf("psDesired %d invalid %v", psDesired, err) time.Sleep(p.timeout) continue } @@ -106,11 +106,11 @@ func NewEtcd(endpoints string) *EtcdClient { for { cli, err = clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: DefaultEtcdTimeout, + DialTimeout: defaultEtcdTimeout, }) if err != nil { log.Errorf("Init etcd connection failed: %v", err) - time.Sleep(DefaultEtcdTimeout) + time.Sleep(defaultEtcdTimeout) continue } break @@ -118,7 +118,7 @@ func NewEtcd(endpoints string) *EtcdClient { log.Infof("Connected to etcd: %s\n", endpoints) client := &EtcdClient{ client: cli, - timeout: DefaultEtcdTimeout, + timeout: defaultEtcdTimeout, endpoints: ep, } return client diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 1f77787150d16052e3588e9c1795c8d5dafa08e6..e70e826975b26db302a6799e9171cff970153aac 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -16,7 +16,7 @@ import ( const ( // PsDesired is etcd path for store desired pserver count PsDesired = "/ps_desired" - // PsAddr is the base dir for pserver to store their addr + // PsPath is the base dir under which pservers store their addresses PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" @@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et // Register registers the pserver on etcd // // Register returns the index of the current pserver.
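//
// A minimal caller-side sketch (hypothetical wiring; the endpoint and port
// values below are illustrative only):
//
//	e := NewEtcdClient("http://127.0.0.1:2379", 3, 5*time.Second)
//	idx, err := e.Register(8000) // announces "<externalIP>:8000" under /ps/<idx>
//	if err != nil {
//		log.Fatal(err)
//	}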
-func (e *EtcdClient) Register() (int, error) { +func (e *EtcdClient) Register(port int) (int, error) { var err error e.externalIP, err = networkhelper.GetExternalIP() @@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) { for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) var err error - pserverIdx, err = e.registerPserverEtcd(ctx) + pserverIdx, err = e.registerPserverEtcd(ctx, port) cancel() if err != nil { log.Warn(err) @@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( } // registerPserverEtcd registers pserver node on etcd using transaction. -func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { +func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { registered := false @@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { log.Fatal(err) } // find the first id and write info - c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID)) - log.Debugf("set pserver node %s with value %s", psKey, e.externalIP) + pserverAddr := e.externalIP + ":" + strconv.Itoa(port) + c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) if kaerr != nil { log.Errorf("keepalive etcd node error: %v", kaerr) @@ -176,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { break } } - if registered == true { + if registered { return nil } - return errors.New("not registerd, may due to already have enough pservers") + return errors.New("not registered, may due to already have enough pservers") }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) if err != nil { @@ -189,13 +190,26 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { return idx, nil } -// PutKey put into etcd with value by key specified -func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error { - ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout)) - _, err := e.etcdClient.Put(ctx, key, string(value)) +// GetKey gets the value by the specified key +func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + resp, err := e.etcdClient.Get(ctx, key) cancel() if err != nil { - return err + return []byte{}, err } - return nil + kvs := resp.Kvs + if len(kvs) == 0 { + return []byte{}, nil + } + v := kvs[0].Value + return v, nil +} + +// PutKey put into etcd with value by key specified +func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { + ctx, cancel := context.WithTimeout(context.Background(), timeout) + _, err := e.etcdClient.Put(ctx, key, string(value)) + cancel() + return err } diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go index 2d7882d1a75ef55df4a1ec81a8606cd84334fa64..151a3f80332b0e62767586f9f769c839ba19ce1e 100644 --- a/go/pserver/optimizer.go +++ b/go/pserver/optimizer.go @@ -1,8 +1,7 @@ package pserver // #cgo CFLAGS: -I ../../ -// //FIXME: ldflags contain "build" path -// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm +// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm // #include 
"paddle/optimizer/optimizer.h" // #include // #include @@ -15,15 +14,14 @@ import ( log "github.com/sirupsen/logrus" ) -var nullPtr = unsafe.Pointer(uintptr(0)) - type optimizer struct { opt *C.struct_paddle_optimizer elementType ElementType + contentLen int } func cArrayToSlice(p unsafe.Pointer, len int) []byte { - if p == nullPtr { + if p == nil { return nil } @@ -38,25 +36,28 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte { func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer { o := &optimizer{} o.elementType = paramWithConfigs.Param.ElementType + o.contentLen = len(paramWithConfigs.Param.Content) p := paramWithConfigs.Param c := paramWithConfigs.Config s := State + paramBufferSize := C.size_t(len(p.Content)) log.WithFields(log.Fields{ "ElementType": p.ElementType, - "ParamSize": len(p.Content), + "ParamSize": paramBufferSize, "ConfigSize": len(c), "StateSize": len(s), }).Info("New Optimizer Created with config:") var cbuffer unsafe.Pointer - cbuffer = C.malloc(C.size_t(len(p.Content))) - C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content))) + cbuffer = C.malloc(paramBufferSize) + + C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize) var cstate unsafe.Pointer if len(s) != 0 { cstate = unsafe.Pointer(&s[0]) } o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)), - C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float), (*C.char)(cstate), C.int(len(s))) + C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s))) return o } @@ -68,8 +69,8 @@ func (o *optimizer) GetWeights() []byte { func (o *optimizer) GetStates() []byte { var cbuffer *C.char - cbuffer_len := C.paddle_optimizer_get_state(o.opt, &cbuffer) - return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbuffer_len)) + cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer) + return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen)) } func (o *optimizer) UpdateParameter(g Gradient) error { @@ -77,7 +78,11 @@ func (o *optimizer) UpdateParameter(g Gradient) error { return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType) } - r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float) + if o.contentLen != len(g.Content) { + return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content)) + } + + r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))) if r != 0 { return fmt.Errorf("optimizer update returned error code: %d", r) } @@ -85,8 +90,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error { } func (o *optimizer) Cleanup() { - if unsafe.Pointer(o.opt) != nullPtr { + if unsafe.Pointer(o.opt) != nil { C.paddle_release_optimizer(o.opt) - o.opt = (*C.struct_paddle_optimizer)(nullPtr) + o.opt = (*C.struct_paddle_optimizer)(nil) } } diff --git a/go/pserver/service.go b/go/pserver/service.go index 6b52d0d896f8bc04fab6c9b68911523cbb7ac8b9..c723959d6b87524762e2f874bb5e4d5bd567cd00 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -9,6 +9,7 @@ import ( "encoding/json" "errors" "fmt" + "io/ioutil" "os" "path/filepath" "strconv" @@ -21,14 +22,14 @@ import ( // ElementType is the type of elements of a Parameter. 
type ElementType int +// RPC error messages. const ( - // AlreadyInitialized is true if pserver is initialized - AlreadyInitialized = "pserver already initialized" - // Uninitialized is true if pserver not fully initialized - Uninitialized = "pserver not fully initialized" + AlreadyInitialized = "pserver already initialized" + Uninitialized = "pserver not fully initialized" + CheckpointMD5Failed = "checkpoint file MD5 validation failed" ) -// Supported element types +// Supported element types. const ( Int32 ElementType = iota UInt32 @@ -51,21 +52,15 @@ type ParameterWithConfig struct { Config []byte // parameter configuration in Proto Buffer format } -// ParameterCheckpoint is Parameter and State checkpoint -type ParameterCheckpoint struct { - ParamConfig ParameterWithConfig - State []byte -} - -// checkpoint signature +// checkpointMeta saves checkpoint metadata type checkpointMeta struct { UUID string `json:"uuid"` - Md5sum string `json:"md5sum"` - Timestamp string `json:"timestamp"` + MD5 string `json:"md5"` + Timestamp int64 `json:"timestamp"` } // Checkpoint is the pserver shard persist in file -type Checkpoint []ParameterCheckpoint +type Checkpoint []parameterCheckpoint // Gradient is the gradient of the parameter. type Gradient Parameter @@ -81,12 +76,53 @@ type Service struct { optMap map[string]*optimizer } +// parameterCheckpoint saves parameter checkpoint +type parameterCheckpoint struct { + ParameterWithConfig + State []byte +} + +// NewCheckpointFromFile loads parameters and state from a checkpoint file +func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) { + v, err := e.GetKey(filepath.Join(PsCheckpoint, strconv.Itoa(idx)), 3*time.Second) + if err != nil { + return nil, err + } + + var cpMeta checkpointMeta + if err = json.Unmarshal(v, &cpMeta); err != nil { + return nil, err + } + + fn := filepath.Join(cpPath, cpMeta.UUID) + if _, err = os.Stat(fn); os.IsNotExist(err) { + return nil, err + } + content, err := ioutil.ReadFile(fn) + if err != nil { + return nil, err + } + + h := md5.New() + _, _ = h.Write(content) + if hex.EncodeToString(h.Sum(nil)) != cpMeta.MD5 { + return nil, errors.New(CheckpointMD5Failed) + } + + dec := gob.NewDecoder(bytes.NewReader(content)) + cp := Checkpoint{} + if err = dec.Decode(&cp); err != nil { + return nil, err + } + return cp, nil +} + // NewService creates a new service, will bypass etcd registration if no -// endpoints specified. -func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) { +// endpoints specified. It will recover from the checkpoint file if a specified checkpoint exists. +func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp Checkpoint) (*Service, error) { s := &Service{ idx: idx, - checkpointInterval: time.Second * time.Duration(seconds), + checkpointInterval: interval, checkpointPath: path, client: client, } @@ -95,9 +131,11 @@ func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkp if cp != nil { for _, item := range cp { - p := item.ParamConfig - st := item.State - s.optMap[p.Param.Name] = newOptimizer(p, st) + p := ParameterWithConfig{ + Param: item.Param, + Config: item.Config, + } + s.optMap[p.Param.Name] = newOptimizer(p, item.State) } } return s, nil @@ -173,7 +211,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error { // learning optimization methods are stochastic in // nature. This race condition is allowed deliberately // to save the program from making a copy of the - // paramter content.
+ // parameter content. parameter.Name = name parameter.ElementType = opt.elementType parameter.Content = opt.GetWeights() @@ -181,56 +219,81 @@ } // pserver save checkpoint -func (s *Service) doCheckpoint() error { +func (s *Service) doCheckpoint() (err error) { <-s.initialized s.mu.Lock() defer s.mu.Unlock() - cp := make([]ParameterCheckpoint, 0, len(s.optMap)) + cp := make([]parameterCheckpoint, len(s.optMap)) index := 0 for name, opt := range s.optMap { - var pc ParameterCheckpoint - pc.ParamConfig.Param.Name = name - pc.ParamConfig.Param.ElementType = opt.elementType - pc.ParamConfig.Param.Content = opt.GetWeights() + var pc parameterCheckpoint + pc.Param.Name = name + pc.Param.ElementType = opt.elementType + pc.Param.Content = opt.GetWeights() pc.State = opt.GetStates() cp[index] = pc index++ } var buf bytes.Buffer encoder := gob.NewEncoder(&buf) - err := encoder.Encode(cp) + err = encoder.Encode(cp) if err != nil { - return err + return } cpMeta := checkpointMeta{} cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx) - cpMeta.Timestamp = time.Now().String() + cpMeta.Timestamp = time.Now().UnixNano() h := md5.New() - cpMeta.Md5sum = hex.EncodeToString(h.Sum(buf.Bytes())) + _, _ = h.Write(buf.Bytes()) + cpMeta.MD5 = hex.EncodeToString(h.Sum(nil)) + + cpMetajson, err := json.Marshal(cpMeta) + if err != nil { + return + } - cpMetajson, _ := json.Marshal(cpMeta) - err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3) + err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second) if err != nil { - return err + return } if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) { log.Info("checkpoint does not exist.") } else { err = os.Remove(cpMeta.UUID) - log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID) + if err != nil { + log.Infof("Removing checkpoint %s failed", cpMeta.UUID) + } else { + log.Infof("checkpoint %s already exists, removing", cpMeta.UUID) + } } f, err := os.Create(cpMeta.UUID) - defer f.Close() if err != nil { - return err + return } + + defer func() { + closeErr := f.Close() + if closeErr != nil { + if err != nil { + log.Errorln(closeErr) + } else { + // Set closeErr as return value.
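+				// doCheckpoint has a named error return, so assigning
+				// closeErr here propagates a failed Close to the caller
+				// even when the write itself succeeded.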
+ err = closeErr + } + } + }() + writer := bufio.NewWriter(f) _, err = writer.Write(buf.Bytes()) - writer.Flush() if err != nil { - return err + return } - return nil + + err = writer.Flush() + if err != nil { + return + } + + return } diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 9bf1a48a596f3e3e73a2e4df651855fd5f4e775f..a191f689fea9b5e64204c3ddfd12edf92f5ddb09 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -31,7 +31,7 @@ func TestServiceFull(t *testing.T) { err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var p1 pserver.Parameter @@ -40,40 +40,40 @@ func TestServiceFull(t *testing.T) { p1.ElementType = pserver.Float32 err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var param pserver.Parameter err = s.GetParam("param_b", ¶m) if err != nil { - t.FailNow() + t.Fatal(err) } if !reflect.DeepEqual(param, p1) { - t.FailNow() + t.Fatal("not equal:", param, p1) } g1, g2 := pserver.Gradient(p1), pserver.Gradient(p) err = s.SendGrad(g1, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.SendGrad(g2, nil) if err != nil { - t.FailNow() + t.Fatal(err) } var param1 pserver.Parameter err = s.GetParam("param_a", ¶m1) if err != nil { - t.FailNow() + t.Fatal(err) } // don't compare content, since it's already changed by @@ -82,7 +82,7 @@ func TestServiceFull(t *testing.T) { p.Content = nil if !reflect.DeepEqual(param1, p) { - t.FailNow() + t.Fatal("not equal:", param1, p) } } @@ -90,16 +90,16 @@ func TestMultipleInit(t *testing.T) { var cp pserver.Checkpoint s, err := pserver.NewService(0, 1, "", nil, cp) if err != nil { - t.Error(err) + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err.Error() != pserver.AlreadyInitialized { - t.FailNow() + t.Fatal(err) } } @@ -108,7 +108,7 @@ func TestUninitialized(t *testing.T) { s, err := pserver.NewService(0, 1, "", nil, cp) err = s.SendGrad(pserver.Gradient{}, nil) if err.Error() != pserver.Uninitialized { - t.FailNow() + t.Fatal(err) } } @@ -154,12 +154,12 @@ func TestBlockUntilInitialized(t *testing.T) { err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) if err != nil { - t.FailNow() + t.Fatal(err) } err = s.FinishInitParams(0, nil) if err != nil { - t.FailNow() + t.Fatal(err) } wg.Wait() diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..db6cf211d8c0b124856ca5c5fd2c49763b1b4a64 --- /dev/null +++ b/go/utils/networkhelper/CMakeLists.txt @@ -0,0 +1,3 @@ +if(WITH_TESTING) + go_test(network_helper_test) +endif() diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 307e99bbe3a833f1fe26057ec38d0b96e04bc0fe..4b06966fba2bc9f92756be0cb8110bbcd5272423 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -8,13 +8,14 @@ add_subdirectory(gserver) add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) -add_subdirectory(optimizer) add_subdirectory(string) if(Boost_FOUND) add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) + add_subdirectory(operators) + add_subdirectory(pybind) endif() if(WITH_C_API) diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index 
2f45173bfd401ddda26d61ab7fcfe131d079f710..b6ff6ec7890c0b79d52a2f0784743289c7bc213f 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const { ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} -ParameterConfig::~ParameterConfig() { - if (m) { - delete m; - } -} +ParameterConfig::~ParameterConfig() { delete m; } ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr( void* ptr) { @@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); } OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {} -OptimizationConfig::~OptimizationConfig() { - if (m) { - delete m; - } -} +OptimizationConfig::~OptimizationConfig() { delete m; } std::string OptimizationConfig::toProtoString() { return m->getConfig().SerializeAsString(); diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index 5fb3d1c73bc56e921f13aafd27c25224e259b3fe..0b9b83d42974151d49250bdf0e7c397f59bf6a62 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -843,7 +843,8 @@ public: bool useSparseUpdater); static ParameterUpdater* createNewRemoteUpdater( OptimizationConfig* config, - const std::string pserverSpec) throw(UnsupportError); + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError); ~ParameterUpdater(); /** diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp index 21b851dd5e26c4752888067b20d0b1e16a4ab52d..120eea3f70125a57fb5ad685f2a11479bce12d0c 100644 --- a/paddle/api/ParameterOptimizer.cpp +++ b/paddle/api/ParameterOptimizer.cpp @@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate { ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {} -ParameterOptimizer::~ParameterOptimizer() { - if (m) { - delete m; - } -} +ParameterOptimizer::~ParameterOptimizer() { delete m; } ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) { CHECK(config != nullptr); @@ -104,11 +100,7 @@ std::vector ParameterOptimizer::getParameterTypes() const { ParameterTraverseCallback::ParameterTraverseCallback() : m(new ParameterTraverseCallbackPrivate()) {} -ParameterTraverseCallback::~ParameterTraverseCallback() { - if (m) { - delete m; - } -} +ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; } void ParameterTraverseCallback::apply(const std::vector& vecs, const ParameterConfig& conf, diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp index 1aaefdfb8107a2eaa0432211fd7df4f5f12d537f..5934cb898b5f6adc74c237b1733a7459d8437a28 100644 --- a/paddle/api/ParameterUpdater.cpp +++ b/paddle/api/ParameterUpdater.cpp @@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater( ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( OptimizationConfig *config, - const std::string pserverSpec) throw(UnsupportError) { + const std::string pserverSpec, + const bool useEtcd) throw(UnsupportError) { #ifndef PADDLE_WITHOUT_GOLANG auto updater = new ParameterUpdater(); updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( - config->m->getConfig(), pserverSpec)); + config->m->getConfig(), pserverSpec, useEtcd)); return updater; #else throw UnsupportError(); diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp index db8f005929d90f718fc1ad42c60b68108ff55005..500bc448c92630f4fc2f4df603c955e572d868ec 100644 --- a/paddle/api/Vector.cpp +++ b/paddle/api/Vector.cpp @@ -171,11 +171,7 @@ struct VectorPrivate { 
Vector::Vector() : m(new VectorPrivate()) {} -Vector::~Vector() { - if (m) { - delete m; - } -} +Vector::~Vector() { delete m; } Vector* Vector::createZero(size_t sz, bool useGpu) { auto retVec = new Vector(); diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 4409c6feae218222b7c0216760cebe4ae8e235cb..eb3416462324edf6f6e76e32d7400d1fd774b9bd 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,18 +1,30 @@ # ddim lib -cc_library(ddim SRCS ddim.cc) +cc_library(enforce SRCS enforce.cc DEPS glog) +cc_test(enforce_test SRCS enforce_test.cc DEPS enforce) +cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_test(tensor_test SRCS tensor_test.cc DEPS ddim) +cc_library(tensor SRCS tensor.cc DEPS ddim place enforce paddle_memory) +cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(variable_test SRCS variable_test.cc) cc_test(scope_test SRCS scope_test.cc) -cc_test(enforce_test SRCS enforce_test.cc) proto_library(attr_type SRCS attr_type.proto) proto_library(op_proto SRCS op_proto.proto DEPS attr_type) cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf) proto_library(op_desc SRCS op_desc.proto DEPS attr_type) cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf) -cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc) + +cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor) +cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry) + +cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce) +cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator) + py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto) # Generate an empty __init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) + +proto_library(net_proto SRCS net_proto.proto DEPS op_proto) +cc_library(net SRCS net.cc DEPS operator net_proto op_registry) +cc_test(net_op_test SRCS net_op_test.cc DEPS net) diff --git a/paddle/framework/ddim.cc b/paddle/framework/ddim.cc index 3f949a6595ea326b97ac567daf9b35a68c8cf7f8..d2ef85afe55e640a17b8c957bac61d175e69ff3f 100644 --- a/paddle/framework/ddim.cc +++ b/paddle/framework/ddim.cc @@ -1,9 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + #include "paddle/framework/ddim.h" +#include "paddle/framework/enforce.h" namespace paddle { namespace framework { -///@cond HIDDEN +/// @cond HIDDEN template Dim make_dim(const int* d) { @@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) { } } -///@endcond +/// @endcond DDim make_ddim(std::initializer_list dims) { DDim result(make_dim(0)); @@ -64,11 +79,11 @@ DDim make_ddim(const std::vector& dims) { return result; } -///@cond HIDDEN +/// @cond HIDDEN // XXX For some reason, putting this in an anonymous namespace causes errors class DynamicMutableIndexer : public boost::static_visitor { public: - DynamicMutableIndexer(int idx) : idx_(idx) {} + explicit DynamicMutableIndexer(int idx) : idx_(idx) {} template int& operator()(Dim& dim) const { @@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor { class DynamicConstIndexer : public boost::static_visitor { public: - DynamicConstIndexer(int idx) : idx_(idx) {} + explicit DynamicConstIndexer(int idx) : idx_(idx) {} template int operator()(const Dim& dim) const { @@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor { int idx_; }; -///@endcond +/// @endcond int& DDim::operator[](int idx) { return boost::apply_visitor(DynamicMutableIndexer(idx), var); @@ -102,6 +117,8 @@ int DDim::operator[](int idx) const { return boost::apply_visitor(DynamicConstIndexer(idx), var); } +ssize_t DDim::size() const { return arity(*this); } + bool DDim::operator==(DDim d) const { if (var.which() != d.getVar().which()) { return false; @@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; } void set(DDim& ddim, int idx, int value) { ddim[idx] = value; } -///@cond HIDDEN +/// @cond HIDDEN struct VectorizeVisitor : public boost::static_visitor<> { std::vector& vector; - VectorizeVisitor(std::vector& v) : vector(v) {} + explicit VectorizeVisitor(std::vector& v) : vector(v) {} template void operator()(const T& t) { @@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> { void operator()(const Dim<1>& t) { vector.push_back(t.head); } }; -///@endcond +/// @endcond std::vector vectorize(const DDim& ddim) { std::vector result; @@ -178,16 +195,59 @@ std::vector vectorize(const DDim& ddim) { return result; } +struct ProductVisitor : public boost::static_visitor { + template + ssize_t operator()(const Dim& dim) { + return product(dim); + } +}; + ssize_t product(const DDim& ddim) { - ssize_t result = 1; - std::vector v = vectorize(ddim); - for (auto i : v) { - result *= i; + ProductVisitor visitor; + return boost::apply_visitor(visitor, ddim); +} + +struct SliceVectorizeVisitor : public boost::static_visitor<> { + std::vector& vector; + int begin; + int end; + + SliceVectorizeVisitor(std::vector& v, int b, int e) + : vector(v), begin(b), end(e) { + PADDLE_ENFORCE(begin < end, + "Begin index must be less than end index in ddim slice."); + PADDLE_ENFORCE(begin >= 0, + "Begin index can't be less than zero in ddim slice."); } - return result; + + template + void operator()(const Dim& dim) { + if (begin == 0) { + vector.push_back(dim.head); + } else { + --begin; + } + --end; + if (end > 0) { + this->operator()(dim.tail); + } + } + + void operator()(const Dim<1>& dim) { + PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound."); + vector.push_back(dim.head); + } +}; + +DDim slice_ddim(const DDim& dim, int begin, int end) { + std::vector vec; + vec.reserve(end - begin); + SliceVectorizeVisitor visitor(vec, begin, end); + boost::apply_visitor(visitor, dim); + 
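+  // vec now holds the extents of dims [begin, end); rebuild them into a DDim.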
return make_ddim(vec); } -///\cond HIDDEN +/// \cond HIDDEN struct ArityVisitor : boost::static_visitor { template @@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor { } }; -///\endcond +/// \endcond int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); } -///\cond HIDDEN +/// \cond HIDDEN struct DDimPrinter : boost::static_visitor { std::ostream& os; - DDimPrinter(std::ostream& os_) : os(os_) {} + explicit DDimPrinter(std::ostream& os_) : os(os_) {} template void operator()(const T& t) { @@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor { } }; -///\endcond +/// \endcond std::ostream& operator<<(std::ostream& os, const DDim& ddim) { DDimPrinter printer(os); @@ -220,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) { return os; } +DDim::DDim(std::initializer_list init_list) { + *this = make_ddim(init_list); +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h index 223c4180bee45e21547364441476b27051daca56..070850375d1bd3a61b98184495c979573bf9542c 100644 --- a/paddle/framework/ddim.h +++ b/paddle/framework/ddim.h @@ -1,11 +1,26 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + #pragma once #include #include #include #include - #include "paddle/framework/dim.h" +#include "paddle/framework/enforce.h" +#include "unsupported/Eigen/CXX11/Tensor" namespace paddle { namespace framework { @@ -27,7 +42,9 @@ struct DDim { DDim() : var(Dim<1>()) {} template - DDim(const Dim& in) : var(in) {} + explicit DDim(const Dim& in) : var(in) {} + + /*implicit*/ DDim(std::initializer_list init_list); template DDim& operator=(const Dim& in) { @@ -57,6 +74,8 @@ struct DDim { DDim operator+(DDim d) const; DDim operator*(DDim d) const; + + ssize_t size() const; }; /** @@ -81,6 +100,15 @@ std::vector vectorize(const DDim& ddim); ssize_t product(const DDim& ddim); +/** + * \brief Slice a ddim + * + * Slice dim with [begin, end). + * e.g. DDim d = make_ddim({1,2,3,4,5}); + * slice_ddim(d, 1, 3); ====> {2,3} + */ +DDim slice_ddim(const DDim& dim, int begin, int end); + /** * \brief What is the length of this dimension? 
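 *
 * e.g. arity(make_ddim({2, 3, 4})) returns 3.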
* @@ -91,6 +119,17 @@ int arity(const DDim& ddim); std::ostream& operator<<(std::ostream&, const DDim&); +template +Eigen::DSizes ToEigenDSizes(const DDim& dims) { + int rank = arity(dims); + PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same"); + Eigen::DSizes dsizes; + for (int d = 0; d < rank; d++) { + dsizes[d] = dims[d]; + } + return dsizes; +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index 36eef02370e0196c2af2c05f49176b70ce69235a..9d18a2972ce62139430b240b4599854b14290a32 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -49,9 +49,30 @@ TEST(DDim, Equality) { // arity of a DDim EXPECT_EQ(paddle::framework::arity(ddim), 3); + EXPECT_EQ(ddim.size(), 3); // product of a DDim EXPECT_EQ(paddle::framework::product(vddim), 45); + EXPECT_EQ( + paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})), + 90); + + // slice a DDim + paddle::framework::DDim ddim2 = + paddle::framework::make_ddim({1, 2, 3, 4, 5, 6}); + paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5); + EXPECT_EQ(arity(ss), 3); + EXPECT_EQ(ss[0], 3); + EXPECT_EQ(ss[1], 4); + EXPECT_EQ(ss[2], 5); + paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6); + EXPECT_EQ(arity(ss2), 6); + EXPECT_EQ(ss2[0], 1); + EXPECT_EQ(ss2[1], 2); + EXPECT_EQ(ss2[2], 3); + EXPECT_EQ(ss2[3], 4); + EXPECT_EQ(ss2[4], 5); + EXPECT_EQ(ss2[5], 6); } TEST(DDim, Print) { diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h index bcde291d12d429a3f2cd41fa6d0ee606c7c9c92f..883fdc55eb929ebc51e8ae05938e9d07374406ce 100644 --- a/paddle/framework/dim.h +++ b/paddle/framework/dim.h @@ -266,29 +266,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) { return ((0 <= idx.head) && (idx.head < size.head)); } -/** - * \brief Check if a size and a stride create a Fortran order contiguous - * block of memory. - */ -template -HOST bool contiguous(const Dim& size, const Dim& stride, int mul = 1) { - if (product(size) == 0) return true; - int contiguous_stride = get<0>(size) == 1 ? 0 : mul; - return (get<0>(stride) == contiguous_stride && - contiguous(size.tail, stride.tail, mul * get<0>(size))); -} - -///\cond HIDDEN -// Base case of contiguous, check the nth stride is the size of -// the prefix multiply of n-1 dims. -template <> -inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) { - if (get<0>(size) == 0) return true; - int contiguous_stride = get<0>(size) == 1 ? 0 : mul; - return get<0>(stride) == contiguous_stride; -} -///\endcond - /** * \brief Compute exclusive prefix-multiply of a Dim. */ @@ -306,31 +283,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) { } ///\endcond -/** - * \brief Calculate strides of a contiguous array of the given size - * - * Sets the stride for any dimension with an extent of 1 to 0. - * \param size Dim object containing the size of the array. - * \param base The base stride to use. - * \return Dim object the same size as \p size with the strides. - */ -template -HOSTDEVICE Dim contiguous_strides(const Dim& size, int base = 1) { - int stride = size.head == 1 ? 0 : base; - return Dim(stride, contiguous_strides(size.tail, base * size.head)); -} - -///\cond HIDDEN - -// Base case of contiguous_strides -template <> -HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) { - int stride = size.head == 1 ? 
0 : base; - return Dim<1>(stride); -} - -///\endcond - /** * Add two dimensions together */ diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu index 809bf04826637195425a32c054c94e00ef940df9..3898d0a447aa502813b3cb5e86c29eebb814ff84 100644 --- a/paddle/framework/dim_test.cu +++ b/paddle/framework/dim_test.cu @@ -1,128 +1,101 @@ #include #include -#include "paddle/framework/dim.h" #include "gtest/gtest.h" +#include "paddle/framework/dim.h" __global__ void test(paddle::framework::Dim<2>* o) { - o[0] = paddle::framework::make_dim(5, 6); + o[0] = paddle::framework::make_dim(5, 6); } __global__ void dyn_idx_gpu(int* o) { - auto d = paddle::framework::make_dim(5, 6); - o[0] = d[1]; + auto d = paddle::framework::make_dim(5, 6); + o[0] = d[1]; } TEST(Dim, Equality) { - // construct a Dim on the CPU - auto a = paddle::framework::make_dim(3, 4); - EXPECT_EQ(paddle::framework::get<0>(a), 3); - EXPECT_EQ(paddle::framework::get<1>(a), 4); - - // construct a Dim on the GPU - thrust::device_vector> t(2); - test<<<1,1>>>(thrust::raw_pointer_cast(t.data())); - a = t[0]; - EXPECT_EQ(paddle::framework::get<0>(a), 5); - EXPECT_EQ(paddle::framework::get<1>(a), 6); - - // linearization - auto b = paddle::framework::make_dim(7, 8); - EXPECT_EQ(paddle::framework::linearize(a, b), 83); - - // product - EXPECT_EQ(paddle::framework::product(a), 30); - - // mutate a Dim - paddle::framework::get<1>(b) = 10; - EXPECT_EQ(paddle::framework::get<0>(b), 7); - EXPECT_EQ(paddle::framework::get<1>(b), 10); - - // dynamic access - paddle::framework::get(b, 0) = 8; - b[1] = 11; - EXPECT_EQ(paddle::framework::get<0>(b), 8); - EXPECT_EQ(paddle::framework::get<1>(b), 11); - EXPECT_EQ(paddle::framework::get(b, 0), 8); - EXPECT_EQ(b[1], 11); - - // dynamic access on GPU - thrust::device_vector r(1); - dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data())); - int res = r[0]; - EXPECT_EQ(res, 6); - - // ex_prefix_mul - paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 12); - - // contiguous_strides - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 0); - EXPECT_EQ(paddle::framework::get<2>(c), 10); - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 10); - EXPECT_EQ(paddle::framework::get<2>(c), 0); - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10)); - EXPECT_EQ(paddle::framework::get<0>(c), 0); - EXPECT_EQ(paddle::framework::get<1>(c), 1); - EXPECT_EQ(paddle::framework::get<2>(c), 10); - c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4)); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 2); - EXPECT_EQ(paddle::framework::get<2>(c), 6); - - // generate from an index - auto size = paddle::framework::make_dim(4, 5, 2); - c = paddle::framework::Dim<3>(14, size); - EXPECT_EQ(paddle::framework::get<0>(c), 2); - EXPECT_EQ(paddle::framework::get<1>(c), 3); - EXPECT_EQ(paddle::framework::get<2>(c), 0); - c = paddle::framework::Dim<3>(25, size); - EXPECT_EQ(paddle::framework::get<0>(c), 1); - EXPECT_EQ(paddle::framework::get<1>(c), 1); - EXPECT_EQ(paddle::framework::get<2>(c), 1); + // construct a Dim on the 
CPU + auto a = paddle::framework::make_dim(3, 4); + EXPECT_EQ(paddle::framework::get<0>(a), 3); + EXPECT_EQ(paddle::framework::get<1>(a), 4); + + // construct a Dim on the GPU + thrust::device_vector> t(2); + test<<<1, 1>>>(thrust::raw_pointer_cast(t.data())); + a = t[0]; + EXPECT_EQ(paddle::framework::get<0>(a), 5); + EXPECT_EQ(paddle::framework::get<1>(a), 6); + + // linearization + auto b = paddle::framework::make_dim(7, 8); + EXPECT_EQ(paddle::framework::linearize(a, b), 83); + + // product + EXPECT_EQ(paddle::framework::product(a), 30); + + // mutate a Dim + paddle::framework::get<1>(b) = 10; + EXPECT_EQ(paddle::framework::get<0>(b), 7); + EXPECT_EQ(paddle::framework::get<1>(b), 10); + + // dynamic access + paddle::framework::get(b, 0) = 8; + b[1] = 11; + EXPECT_EQ(paddle::framework::get<0>(b), 8); + EXPECT_EQ(paddle::framework::get<1>(b), 11); + EXPECT_EQ(paddle::framework::get(b, 0), 8); + EXPECT_EQ(b[1], 11); + + // dynamic access on GPU + thrust::device_vector r(1); + dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data())); + int res = r[0]; + EXPECT_EQ(res, 6); + + // ex_prefix_mul + paddle::framework::Dim<3> c = + paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5)); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 12); + + // generate from an index + auto size = paddle::framework::make_dim(4, 5, 2); + c = paddle::framework::Dim<3>(14, size); + EXPECT_EQ(paddle::framework::get<0>(c), 2); + EXPECT_EQ(paddle::framework::get<1>(c), 3); + EXPECT_EQ(paddle::framework::get<2>(c), 0); + c = paddle::framework::Dim<3>(25, size); + EXPECT_EQ(paddle::framework::get<0>(c), 1); + EXPECT_EQ(paddle::framework::get<1>(c), 1); + EXPECT_EQ(paddle::framework::get<2>(c), 1); } TEST(Dim, Bool) { - auto a = paddle::framework::make_dim(3, 4); - auto b = paddle::framework::make_dim(5, 6); - auto c = paddle::framework::make_dim(3, 4); - - // in_bounds check - EXPECT_TRUE(paddle::framework::contained(a, b)); - EXPECT_FALSE(paddle::framework::contained(b, a)); - - // comparison - EXPECT_TRUE(a == a); - EXPECT_FALSE(a == b); - EXPECT_TRUE(a == c); - - // contiguous check - int x = 4, y = 5, z = 2; - paddle::framework::Dim<3> sizef(x, y, z); - paddle::framework::Dim<3> stridea(1, x, x*y); - paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y); - paddle::framework::Dim<3> stridec(1, x, 2*x*y); - EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea)); - EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb)); - EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec)); + auto a = paddle::framework::make_dim(3, 4); + auto b = paddle::framework::make_dim(5, 6); + auto c = paddle::framework::make_dim(3, 4); + + // in_bounds check + EXPECT_TRUE(paddle::framework::contained(a, b)); + EXPECT_FALSE(paddle::framework::contained(b, a)); + + // comparison + EXPECT_TRUE(a == a); + EXPECT_FALSE(a == b); + EXPECT_TRUE(a == c); } TEST(Dim, Print) { - { - std::stringstream ss; - auto a = paddle::framework::make_dim(2, 3); - ss << a; - EXPECT_EQ(ss.str(), "2, 3"); - } - { - std::stringstream ss; - ss << paddle::framework::make_dim(8); - EXPECT_EQ(ss.str(), "8"); - } + { + std::stringstream ss; + auto a = paddle::framework::make_dim(2, 3); + ss << a; + EXPECT_EQ(ss.str(), "2, 3"); + } + { + std::stringstream ss; + ss << paddle::framework::make_dim(8); + EXPECT_EQ(ss.str(), "8"); + } } diff --git a/paddle/framework/enforce.cc b/paddle/framework/enforce.cc new file mode 100644 index 
0000000000000000000000000000000000000000..644930ff989bb8935f37642c117084f580379bd7 --- /dev/null +++ b/paddle/framework/enforce.cc @@ -0,0 +1,15 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/enforce.h" diff --git a/paddle/framework/enforce.h b/paddle/framework/enforce.h index 56cb7f95647e81efef58b156002d0d378ee22820..ffce8148e9516a5720757c87685ff6bd2937977c 100644 --- a/paddle/framework/enforce.h +++ b/paddle/framework/enforce.h @@ -10,6 +10,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include #include #include @@ -58,12 +59,17 @@ class EnforceNotMet : public std::exception { /** * @brief Enforce a condition, otherwise throw an EnforceNotMet */ +#ifdef NDEBUG #define PADDLE_ENFORCE(condition, ...) \ do { \ if (UNLIKELY(!(condition))) { \ PADDLE_THROW(__VA_ARGS__); \ } \ } while (0) +#else +#define PADDLE_ENFORCE(condition, ...) \ + CHECK(condition) << ::paddle::string::Sprintf(__VA_ARGS__); +#endif } // namespace framework } // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc new file mode 100644 index 0000000000000000000000000000000000000000..b9cd732d409e6b8ab6bdddcef810597ac28fba1d --- /dev/null +++ b/paddle/framework/net.cc @@ -0,0 +1,68 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include "paddle/framework/net.h" + +namespace paddle { +namespace framework { + +void PlainNet::CompleteAddOp() { + std::unordered_set<std::string> input_set; + std::unordered_set<std::string> output_set; + std::unordered_set<std::string> temp_output; + for (auto& op : ops_) { + for (auto& ipt : op->inputs_) { + if (!Contains(output_set, ipt)) { // Not other op's output + input_set.insert(ipt); + } else { + temp_output.insert(ipt); + } + } + + for (auto& opt : op->outputs_) { + output_set.insert(opt); + } + } + inputs_.reserve(input_set.size()); + std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_)); + + outputs_.reserve(output_set.size()); + std::vector<int> tmp_index; + tmp_index.reserve(temp_output.size()); + int idx = 0; + for (auto& opt : output_set) { + if (Contains(temp_output, opt)) { + tmp_index.push_back(idx); + } + outputs_.push_back(opt); + ++idx; + } + + attrs_["temporary_index"] = tmp_index; + add_op_done_ = true; +} + +std::string PlainNet::DebugString() const { + std::ostringstream os; + os << this->type_ << ":" << std::endl; + for (auto& op : ops_) { + os << "\t" << op->DebugString() << std::endl; + } + return os.str(); +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net.h b/paddle/framework/net.h new file mode 100644 index 0000000000000000000000000000000000000000..33bb30ea0767b32e72888c9ff75970f8801f645a --- /dev/null +++ b/paddle/framework/net.h @@ -0,0 +1,105 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/framework/net_proto.pb.h" +#include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/scope.h" +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace framework { +/** + * @brief Network is also a type of Operator + * + * It manages the operators it contains. + * + * Network is the container and controller of a set of operators. + * + * A network object knows all Operators belonging to this network. Variables, + * which are inputs and outputs of these operators, are created and managed by a + * hierarchy of Scope objects. + * + * This is the base class of network; all networks should implement the APIs + * it defines. + */ +class Net : public OperatorBase { + public: + virtual void AddOp(const OperatorPtr& op) = 0; + virtual void CompleteAddOp() = 0; +}; + +using NetPtr = std::shared_ptr<Net>; + +/** + * @brief a basic implementation of Net. + * + * PlainNet is a very simple Net: it creates a list of operators and runs them + * sequentially in the order they were added. + */ +class PlainNet : public Net { + public: + /** + * Infer the shapes of all the operators' input and output variables; this + * will be called before every mini-batch + */ + void InferShape(const ScopePtr& scope) const override { + for (auto& op : ops_) { + op->InferShape(scope); + } + } + + /** + * @brief Run the network.
+ * + * Run all the operators with the `scope`; if no scope is provided, the + * default scope will be used instead. If no OpContext is provided, the + * default context will be used. + */ + void Run(const ScopePtr& scope, + const platform::DeviceContext& dev_ctx) const override { + for (auto& op : ops_) { + op->Run(scope, dev_ctx); + } + } + + /** + * @brief Add an operator by pointer + */ + void AddOp(const OperatorPtr& op) override { + PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed"); + ops_.push_back(op); + } + + void CompleteAddOp() override; + + std::string DebugString() const override; + + std::vector<OperatorPtr> ops_; + + private: + bool add_op_done_{false}; + + template <typename T, typename KeyType> + static bool Contains(T container, KeyType key) { + return container.find(key) != container.end(); + } +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f5e1c22400a73c3aa09839ef9654f87def99bc77 --- /dev/null +++ b/paddle/framework/net_op_test.cc @@ -0,0 +1,67 @@ +#include +#include +#include +#include + +namespace pd = paddle::framework; + +static int infer_shape_cnt = 0; +static int run_cnt = 0; + +class TestOp : public pd::OperatorBase { + public: + void InferShape(const paddle::framework::ScopePtr& scope) const override { + ++infer_shape_cnt; + } + void Run(const paddle::framework::ScopePtr& scope, + const paddle::platform::DeviceContext& dev_ctx) const override { + ++run_cnt; + } +}; + +template <typename T> +void AssertSameVectorWithoutOrder(const std::vector<T>& expected, + const std::vector<T>& actual) { + ASSERT_EQ(expected.size(), actual.size()); + std::unordered_set<T> expected_set; + for (auto& tmp : expected) { + expected_set.insert(tmp); + } + for (auto& act : actual) { + ASSERT_NE(expected_set.end(), expected_set.find(act)); + } +} + +TEST(OpKernel, all) { + auto net = std::make_shared<PlainNet>(); + ASSERT_NE(net, nullptr); + + auto op1 = std::make_shared<TestOp>(); + op1->inputs_ = {"x", "w1", "b1"}; + op1->outputs_ = {"y"}; + net->AddOp(op1); + + auto op2 = std::make_shared<TestOp>(); + op2->inputs_ = {"y", "w2", "b2"}; + op2->outputs_ = {"z"}; + net->AddOp(op2); + + net->CompleteAddOp(); + AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_); + AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_); + auto tmp_idx_iter = net->attrs_.find("temporary_index"); + ASSERT_NE(net->attrs_.end(), tmp_idx_iter); + auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second); + ASSERT_EQ(1UL, tmp_idx.size()); + ASSERT_EQ("y", net->outputs_[tmp_idx[0]]); + + auto scope = std::make_shared<Scope>(); + paddle::platform::CPUDeviceContext dev_ctx; + + net->InferShape(scope); + net->Run(scope, dev_ctx); + ASSERT_EQ(2, infer_shape_cnt); + ASSERT_EQ(2, run_cnt); + + ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet); +} diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto new file mode 100644 index 0000000000000000000000000000000000000000..0779f49fe2a9a6d0d1ea5ec11ba3befeb0a67fa1 --- /dev/null +++ b/paddle/framework/net_proto.proto @@ -0,0 +1,15 @@ +syntax="proto2"; +package paddle.framework; + +import "op_proto.proto"; + +message NetDesc { + // network identification + optional string name = 1; + // operators contained in the network + repeated OpProto operators = 2; + // network type to run with.
e.g "plainNet", "DAG" + optional string net_type = 3; + // num worker always + optional int32 num_workers = 4; +} diff --git a/paddle/framework/net_test.cc b/paddle/framework/net_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a8e31c1497519ce60da004bc0a3e52403593497c --- /dev/null +++ b/paddle/framework/net_test.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" + +#include + +namespace paddle { +namespace framework { +class FakeFC : public Operator {} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_proto.proto b/paddle/framework/op_proto.proto index 22df6f9c6b70277ddbf31e0432401889e3aa7483..596b8588e783722362815f75db876931f83484ec 100644 --- a/paddle/framework/op_proto.proto +++ b/paddle/framework/op_proto.proto @@ -34,6 +34,11 @@ message AttrProto { // Supported attribute comments. It helps 3rd-party language generate doc-string. required string comment = 3; + + // If that attribute is generated, it means the Paddle third language + // binding has responsibility to fill that attribute. End-User should + // not set that attribute. + optional bool generated = 4 [default=false]; } // Input or output message for 3rd-party language binding. @@ -45,6 +50,40 @@ message VarProto { // The comment for that input. It helps 3rd-party language generate doc-string. required string comment = 2; + + // Is that input/output could be a list or not. + // If so, that Op should write a attributed named `input_format` or + // `output_format`. + // + // e.g. + // If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W` + // could be multiple, so the multiple of `X` and `W` is True, and OpDesc + // will hold a attribute of them. + // + // The Op desc of same fc could be + // { + // "type": "fc", + // "input": ["X1", "X2", "W1", "W2", "b"], + // "output": "fc.out", + // "attrs" : { + // "input_format": [0, 2, 4, 5] + // } + // } + // + optional bool multiple = 3 [default=false]; + + // It marks that output is a temporary output. That output is not used by + // user, but used by other op internally as input. If other op is not use + // that output, it could be optimized early. + // + // Attribute temporary_index will be set in OpDesc if there is some + // outputs are temporary. + // + // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"], + // attrs = { + // "temporary_index": [1] + // } + optional bool temporary = 4 [default=false]; } // Op protocol message for 3rd-party language binding. diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..1d14535c50b542733663a6900a8b5f2033290ea6 --- /dev/null +++ b/paddle/framework/op_registry.cc @@ -0,0 +1,50 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +namespace paddle { +namespace framework { + +template <> +void AttrTypeHelper::SetAttrType(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::INT); +} + +template <> +void AttrTypeHelper::SetAttrType(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::FLOAT); +} + +template <> +void AttrTypeHelper::SetAttrType(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::STRING); +} + +template <> +void AttrTypeHelper::SetAttrType>(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::INTS); +} + +template <> +void AttrTypeHelper::SetAttrType>(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::FLOATS); +} + +template <> +void AttrTypeHelper::SetAttrType>(AttrProto* attr) { + attr->set_type(paddle::framework::AttrType::STRINGS); +} +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h index 81241b5342d8900c205dd62f2a62dc2496010560..c41fe10729501698fd07f59456f64ac26df77f08 100644 --- a/paddle/framework/op_registry.h +++ b/paddle/framework/op_registry.h @@ -1,26 +1,18 @@ #pragma once +#include +#include +#include +#include +#include #include "paddle/framework/attr_checker.h" - -//#include "paddle/framework/op_base.h" #include "paddle/framework/op_desc.pb.h" #include "paddle/framework/op_proto.pb.h" +#include "paddle/framework/operator.h" namespace paddle { namespace framework { -//==================For test================// -class OpBase { - public: - std::vector inputs_; - std::vector outputs_; - AttributeMap attr_map_; - - virtual std::string Run() const = 0; - virtual ~OpBase() {} -}; -//=========================================// - // helper class to set attribute type struct AttrTypeHelper { template @@ -64,190 +56,344 @@ struct AttrTypeHelper { } }; -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOAT); -} - -template <> -void AttrTypeHelper::SetAttrType(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRING); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::INTS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::FLOATS); -} - -template <> -void AttrTypeHelper::SetAttrType>(AttrProto* attr) { - attr->set_type(paddle::framework::AttrType::STRINGS); -} - // this class not only make proto but also init attribute checkers. 
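// A hypothetical maker, sketched only to show the intended subclass shape
// (the op name and attribute below are illustrative, not part of this patch):
//
//   class CosineOpMaker : public OpProtoAndCheckerMaker {
//    public:
//     CosineOpMaker(OpProto* proto, OpAttrChecker* op_checker)
//         : OpProtoAndCheckerMaker(proto, op_checker) {
//       AddInput("input", "input of cosine op");
//       AddOutput("output", "output of cosine op");
//       AddAttr<float>("scale", "scale of cosine op").SetDefault(1.0);
//       AddComment("This is the cosine op");
//     }
//   };
//
// OpRegistry::RegisterOp instantiates such a maker and calls Validate() on it,
// which runs CheckNoDuplicatedInOutAttrs() before the proto is used.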
 class OpProtoAndCheckerMaker {
  public:
   OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
       : proto_(proto), op_checker_(op_checker) {}
 
+  ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+
+  void Validate() {
+    validated_ = true;
+    CheckNoDuplicatedInOutAttrs();
+  }
+
  protected:
-  void AddInput(const std::string& name, const std::string& comment) {
+  void AddInput(const std::string& name, const std::string& comment,
+                bool multiple = false) {
     auto input = proto_->mutable_inputs()->Add();
-    *(input->mutable_name()) = name;
-    *(input->mutable_comment()) = comment;
+    *input->mutable_name() = name;
+    *input->mutable_comment() = comment;
+    input->set_multiple(multiple);
+    if (multiple) {
+      SetHasMultipleInput();
+    }
+  }
+
+  void AddInputs(const std::string& name, const std::string& comment) {
+    AddInput(name, comment, true);
   }
 
-  void AddOutput(const std::string& name, const std::string& comment) {
+  void AddOutput(const std::string& name, const std::string& comment,
+                 bool temporary = false, bool multiple = false) {
     auto output = proto_->mutable_outputs()->Add();
-    *(output->mutable_name()) = name;
-    *(output->mutable_comment()) = comment;
+    *output->mutable_name() = name;
+    *output->mutable_comment() = comment;
+    output->set_multiple(multiple);
+    if (multiple) {
+      SetHasMultipleOutput();
+    }
+    output->set_temporary(temporary);
+    if (temporary) {
+      SetHasTemporaryOutput();
+    }
+  }
+
+  void AddOutputs(const std::string& name, const std::string& comment,
+                  bool temporary = false) {
+    AddOutput(name, comment, temporary, true);
   }
 
   template <typename T>
   TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment) {
+                               const std::string& comment,
+                               bool generated = false) {
     auto attr = proto_->mutable_attrs()->Add();
-    *(attr->mutable_name()) = name;
-    *(attr->mutable_comment()) = comment;
+    *attr->mutable_name() = name;
+    *attr->mutable_comment() = comment;
+    attr->set_generated(generated);
     AttrTypeHelper::SetAttrType<T>(attr);
     return op_checker_->AddAttrChecker<T>(name);
   }
 
-  void AddType(const std::string& op_type) { proto_->set_type(op_type); }
-
   void AddComment(const std::string& comment) {
     *(proto_->mutable_comment()) = comment;
   }
 
+ private:
+  void SetHasMultiple(const std::string& in_out, bool* flag) {
+    if (!*flag) {
+      AddAttr<std::vector<int>>(in_out + "_format",
+                                "The multiple index of " + in_out +
+                                    "\n"
+                                    R"DOC(
+This attribute is used by the Paddle core framework. Each input or output of a
+Paddle Op may be a list of variables; this attribute shows how that list is
+organized.
+
+e.g.
+  input = ["a", "b", "c", "d", "e", "f"]
+  input_format = [0, 4, 5, 6]
+
+means
+  This op has six input variables in total, and they are segmented into
+  three inputs.
+
+  The first input is input[0:4], the second is input[4:5], and the third is
+  input[5:6].
+)DOC",
+                                /*generated*/ true);
+      *flag = true;
+    }
+  }
+
+  void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); }
+  void SetHasMultipleOutput() {
+    SetHasMultiple("output", &has_multiple_output_);
+  }
+
+  void SetHasTemporaryOutput() {
+    if (!has_temporary_output_) {
+      AddAttr<std::vector<int>>("temporary_index",
+                                R"DOC(The temporary indices of outputs.
+
+Not every output of a Paddle Op is used by the user. For faster computation,
+an op may expose some of its internal state as extra outputs that other ops
+consume directly as inputs.
+
+Marking which outputs are temporary helps future optimization.
+)DOC",
+                                /*generated*/ true)
+          .SetDefault(std::vector<int>());
+      has_temporary_output_ = true;
+    }
+  }
+
+  void CheckNoDuplicatedInOutAttrs() {
+    std::unordered_set<std::string> names;
+    auto checker = [&](const std::string& name) {
+      PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+      names.insert(name);
+    };
+    for (auto& attr : proto_->attrs()) {
+      checker(attr.name());
+    }
+    for (auto& input : proto_->inputs()) {
+      checker(input.name());
+    }
+    for (auto& output : proto_->outputs()) {
+      checker(output.name());
+    }
+  }
+
   OpProto* proto_;
   OpAttrChecker* op_checker_;
+  bool validated_{false};
+  bool has_multiple_input_{false};
+  bool has_multiple_output_{false};
+  bool has_temporary_output_{false};
 };
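A hedged sketch of a maker that exercises the new multiple/temporary arguments above; the op and argument names are invented for this note and are not defined by the patch. A maker like this is normally passed to `REGISTER_OP`, which constructs it and calls `Validate()`:

```cpp
// Illustrative only; "my_fc" is not an op added by this patch.
class MyFcProtoAndCheckerMaker : public paddle::framework::OpProtoAndCheckerMaker {
 public:
  MyFcProtoAndCheckerMaker(paddle::framework::OpProto* proto,
                           paddle::framework::OpAttrChecker* checker)
      : OpProtoAndCheckerMaker(proto, checker) {
    AddInputs("X", "a list of input feature maps");  // sets multiple = true
    AddInput("b", "the shared bias");
    AddOutput("Before", "intermediate result", /*temporary=*/true);
    AddOutput("Out", "the final output");
    AddAttr<float>("scale", "scaling factor").SetDefault(1.0f);
    AddComment("Illustration of multiple and temporary arguments.");
  }
};
```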
+)DOC", + /*generated*/ true) + .SetDefault(std::vector()); + has_temporary_output_ = true; + } + } + + void CheckNoDuplicatedInOutAttrs() { + std::unordered_set names; + auto checker = [&](const std::string& name) { + PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name); + names.insert(name); + }; + for (auto& attr : proto_->attrs()) { + checker(attr.name()); + } + for (auto& input : proto_->inputs()) { + checker(input.name()); + } + for (auto& output : proto_->outputs()) { + checker(output.name()); + } + } + OpProto* proto_; OpAttrChecker* op_checker_; + bool validated_{false}; + bool has_multiple_input_{false}; + bool has_multiple_output_{false}; + bool has_temporary_output_{false}; }; class OpRegistry { - typedef std::function OpCreator; + using OpCreator = std::function; + using VarIndexMap = std::unordered_map; + using VarNameList = std::vector; public: template static void RegisterOp(const std::string& op_type) { - creators_[op_type] = []() { return new OpType; }; - OpProto& op_proto = protos_[op_type]; - OpAttrChecker& op_checker = op_checkers_[op_type]; - ProtoMakerType(&op_proto, &op_checker); - PADDLE_ENFORCE(op_proto.IsInitialized() == true, - "Fail to initialize %s's OpProto !", op_type); - } - - static OpBase* CreateOp(const OpDesc& op_desc) { - std::string op_type = op_desc.type(); - OpBase* op = (creators_.at(op_type))(); - (op->inputs_).resize(op_desc.inputs_size()); - for (int i = 0; i < op_desc.inputs_size(); ++i) { - (op->inputs_)[i] = op_desc.inputs(i); - } - (op->outputs_).resize(op_desc.outputs_size()); - for (int i = 0; i < op_desc.outputs_size(); ++i) { - (op->outputs_)[i] = op_desc.outputs(i); + creators()[op_type] = [] { return new OpType; }; + OpProto& op_proto = protos()[op_type]; + OpAttrChecker& op_checker = op_checkers()[op_type]; + auto maker = ProtoMakerType(&op_proto, &op_checker); + maker.Validate(); + *op_proto.mutable_type() = op_type; + PADDLE_ENFORCE( + op_proto.IsInitialized(), + "Fail to initialize %s's OpProto, because %s is not initialized", + op_type, op_proto.InitializationErrorString()); + + VarIndexMaps()[op_type].reset(new VarIndexMap()); + auto& varmap = *VarIndexMaps()[op_type]; + int idx = 0; + for (auto& var : op_proto.inputs()) { + varmap[var.name()] = idx++; } - for (int i = 0; i < op_desc.attrs_size(); ++i) { - const AttrDesc& ith_attr = op_desc.attrs(i); - std::string name = ith_attr.name(); - (op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr); + idx = 0; + for (auto& var : op_proto.outputs()) { + varmap[var.name()] = idx++; } - const OpAttrChecker& op_checker = op_checkers_.at(op_type); - op_checker.Check(op->attr_map_); - return op; } - private: - static std::unordered_map creators_; - static std::unordered_map protos_; - static std::unordered_map op_checkers_; -}; - -std::unordered_map> OpRegistry::creators_; -std::unordered_map OpRegistry::protos_; -std::unordered_map OpRegistry::op_checkers_; + static OperatorPtr CreateOp(const std::string& type, + const VarNameList& inputs, + const VarNameList& outputs, + const AttributeMap& attrs) { + auto op_create_it = creators().find(type); + PADDLE_ENFORCE(op_create_it != creators().end(), + "Operator %s cannot be found", type); + + auto op = op_create_it->second(); + op->type_ = type; + op->inputs_ = inputs; + op->outputs_ = outputs; + op->attrs_ = attrs; + op_checkers().at(type).Check(op->attrs_); + + GenerateTempVariableName(op); + + { + auto var_index_it = VarIndexMaps().find(type); + if (var_index_it != VarIndexMaps().end()) { + op->in_out_idxs_ = 
-template <typename OpType, typename ProtoMakerType>
-class OpRegisterHelper {
- public:
-  OpRegisterHelper(std::string op_type) {
-    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
-  }
-};
-
-#define REGISTER_OP(__op_class, __op_maker_class, __op_type)         \
-  class __op_class##Register {                                       \
-   private:                                                          \
-    const static OpRegisterHelper<__op_class, __op_maker_class> reg; \
-  };                                                                 \
-  const OpRegisterHelper<__op_class, __op_maker_class>               \
-      __op_class##Register::reg(#__op_type);
-
-// Demos
-
-class CosineOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg = "CosineOp runs! scale = " +
-                      std::to_string(boost::get<float>(attr_map_.at("scale")));
-    return msg;
-  }
-};
-
-class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
-    AddType("cos");
-    AddComment("This is cos op");
-  }
-};
-
-REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
-
-class MyTestOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg =
-        "MyTestOp runs! test_attr = " +
-        std::to_string(boost::get<int>(attr_map_.at("test_attr")));
-    return msg;
-  }
-};
-
-class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
-    };
-    AddAttr<int>("test_attr", "a simple test attribute")
-        .AddCustomChecker(my_checker);
-    AddType("my_test_op");
-    AddComment("This is my_test op");
-  }
-};
-
-REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+template <typename OpType, typename ProtoMakerType>
+class OpRegisterHelper {
+ public:
+  OpRegisterHelper(const char* op_type) {
+    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
+  }
+};
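For orientation, a hedged sketch of the flow this registry enables. `ScaleOp` and `ScaleOpMaker` are invented stand-ins for an `OperatorBase` subclass and its maker, and `REGISTER_OP` is the macro defined just below:

```cpp
// Illustrative only: register once at namespace scope, then create by name.
REGISTER_OP(scale, ScaleOp, ScaleOpMaker);

paddle::framework::OperatorPtr MakeScaleOp() {
  // CreateOp runs the attribute checkers (which also fill in defaults),
  // renames @TEMP@ outputs, and calls op->Init() before returning.
  return paddle::framework::OpRegistry::CreateOp(
      "scale", /*inputs=*/{"X"}, /*outputs=*/{"Out"}, /*attrs=*/{});
}
```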
+/**
+ * Check that a macro is used in the GLOBAL NAMESPACE.
+ */
+#define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
+  struct __test_global_namespace_##uniq_name##__ {};                          \
+  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
+                             __test_global_namespace_##uniq_name##__>::value, \
+                msg)
+
+/**
+ * Macro to register an Operator.
+ */
+#define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
+                                 "REGISTER_OP must be in global namespace"); \
+  static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \
+      __op_register_##__op_type##__(#__op_type);                             \
+  int __op_register_##__op_type##_handle__() { return 0; }
+
+/**
+ * Macro to register an OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
+      "REGISTER_OP_KERNEL must be in global namespace");                  \
+  struct __op_kernel_register__##type##__ {                               \
+    __op_kernel_register__##type##__() {                                  \
+      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
+      key.place_ = PlaceType();                                           \
+      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
+          .reset(new __VA_ARGS__());                                      \
+    }                                                                     \
+  };                                                                      \
+  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
+
+// (type, KernelType)
+#define REGISTER_OP_GPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+
+// (type, KernelType)
+#define REGISTER_OP_CPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+
+/**
+ * Macros to mark which Operators and Kernels we will use, and to tell the
+ * compiler to link them into the target.
+ */
+#define USE_OP_WITHOUT_KERNEL(op_type)                      \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
+      __use_op_without_kernel_##op_type,                    \
+      "USE_OP_WITHOUT_KERNEL must be in global namespace"); \
+  extern int __op_register_##op_type##_handle__();          \
+  static int __use_op_ptr_##op_type##_without_kernel__      \
+      __attribute__((unused)) = __op_register_##op_type##_handle__()
+
+#define USE_OP_KERNEL(op_type, DEVICE_TYPE)                               \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __use_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
+      "USE_OP_KERNEL must be in global namespace");                       \
+  extern int __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__(); \
+  static int __use_op_ptr_##op_type##_##DEVICE_TYPE##_kernel__            \
+      __attribute__((unused)) =                                           \
+          __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
+
+// Use an Operator that has only a CPU kernel.
+#define USE_OP_CPU(op_type)       \
+  USE_OP_WITHOUT_KERNEL(op_type); \
+  USE_OP_KERNEL(op_type, CPU)
+
+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
+#else
+#define USE_OP(op_type)        \
+  USE_OP_CPU(op_type);         \
+  USE_OP_KERNEL(op_type, GPU)
+#endif
 
 }  // namespace framework
 }  // namespace paddle
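A usage sketch for the linker-pulling macros above. The op names are illustrative; which macro applies depends on whether the op was registered with kernels:

```cpp
// In a translation unit that needs statically registered ops at link time,
// e.g. a test binary or a Python binding module.
#include "paddle/framework/op_registry.h"

USE_OP(add_two);            // an op registered together with CPU/GPU kernels
USE_OP_WITHOUT_KERNEL(fc);  // an op registered without an OpKernel
```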
scale = " + std::to_string(3.3); - ASSERT_EQ(str.size(), debug_str.size()); - for (size_t i = 0; i < debug_str.length(); ++i) { - ASSERT_EQ(debug_str[i], str[i]); - } + auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext dev_ctx; + op->Run(scope, dev_ctx); + float scale_get = op->GetAttr("scale"); + ASSERT_EQ(scale_get, scale); } TEST(OpRegistry, IllegalAttr) { @@ -35,7 +89,7 @@ TEST(OpRegistry, IllegalAttr) { bool caught = false; try { - paddle::framework::OpBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -54,15 +108,22 @@ TEST(OpRegistry, DefaultValue) { op_desc.add_inputs("aa"); op_desc.add_outputs("bb"); - paddle::framework::OpBase* op = + ASSERT_TRUE(op_desc.IsInitialized()); + + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); - std::string debug_str = op->Run(); - float default_value = 1.0; - std::string str = "CosineOp runs! scale = " + std::to_string(default_value); - ASSERT_EQ(str.size(), debug_str.size()); - for (size_t i = 0; i < debug_str.length(); ++i) { - ASSERT_EQ(debug_str[i], str[i]); - } + auto scope = std::make_shared(); + paddle::platform::CPUDeviceContext dev_ctx; + op->Run(scope, dev_ctx); + ASSERT_EQ(op->GetAttr("scale"), 1.0); +} + +static void SetInputFormat(paddle::framework::OpDesc* desc) { + auto attr = desc->add_attrs(); + attr->set_name("input_format"); + attr->set_type(paddle::framework::INTS); + attr->mutable_ints()->Add(0); + attr->mutable_ints()->Add(1); } TEST(OpRegistry, CustomChecker) { @@ -70,11 +131,12 @@ TEST(OpRegistry, CustomChecker) { op_desc.set_type("my_test_op"); op_desc.add_inputs("ii"); op_desc.add_outputs("oo"); + SetInputFormat(&op_desc); // attr 'test_attr' is not set bool caught = false; try { - paddle::framework::OpBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -93,7 +155,7 @@ TEST(OpRegistry, CustomChecker) { attr->set_i(3); caught = false; try { - paddle::framework::OpBase* op __attribute__((unused)) = + paddle::framework::OperatorPtr op __attribute__((unused)) = paddle::framework::OpRegistry::CreateOp(op_desc); } catch (paddle::framework::EnforceNotMet err) { caught = true; @@ -111,12 +173,44 @@ TEST(OpRegistry, CustomChecker) { attr->set_name("test_attr"); attr->set_type(paddle::framework::AttrType::INT); attr->set_i(4); - paddle::framework::OpBase* op = + SetInputFormat(&op_desc); + paddle::framework::OperatorPtr op = paddle::framework::OpRegistry::CreateOp(op_desc); - std::string debug_str = op->Run(); - std::string str = "MyTestOp runs! 
test_attr = " + std::to_string(4); - ASSERT_EQ(str.size(), debug_str.size()); - for (size_t i = 0; i < debug_str.length(); ++i) { - ASSERT_EQ(debug_str[i], str[i]); + paddle::platform::CPUDeviceContext dev_ctx; + auto scope = std::make_shared(); + op->Run(scope, dev_ctx); + int test_attr = op->GetAttr("test_attr"); + ASSERT_EQ(test_attr, 4); +} + +class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddAttr("scale", "scale of test op"); + AddAttr("scale", "scale of test op"); + } +}; + +TEST(ProtoMaker, DuplicatedAttr) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet); +} + +class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker { + public: + TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("input", "input of test op"); + AddInput("input", "input of test op"); } -} \ No newline at end of file +}; + +TEST(ProtoMaker, DuplicatedInOut) { + pd::OpProto op_proto; + pd::OpAttrChecker op_checker; + auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker); + ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet); +} diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc new file mode 100644 index 0000000000000000000000000000000000000000..1e57e9a20f3eecfac266d67276347ad4b5b780f9 --- /dev/null +++ b/paddle/framework/operator.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1e57e9a20f3eecfac266d67276347ad4b5b780f9
--- /dev/null
+++ b/paddle/framework/operator.cc
@@ -0,0 +1,103 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <sstream>
+
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+template <>
+Eigen::DefaultDevice* KernelContext::GetEigenDevice<
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
+  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice*
+KernelContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+}
+#endif
+
+const std::string& OperatorBase::Input(const std::string& name) const {
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+
+  if (attrs_.count("input_format") == 0) {
+    return inputs_[it->second];
+  } else {
+    const auto& input_format = GetAttr<std::vector<int>>("input_format");
+    int idx = input_format[it->second];
+    return inputs_.at(idx);
+  }
+}
+
+std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  auto input_format = GetAttr<std::vector<int>>("input_format");
+  auto offset = in_out_idxs_->at(name);
+
+  return std::vector<std::string>{
+      inputs_.begin() + input_format.at(offset),
+      inputs_.begin() + input_format.at(offset + 1)};
+}
+
+const std::string& OperatorBase::Output(const std::string& name) const {
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+
+  if (attrs_.count("output_format") == 0) {
+    return outputs_[it->second];
+  } else {
+    const auto& output_format = GetAttr<std::vector<int>>("output_format");
+    int idx = output_format[it->second];
+    return outputs_.at(idx);
+  }
+}
+
+std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  auto output_format = GetAttr<std::vector<int>>("output_format");
+  auto offset = in_out_idxs_->at(name);
+
+  return std::vector<std::string>{
+      outputs_.begin() + output_format.at(offset),
+      outputs_.begin() + output_format.at(offset + 1)};
+}
+
+std::string OperatorBase::DebugString() const {
+  std::stringstream ss;
+  ss << "Op(" << type_ << "), inputs:(";
+  for (size_t i = 0; i < inputs_.size(); ++i) {
+    ss << inputs_[i];
+    if (i != inputs_.size() - 1) {
+      ss << ", ";
+    }
+  }
+  ss << "), outputs:(";
+  for (size_t i = 0; i < outputs_.size(); ++i) {
+    ss << outputs_[i];
+    if (i != outputs_.size() - 1) {
+      ss << ", ";
+    }
+  }
+  ss << ").";
+  return ss.str();
+}
+
+}  // namespace framework
+}  // namespace paddle
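As a self-contained worked example of the segmentation rule in `Inputs()` above (the names and offsets are illustrative): for an op whose proto declares arguments `X`, `W`, `b`, `in_out_idxs_` maps `W` to offset 1, so `Inputs("W")` returns the name slice `inputs_[input_format[1] : input_format[2]]`:

```cpp
#include <cassert>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> inputs = {"x0", "x1", "w0", "w1", "b"};
  std::vector<int> input_format = {0, 2, 4, 5};
  int offset = 1;  // in_out_idxs_ would map "W" to 1
  std::vector<std::string> w(inputs.begin() + input_format[offset],
                             inputs.begin() + input_format[offset + 1]);
  assert(w == (std::vector<std::string>{"w0", "w1"}));  // inputs[2:4]
}
```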
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f046d6293d5dbb9fd594b0c13aa8d62012cf915
--- /dev/null
+++ b/paddle/framework/operator.h
@@ -0,0 +1,254 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/utils/Error.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+struct EigenDeviceConverter;
+
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+#endif
+
+class OperatorBase;
+using OperatorPtr = std::shared_ptr<OperatorBase>;
+/**
+ * OperatorBase has the basic elements that a Net calls to do computation.
+ * Only CreateOperator from OpRegistry may construct an Operator directly.
+ * Users should always construct a proto message OpDesc and call
+ * OpRegistry::CreateOp(op_desc) to get an Operator instance.
+ */
+class OperatorBase {
+ public:
+  /// If a variable is an empty variable, this name will be used.
+  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
+
+  /// If a variable is a temporary variable, this name will be set in Python,
+  /// but it will be converted to a unique name in the scope after OpCreator.
+  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
+
+  virtual ~OperatorBase() {}
+
+  template <typename T>
+  inline const T& GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+  virtual std::string DebugString() const;
+
+  /// Init will be called after CreateOperator; you can put some
+  /// initialization logic here.
+  virtual void Init() {}
+
+  /// InferShape infers the sizes of the Variables used by this Operator from
+  /// information inside the scope.
+  virtual void InferShape(const ScopePtr& scope) const = 0;
+
+  /// Net will call this function to Run an op.
+  virtual void Run(const ScopePtr& scope,
+                   const platform::DeviceContext& dev_ctx) const = 0;
+
+  // Get an input with the argument name described in `op_proto`.
+  const std::string& Input(const std::string& name) const;
+  // Get an input which has multiple variables.
+  // TODO: add a vector_view to prevent memory copies.
+  std::vector<std::string> Inputs(const std::string& name) const;
+  // Get an output with the argument name described in `op_proto`.
+  const std::string& Output(const std::string& name) const;
+  // Get an output which has multiple variables.
+  // TODO: add a vector_view to prevent memory copies.
+  std::vector<std::string> Outputs(const std::string& name) const;
+
+ public:
+  std::string type_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+  AttributeMap attrs_;
+  // stores the arguments' offsets described in op_desc.
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
+};
+class KernelContext {
+ public:
+  KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
+                const platform::DeviceContext& device_context)
+      : op_(*op), scope_(scope), device_context_(device_context) {}
+
+  const Variable* Input(int index) const {
+    return scope_->GetVariable(op_.inputs_[index]);
+  }
+
+  Variable* Output(int index) const {
+    return scope_->GetVariable(op_.outputs_[index]);
+  }
+
+  const Variable* Input(const std::string& name) const {
+    return scope_->GetVariable(op_.Input(name));
+  }
+
+  const Variable* Output(const std::string& name) const {
+    return scope_->GetVariable(op_.Output(name));
+  }
+
+  const std::vector<const Variable*> Inputs(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const Variable*> res;
+    res.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_->GetVariable(name); });
+    return res;
+  }
+
+  const std::vector<Variable*> Outputs(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<Variable*> res;
+    res.reserve(names.size());
+    std::transform(
+        names.begin(), names.end(), std::back_inserter(res),
+        [this](const std::string& name) { return scope_->GetVariable(name); });
+    return res;
+  }
+
+  template <typename PlaceType,
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType* GetEigenDevice() const;
+
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+
+  const OperatorBase& op_;
+  const std::shared_ptr<Scope>& scope_;
+  const platform::DeviceContext& device_context_;
+};
+
+class OpKernel {
+ public:
+  /**
+   * KernelContext is the only parameter of the kernel's Compute function.
+   * Compute gets input/output variables, state such as momentum, and
+   * device resources such as the CUDA stream or cuBLAS handle from the
+   * KernelContext. Users should construct it before running the Operator.
+   */
+  virtual void Compute(const KernelContext& context) const = 0;
+
+  virtual ~OpKernel() {}
+};
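To illustrate the kernel contract, a hedged sketch of a kernel that only uses the `KernelContext` and `Tensor` APIs introduced in this patch. The kernel itself is invented for this note, and it assumes the `Variable::Get` / `Variable::GetMutable` accessors from the scope headers:

```cpp
#include "paddle/framework/operator.h"

// Illustrative-only kernel: copies input 0 into output 0 on CPU.
class IdentityKernel : public paddle::framework::OpKernel {
 public:
  void Compute(const paddle::framework::KernelContext& ctx) const override {
    auto& src = ctx.Input(0)->Get<paddle::framework::Tensor>();
    auto* dst = ctx.Output(0)->GetMutable<paddle::framework::Tensor>();
    // Tensor::CopyFrom (defined in tensor.h below) is CPU-only for now.
    dst->CopyFrom<float>(src, ctx.GetPlace());
  }
};
```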
+template <typename T>
+struct VarToTensor {};
+
+template <>
+struct VarToTensor<Tensor*> {
+  Tensor* operator()(Variable* var) { return var->GetMutable<Tensor>(); }
+};
+
+template <>
+struct VarToTensor<const Tensor*> {
+  const Tensor* operator()(Variable* var) { return &var->Get<Tensor>(); }
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
+  struct OpKernelKey {
+    platform::Place place_;
+
+    OpKernelKey() = default;
+    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+      place_ = dev_ctx.GetPlace();
+    }
+
+    bool operator==(const OpKernelKey& o) const { return place_ == o.place_; }
+  };
+
+  struct OpKernelHash {
+    std::hash<bool> hash_;
+    size_t operator()(const OpKernelKey& key) const {
+      return hash_(platform::is_gpu_place(key.place_));
+    }
+  };
+
+  using OpKernelMap =
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+
+  void Run(const ScopePtr& scope,
+           const platform::DeviceContext& dev_ctx) const final {
+    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
+    opKernel->Compute(KernelContext(this, scope, dev_ctx));
+  }
+
+  static std::unordered_map<std::string, OpKernelMap>& AllOpKernels() {
+    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
+    return g_all_op_kernels;
+  }
+
+  void InferShape(const std::shared_ptr<Scope>& scope) const final {
+    std::vector<const Tensor*> ins;
+    VarNamesToTensors(scope, inputs_, &ins);
+    std::vector<Tensor*> outs;
+    VarNamesToTensors(scope, outputs_, &outs);
+    InferShape(ins, outs);
+  };
+
+ private:
+  template <typename T>
+  void VarNamesToTensors(const std::shared_ptr<Scope>& scope,
+                         const std::vector<std::string>& var_names,
+                         std::vector<T>* container) const {
+    container->reserve(var_names.size());
+    VarToTensor<T> convert;
+    for (auto& name : var_names) {
+      auto var = scope->GetVariable(name);
+      if (var != nullptr) {
+        container->push_back(convert(var));
+      } else {
+        container->push_back(nullptr);
+      }
+    }
+  }
+
+ protected:
+  virtual void InferShape(const std::vector<const Tensor*>& inputs,
+                          const std::vector<Tensor*>& outputs) const = 0;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8e55d0111f39b2f632cf5a49c2ad3f210683652c
--- /dev/null
+++ b/paddle/framework/operator_test.cc
@@ -0,0 +1,241 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/operator.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+static int op_run_num = 0;
+
+class OpWithoutKernelTest : public OperatorBase {
+ public:
+  void Init() override { x = 1; }
+  void InferShape(const ScopePtr& scope) const override {}
+  void Run(const ScopePtr& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    op_run_num++;
+    ASSERT_EQ((int)inputs_.size(), 1);
+    ASSERT_EQ((int)outputs_.size(), 1);
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+  }
+
+ public:
+  float x = 0;
+};
+
+class OpWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+                                          OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddOutput("output", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op");
+    AddComment("This is test op");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
+            paddle::framework::OpWithoutKernelTestProtoAndCheckerMaker);
+
+TEST(OperatorBase, all) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("test_operator");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUDeviceContext device_context;
+  auto scope = std::make_shared<paddle::framework::Scope>();
+
+  paddle::framework::OperatorPtr op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  scope->CreateVariable("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
+  op->Run(scope, device_context);
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
+}
+
+namespace paddle {
+namespace framework {
+
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "input of test op");
+    AddOutput("y", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+static int cpu_kernel_run_num = 0;
+
+class OpWithKernelTest : public OperatorWithKernel {
+ protected:
+  void InferShape(const std::vector<const Tensor*>& inputs,
+                  const std::vector<Tensor*>& outputs) const override {}
+};
+
+template <typename T1, typename T2>
+class CPUKernelTest : public OpKernel {
+ public:
+  void Compute(const KernelContext& ctx) const {
+    std::cout << "this is cpu kernel" << std::endl;
+    std::cout << ctx.op_.DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ(ctx.op_.Input("x"), "IN1");
+    ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
+  }
+};
+
+// multiple inputs test
+class OperatorMultiInputsTest : public OperatorBase {
+ public:
+  void Init() override { x = 1; }
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_EQ(Input("x"), "IN1");
+    ASSERT_EQ(Input("y"), "OUT1");
+  }
+
+ public:
+  float x = 0;
+};
+
+class OpKernelTestMultiInputsProtoAndCheckerMaker
+    : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
+                                              OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInputs("xs", "inputs of test op");
+    AddInput("k", "input of test op");
+    AddOutputs("ys", "outputs of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+
+class CPUKernalMultiInputsTest : public OpKernel {
+ public:
+  void Compute(const KernelContext& ctx) const {
+    auto xs = ctx.op_.Inputs("xs");
+    ASSERT_EQ(xs.size(), 3UL);
+    ASSERT_EQ(xs[0], "x0");
+    ASSERT_EQ(xs[1], "x1");
+    ASSERT_EQ(xs[2], "x2");
+
+    auto k = ctx.op_.Input("k");
+    ASSERT_EQ(k, "k0");
+
+    auto ys = ctx.op_.Outputs("ys");
+    ASSERT_EQ(ys.size(), 2UL);
+    ASSERT_EQ(ys[0], "y0");
+    ASSERT_EQ(ys[1], "y1");
+  }
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);
+
+// test with single input
+TEST(OpKernel, all) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("op_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  paddle::platform::CPUDeviceContext cpu_device_context;
+  auto scope = std::make_shared<paddle::framework::Scope>();
+
+  paddle::framework::OperatorPtr op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
+  op->Run(scope, cpu_device_context);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+}
+
+REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
+                       paddle::framework::CPUKernalMultiInputsTest);
+
+// test with multi inputs
+TEST(OpKernel, multi_inputs) {
+  using namespace paddle::framework;
+
+  OpDesc op_desc;
+  op_desc.set_type("op_multi_inputs_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "x0";
+  *op_desc.mutable_inputs()->Add() = "x1";
+  *op_desc.mutable_inputs()->Add() = "x2";
+  *op_desc.mutable_inputs()->Add() = "k0";
+  *op_desc.mutable_outputs()->Add() = "y0";
*op_desc.mutable_outputs()->Add() = "y1"; + auto attr = op_desc.mutable_attrs()->Add(); + attr->set_name("scale"); + attr->set_type(paddle::framework::AttrType::FLOAT); + attr->set_f(3.14); + + auto attr0 = op_desc.mutable_attrs()->Add(); + attr0->set_name("input_format"); + attr0->set_type(paddle::framework::AttrType::INTS); + auto input_format = attr0->mutable_ints(); + input_format->Add(0); // x0 + input_format->Add(3); // k + input_format->Add(4); // end + + auto attr1 = op_desc.mutable_attrs()->Add(); + attr1->set_name("output_format"); + attr1->set_type(paddle::framework::AttrType::INTS); + auto output_format = attr1->mutable_ints(); + output_format->Add(0); // y0 + output_format->Add(2); // y1 + + paddle::platform::CPUDeviceContext cpu_device_context; + auto scope = std::make_shared(); + + OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc)); + op->Run(scope, cpu_device_context); +} diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h index a4470f726fb0d59a82db29b3239c111ea1569c55..ec62c9189fd2a5ea74c6d6e5635a4d500e4823e2 100644 --- a/paddle/framework/scope.h +++ b/paddle/framework/scope.h @@ -23,6 +23,9 @@ limitations under the License. */ namespace paddle { namespace framework { +class Scope; +using ScopePtr = std::shared_ptr; + /** * @brief Scope that manage all variables. * @@ -41,7 +44,7 @@ class Scope { /** * @brief Initialize a Scope with parent. */ - explicit Scope(const std::shared_ptr& parent) : parent_(parent) {} + explicit Scope(const ScopePtr& parent) : parent_(parent) {} /** * @brief Create Variable @@ -88,7 +91,7 @@ class Scope { private: std::unordered_map> vars_; - std::shared_ptr parent_{nullptr}; + ScopePtr parent_{nullptr}; }; } // namespace framework diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc new file mode 100644 index 0000000000000000000000000000000000000000..964f15ab66bca7da75824e192e61600c29e572c0 --- /dev/null +++ b/paddle/framework/tensor.cc @@ -0,0 +1,19 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +namespace paddle { +namespace framework {} +} // namespace paddle diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index ce5d98b04e6b53fcedc4fc4610d9390e64846b2a..1dd421cdb681e15486e309ff912574af35b5a0c2 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -14,84 +14,245 @@ limitations under the License. 
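A small usage sketch for the `ScopePtr` alias introduced in scope.h above; the function and variable names are illustrative:

```cpp
#include <cassert>
#include <memory>
#include "paddle/framework/scope.h"

void ScopeDemo() {
  auto parent = std::make_shared<paddle::framework::Scope>();
  paddle::framework::ScopePtr child =
      std::make_shared<paddle::framework::Scope>(parent);
  parent->CreateVariable("W");
  // GetVariable falls back to the parent scope when the child misses.
  assert(child->GetVariable("W") != nullptr);
}
```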
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index ce5d98b04e6b53fcedc4fc4610d9390e64846b2a..1dd421cdb681e15486e309ff912574af35b5a0c2 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -14,84 +14,245 @@ limitations under the License. */
 
 #pragma once
 
+#include <cstdint>
+#include <cstring>
 #include <memory>
-#include <type_traits>
+#include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/enforce.h"
+#include "paddle/framework/tensor_types.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
 namespace framework {
 
 class Tensor {
  public:
+  Tensor() : offset_(0) {}
+
   template <typename T>
   const T* data() const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tensor::data must be called after Tensor::mutable_data.");
-    return static_cast<const T*>(holder_->Ptr());
+    CheckDims<T>();
+    return reinterpret_cast<const T*>(
+        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
+  }
+
+  template <typename T>
+  T* raw_data() const {
+    CheckDims<T>();
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                offset_);
+  }
+
+  template <typename T>
+  T* mutable_data(DDim dims, platform::Place place) {
+    set_dims(dims);
+    return mutable_data<T>(place);
   }
 
-  template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims, paddle::platform::Place place) {
+  template <typename T>
+  T* mutable_data(platform::Place place) {
+    PADDLE_ENFORCE(product(dims_) > 0,
+                   "Tensor's numel must be larger than zero to call "
+                   "Tensor::mutable_data. Call Tensor::set_dim first.");
     if (holder_ == nullptr ||
-        !(holder_->Place() ==
+        !(holder_->place() ==
           place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T)) {
-      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
+        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
+      if (platform::is_cpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
+      } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#else
+        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
+#endif
+      } else {
+        PADDLE_THROW("Unknown 'place'.");
+      }
+      offset_ = 0;
+    }
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                offset_);
+  }
+
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor shaped(DDim new_dims) {
+    Eigen::array<Eigen::DenseIndex, NDIMS> dims =
+        paddle::framework::ToEigenDSizes<NDIMS>(new_dims);
+    return typename TTypes<T, NDIMS>::Tensor(raw_data<T>(), dims);
+  }
+
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor tensor() {
+    return typename TTypes<T, NDIMS>::Tensor(
+        raw_data<T>(), paddle::framework::ToEigenDSizes<NDIMS>(dims_));
+  }
+
+  // flatten to rank 1
+  template <typename T>
+  typename TTypes<T>::Flat flat() {
+    return shaped<T, 1>(make_ddim({static_cast<int>(product(dims_))}));
+  }
+
+  // to TensorType Vec
+  template <typename T>
+  typename TTypes<T>::Vec vec() {
+    return tensor<T, 1>();
+  }
+
+  // to TensorType Matrix
+  template <typename T>
+  typename TTypes<T>::Matrix matrix() {
+    return tensor<T, 2>();
+  }
+
+  // const versions of all the methods above.
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::ConstTensor shaped(DDim new_dims) const {
+    Eigen::array<Eigen::DenseIndex, NDIMS> dims =
+        paddle::framework::ToEigenDSizes<NDIMS>(new_dims);
+    return typename TTypes<T, NDIMS>::ConstTensor(data<T>(), dims);
+  }
+
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::ConstTensor tensor() const {
+    return typename TTypes<T, NDIMS>::ConstTensor(
+        data<T>(), paddle::framework::ToEigenDSizes<NDIMS>(dims_));
+  }
+
+  template <typename T>
+  typename TTypes<T>::ConstFlat flat() const {
+    return shaped<T, 1>(make_ddim({static_cast<int>(product(dims_))}));
+  }
+
+  template <typename T>
+  typename TTypes<T>::ConstVec vec() const {
+    return tensor<T, 1>();
+  }
+
+  template <typename T>
+  typename TTypes<T>::ConstMatrix matrix() const {
+    return tensor<T, 2>();
+  }
+
+  template <typename T>
+  void ShareDataFrom(const Tensor& src) {
+    src.CheckDims<T>();
+    holder_ = src.holder_;
+    set_dims(src.dims());
+    offset_ = src.offset_;
+  }
+
+  template <typename T>
+  void CopyFrom(const Tensor& src, platform::Place dst_place) {
+    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
+                       platform::is_cpu_place(dst_place),
+                   "Tensor::CopyFrom only supports CPU now.");
+    src.CheckDims<T>();
+    size_t size = product(src.dims_) * sizeof(T);
+    set_dims(src.dims());
+    const void* src_ptr = static_cast<const void*>(src.data<T>());
+    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+    memcpy(dst_ptr, src_ptr, size);
+  }
+
+  template <typename T>
+  Tensor Slice(const int& begin_idx, const int& end_idx) const {
+    CheckDims<T>();
+    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
+                   "Slice index is less than zero or out of bound.");
+    PADDLE_ENFORCE(begin_idx < end_idx,
+                   "Begin index must be less than end index.");
+    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+    std::vector<int> d = vectorize(dims_);
+    int base = 1;
+    for (size_t i = 1; i < d.size(); ++i) {
+      base *= d[i];
+    }
+    Tensor dst;
+    dst.holder_ = holder_;
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.set_dims(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
+    return dst;
+  }
+
+  void set_dims(const DDim& dims) {
+    if (dims == dims_) {
+      return;
+    }
+    dims_ = dims;
+  }
+
+  DDim dims() const { return dims_; }
+
+ private:
   // Placeholder hides type T, so it doesn't appear as a template
   // parameter of Variable.
   struct Placeholder {
     virtual ~Placeholder() {}
-    virtual void* Ptr() const = 0;
-    virtual paddle::platform::Place Place() const = 0;
-    virtual size_t Size() const = 0;
+    virtual void* ptr() const = 0;
+    virtual platform::Place place() const = 0;
+    virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
   };
 
-  template <typename T>
+  template <typename T, typename PlaceType>
   struct PlaceholderImpl : public Placeholder {
    private:
+    template <typename PType>
     class Deleter {
      public:
-      Deleter(platform::Place place) : place_(place) {}
-      void operator()(T* ptr) {
-        paddle::memory::Free(place_, static_cast<void*>(ptr));
-      }
+      Deleter(PType place) : place_(place) {}
+      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
 
      private:
-      paddle::platform::Place place_;
+      PType place_;
     };
 
    public:
-    PlaceholderImpl(paddle::platform::Place place, size_t size)
-        : ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
-               Deleter(place)),
+    PlaceholderImpl(PlaceType place, size_t size)
+        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
+               Deleter<PlaceType>(place)),
           place_(place),
           size_(size) {}
 
-    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual size_t Size() const { return size_; }
-    virtual paddle::platform::Place Place() const { return place_; }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
+    virtual size_t size() const { return size_; }
+    virtual paddle::platform::Place place() const { return place_; }
+    virtual std::type_index type() const { return std::type_index(typeid(T)); }
 
-    std::unique_ptr<T, Deleter> ptr_;
-    paddle::platform::Place place_;  // record the place of ptr_.
-    size_t size_;                    // size of the memory block.
+    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
+    platform::Place place_;  // record the place of ptr_.
+    size_t size_;            // size of the memory block.
   };
 
+  template <typename T>
+  inline void CheckDims() const {
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "Tensor holds no memory. Call Tensor::mutable_data first.");
+    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
+                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                   "first to re-allocate memory.");
+  }
+
   std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
+  DDim dims_;
+  size_t offset_;  // marks the begin of tensor data area.
+  template <bool less, size_t i, typename... args>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
 };
 
 }  // namespace framework
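A hedged usage sketch of the Tensor API added above (CPU only; the shapes and values are illustrative):

```cpp
#include "paddle/framework/tensor.h"

void TensorDemo() {
  using namespace paddle::framework;
  Tensor t;
  float* p = t.mutable_data<float>(make_ddim({4, 3}), platform::CPUPlace());
  p[0] = 1.0f;
  // Slice shares the underlying allocation: rows [1, 3) start 1 * 3 floats
  // past the original buffer, tracked via offset_ rather than a copy.
  Tensor rows = t.Slice<float>(1, 3);
  // Here rows.data<float>() == t.data<float>() + 3.
}
```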
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 727d81f8d72e39ec564c42a48bf7ff64204adfff..84c6f0cf6558819440458688ca52b06c1cf11dd0 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -15,15 +15,28 @@
 #include <gtest/gtest.h>
 #include <string>
 
-TEST(Tensor, ASSERT) {
-  paddle::framework::Tensor cpu_tensor;
+TEST(Tensor, Dims) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor tt;
+  tt.set_dims(make_ddim({2, 3, 4}));
+  DDim dims = tt.dims();
+  ASSERT_EQ(arity(dims), 3);
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(i + 2, dims[i]);
+  }
+}
+
+TEST(Tensor, DataAssert) {
+  paddle::framework::Tensor src_tensor;
+
   bool caught = false;
   try {
-    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
+    src_tensor.data<double>();
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
-    std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
+    std::string msg =
+        "Tensor holds no memory. Call Tensor::mutable_data first.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
@@ -32,54 +45,168 @@ TEST(Tensor, ASSERT) {
   ASSERT_TRUE(caught);
 }
 
-/* mutable_data() is not tested at present
+/* The following tests are not available at present
    because Memory::Alloc() and Memory::Free() have not been ready.
-
+*/
 TEST(Tensor, MutableData) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   {
-    Tensor cpu_tensor;
+    Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
     EXPECT_NE(p1, nullptr);
-    // set cpu_tensor a new dim with large size
+    // set src_tensor a new dim with larger size
     // memory is supposed to be re-allocated
-    p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
-    // set cpu_tensor a new dim with same size
+    // set src_tensor a new dim with the same size
     // memory block is supposed to be unchanged
-    p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
     EXPECT_EQ(p1, p2);
-    // set cpu_tensor a new dim with smaller size
+    // set src_tensor a new dim with smaller size
     // memory block is supposed to be unchanged
-    p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
-
+#ifdef __CUDACC__
   {
-    Tensor gpu_tensor;
+    Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
    EXPECT_NE(p1, nullptr);
-    // set gpu_tensor a new dim with large size
+    // set src_tensor a new dim with larger size
     // memory is supposed to be re-allocated
-    p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
-    // set gpu_tensor a new dim with same size
+    // set src_tensor a new dim with the same size
     // memory block is supposed to be unchanged
-    p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
     EXPECT_EQ(p1, p2);
-    // set gpu_tensor a new dim with smaller size
+    // set src_tensor a new dim with smaller size
     // memory block is supposed to be unchanged
-    p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
     EXPECT_EQ(p1, p2);
   }
+#endif
+}
+
+TEST(Tensor, ShareDataFrom) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    // Try to share data from an uninitialized tensor
+    bool caught = false;
+    try {
+      dst_tensor.ShareDataFrom<float>(src_tensor);
+    } catch (EnforceNotMet err) {
+      caught = true;
+      std::string msg =
+          "Tensor holds no memory. Call Tensor::mutable_data first.";
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
+      }
+    }
+    ASSERT_TRUE(caught);
+
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    dst_tensor.ShareDataFrom<int>(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+
+#ifdef __CUDACC__
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+    dst_tensor.ShareDataFrom<int>(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+#endif
+}
+
+TEST(Tensor, Slice) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 3);
+    EXPECT_EQ(slice_dims[0], 2);
+    EXPECT_EQ(slice_dims[1], 3);
+    EXPECT_EQ(slice_dims[2], 4);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
+  }
+
+#ifdef __CUDACC__
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 2);
+    EXPECT_EQ(slice_dims[0], 4);
+    EXPECT_EQ(slice_dims[1], 9);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
+  }
+#endif
+}
+
+TEST(Tensor, CopyFrom) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor src_tensor;
+  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+  Tensor dst_tensor;
+  dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+  dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
 }
-*/
diff --git a/paddle/framework/tensor_types.h b/paddle/framework/tensor_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4bf27a377e828a56f9679e6698d314457d7caf0b
--- /dev/null
+++ b/paddle/framework/tensor_types.h
@@ -0,0 +1,67 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace framework {
+
+// Helper to define Tensor types given that the scalar is of type T.
+template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
+struct TTypes {
+  // Rank-<NDIMS> tensor of scalar type T.
+  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Tensor;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstTensor;
+
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  typedef Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
+      Eigen::Aligned>
+      Scalar;
+  typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
+                                                  Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      ConstScalar;
+
+  // Rank-1 tensor (vector) of scalar type T.
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Flat;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstFlat;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Vec;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstVec;
+
+  // Rank-2 tensor (matrix) of scalar type T.
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Matrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstMatrix;
+};
+
+}  // namespace framework
+}  // namespace paddle
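For orientation, a sketch of how these aliases pair with the `Tensor::matrix()` accessor added in tensor.h above; the shapes and values are illustrative:

```cpp
#include "paddle/framework/tensor.h"

void EigenDemo() {
  using namespace paddle::framework;
  Tensor t;
  t.mutable_data<float>(make_ddim({2, 3}), platform::CPUPlace());
  // TTypes<float>::Matrix is an Eigen::TensorMap over the Tensor's buffer,
  // so the Eigen expression below writes directly into t's memory.
  TTypes<float>::Matrix m = t.matrix<float>();
  m.setConstant(1.0f);
}
```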
a/paddle/function/NaiveConvOp.cpp b/paddle/function/NaiveConvOp.cpp index 4348f0f775e9442c50a3c45b9a8e6dad5c6b198d..e0692fa06d6e0c35cfa742ca3eac7fe2037b1a80 100644 --- a/paddle/function/NaiveConvOp.cpp +++ b/paddle/function/NaiveConvOp.cpp @@ -90,8 +90,7 @@ public: ConvFunctionBase::init(config); } - virtual void check(const BufferArgs& inputs, - const BufferArgs& outputs) override { + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { const TensorShape& input = inputs[0].shape(); const TensorShape& filter = inputs[1].shape(); const TensorShape& output = outputs[0].shape(); diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu index c0b947e224313abaf4fadfb8293dc78ca085ff84..d9dcc7d59d1e3c222f5a7ce448daa8d7edb6c978 100644 --- a/paddle/function/RowConvOpGpu.cu +++ b/paddle/function/RowConvOpGpu.cu @@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x, const real* w, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; - sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? + dy[yoff * width + xoff] : 0.0; __syncthreads(); if (tidy < (context - 1)) { yoff = yoff - context + 1; - sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? + dy[yoff * width + xoff] : 0.0; } __syncthreads(); @@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy, int yoff = start + j; // transpose - sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0; + sh_x[tidx][tidy] = (xoff < width && yoff < end) ? + x[yoff * width + xoff] : 0.0; __syncthreads(); for (int t = 0; t < context; t++) { - sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; + sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && + yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0; __syncthreads(); real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx]; @@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy, for (int i = tidy; i < context; i += blky) { sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0; } - + __syncthreads(); for (int i = 0; i < numSeq; ++i) { @@ -312,7 +317,7 @@ void RowConvGrad(const GpuMatrix& outG, dim3 dimBlock(32, 32); dim3 dimGrid(DIVUP(width, dimBlock.x), 1); real* dw = filterG.getData(); - if (contextLength <= 32) { + if (contextLength <= 32) { KeRowConvBwWeight<32, 32, 32> <<>> (dw, x, dy, starts, height, width, numSeq, contextLength); diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp index e8080c3d714b324f072a380f738b9764477dfe04..f0ec77a5d00333993427fb8d0bc938c884e50c95 100644 --- a/paddle/function/nnpack/NNPACKConvOp.cpp +++ b/paddle/function/nnpack/NNPACKConvOp.cpp @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include "paddle/function/ConvOp.h" DEFINE_bool(nnpack_allocate_outside, - false, + true, "Allocate and free workspace memory outside the NNPACK interface."); DEFINE_int32(nnpack_num_threads, 0, @@ -58,18 +58,10 @@ public: workspaceBuffer_ = nullptr; workspaceSize_ = 0; - threadpool_ = nullptr; - if (FLAGS_nnpack_num_threads) { - threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); - VLOG(3) << "Number of threads " - << pthreadpool_get_threads_count(threadpool_); - } + create_nnpack_threadpool(); } ~NNPACKConvFunction() { - if (threadpool_) { - pthreadpool_destroy(threadpool_); - } if (workspaceBuffer_) { free(workspaceBuffer_); } @@ -225,14 +217,25 @@ public: } } + static void create_nnpack_threadpool() { + if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) { + threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads); + VLOG(3) << "Number of threads " + << pthreadpool_get_threads_count(threadpool_); + } + } + private: nnp_convolution_algorithm algorithm_; nnp_convolution_transform_strategy transform_strategy_; void* workspaceBuffer_; size_t workspaceSize_; - pthreadpool_t threadpool_; + static pthreadpool_t threadpool_; }; +template +pthreadpool_t NNPACKConvFunction::threadpool_ = nullptr; + REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction); } // namespace paddle diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 40036762179ebb1495b90907f16b97e3c60c50d8..265dbb54933540ff8b0d1e2e2d985b4b7fa51ecd 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -205,10 +205,8 @@ public: hl_destroy_event(hlEvent_); hlEvent_ = NULL; } - if (batchData_) { - delete batchData_; - batchData_ = NULL; - } + delete batchData_; + batchData_ = NULL; } void setDataBatch(DataBatch* batchData) { batchData_ = batchData; } diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 2e839f640503b8f4e390fc87d9d59960dbc37f6e..cfa80a89365af5111746eec9599d16e37532a9f7 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -403,7 +403,7 @@ public: : layerName_(layerName) { addEvaluator(std::move(evaluator)); } - virtual void eval(const NeuralNetwork& nn) override { + void eval(const NeuralNetwork& nn) override { const LayerPtr& layer = nn.getLayer(layerName_); CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " << nn.getName(); diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 9a972466d66ba1417b2c31e66dc375b3da229aa8..9ddd449de7500f5682d59469328f06971c6e83bf 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -636,7 +636,7 @@ void lenToStarts(std::vector& starts) { } starts.back() = pos; } -} +} // namespace void RecurrentGradientMachine::calcSequenceStartPositions() { std::vector starts(commonSeqInfo_.size() + 1); diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 15e7411b5fde0fa3a532394cf7d0e8477ef052d0..bdae7e623ae0472d4fe5ef3a88fc1e93bbf1e52c 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec, dest[index[i]] = src[i]; } } -} +} // namespace void GatherAgentLayer::forwardIds(PassType passType) { IVectorPtr realId = 
realLayers_[0]->getOutputLabel(); diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 4b92b5d163ad107c0783beae45f8c936112fcccf..d5621412caee843e24a0d0c9b7096402765738c7 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -359,12 +359,11 @@ void Layer::backwardActivation() { /* Do error clipping */ if (config_.error_clipping_threshold() > 0.0f) { if (FLAGS_log_error_clipping) { - CpuVector outGradVec(0, nullptr); - outGradVec.subVecFrom( - output_.grad->getData(), 0, output_.grad->getElementCnt()); - real maxAbsGrad = outGradVec.getAbsMax(); + VectorPtr outGradVec = Vector::create( + output_.grad->getData(), output_.grad->getElementCnt(), useGpu_); + real maxAbsGrad = outGradVec->getAbsMax(); if (maxAbsGrad > config_.error_clipping_threshold()) { - real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize(); + real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize(); LOG(INFO) << " layer=" << config_.name() << " need clipping," << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad; } diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp index 7ce17a3207becb176a852a16fca52376009db9ee..4adaaef9838f0d178468af3af142031325bfc11d 100644 --- a/paddle/math/Storage.cpp +++ b/paddle/math/Storage.cpp @@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); }, StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {} StorageEngine::~StorageEngine() { - if (cpuAllocator_) { - delete cpuAllocator_; - } + delete cpuAllocator_; for (auto it : gpuAllocator_) { delete it; } diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt index 3943c3cfad31d13a00645aba6fc153d3d13da987..fac442cca56b81f56a750bd3b1c2c0911e79e468 100644 --- a/paddle/memory/CMakeLists.txt +++ b/paddle/memory/CMakeLists.txt @@ -1 +1,11 @@ add_subdirectory(detail) + +cc_library(memory SRCS memory.cc) + +cc_library(paddle_memory + DEPS + memory meta_data + meta_cache memory_block + buddy_allocator system_allocator) + +cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory) diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt index 72d3749ad789eca9a4b10944131171c0cf8dfe5a..b9c3fc31c1523abf3acbd116745bbf1596454aac 100644 --- a/paddle/memory/detail/CMakeLists.txt +++ b/paddle/memory/detail/CMakeLists.txt @@ -1,7 +1,15 @@ if(${WITH_GPU}) - nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) - nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info) else(${WITH_GPU}) - cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) - cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info) endif(${WITH_GPU}) + +cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator) + +cc_library(meta_data SRCS meta_data.cc) + +cc_library(meta_cache SRCS meta_cache.cc) + +cc_library(memory_block SRCS memory_block.cc) + +cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index ebe680f5eea4948339fb8c5584a5b9f5d71c752e..27c1b4033b53b059d38ed88694b20b429cbb4cce 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -12,22 +12,317 @@ See the License for the 
specific language governing permissions and limitations under the License. */ -#pragma once - #include "paddle/memory/detail/buddy_allocator.h" +#include "glog/logging.h" namespace paddle { namespace memory { namespace detail { -BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator) - : pool_size_(pool_size), - max_pools_(max_pools), - system_allocator_(system_allocator) { - PADDLE_ASSERT(pool_size > 0); - PADDLE_ASSERT(max_pools > 0); - PADDLE_ASSERT(system_allocator != nullptr); +BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, + size_t min_chunk_size, size_t max_chunk_size) + : min_chunk_size_(min_chunk_size), + max_chunk_size_(max_chunk_size), + cache_(system_allocator->UseGpu()), + system_allocator_(std::move(system_allocator)) {} + +BuddyAllocator::~BuddyAllocator() { + DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; + while (!pool_.empty()) { + auto block = static_cast(std::get<2>(*pool_.begin())); + DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ + << ")"; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + pool_.erase(pool_.begin()); + } +} + +inline size_t align(size_t size, size_t alignment) { + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +void* BuddyAllocator::Alloc(size_t unaligned_size) { + // adjust allocation alignment + size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_); + + // acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " + << size; + + // if the allocation is huge, send directly to the system allocator + if (size > max_chunk_size_) { + DLOG(INFO) << "Allocate from system allocator."; + return SystemAlloc(size); + } + + // query and allocate from the existing chunk + auto it = FindExistChunk(size); + + // refill the pool if failure + if (it == pool_.end()) { + it = RefillPool(); + // if still failure, fail fatally + if (it == pool_.end()) { + return nullptr; + } + } else { + DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); + } + + total_used_ += size; + total_free_ -= size; + + // split the allocation and return data for use + return reinterpret_cast(SplitToAlloc(it, size))->data(); +} + +void BuddyAllocator::Free(void* p) { + // Point back to metadata + auto block = static_cast(p)->metadata(); + + // Acquire the allocator lock + std::lock_guard lock(mutex_); + + DLOG(INFO) << "Free from address " << block; + + if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { + DLOG(INFO) << "Free directly from system allocator"; + system_allocator_->Free(block, block->total_size(cache_), + block->index(cache_)); + + // Invalidate GPU allocation from cache + cache_.invalidate(block); + + return; + } + + block->mark_as_free(cache_); + + total_used_ -= block->total_size(cache_); + total_free_ += block->total_size(cache_); + + // Trying to merge the right buddy + if (block->has_right_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); + + auto right_buddy = block->right_buddy(cache_); + + if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(right_buddy->index(cache_), + 
right_buddy->total_size(cache_), + right_buddy)); + + // merge its right buddy to the block + block->merge(cache_, right_buddy); + } + } + + // Trying to merge the left buddy + if (block->has_left_buddy(cache_)) { + DLOG(INFO) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); + + auto left_buddy = block->left_buddy(cache_); + + if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) { + // Take away right buddy from pool + pool_.erase(IndexSizeAddress(left_buddy->index(cache_), + left_buddy->total_size(cache_), left_buddy)); + + // merge the block to its left buddy + left_buddy->merge(cache_, block); + block = left_buddy; + } + } + + // Dumping this block into pool + DLOG(INFO) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; + pool_.insert( + IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); + + // Clean up if existing too much free memory + + // Prefer freeing fallback allocation first + CleanIdleFallBackAlloc(); + + // Free normal allocation + CleanIdleNormalAlloc(); +} + +size_t BuddyAllocator::Used() { return total_used_; } + +void* BuddyAllocator::SystemAlloc(size_t size) { + size_t index = 0; + void* p = system_allocator_->Alloc(index, size); + + DLOG(INFO) << "Allocated " << p << " from system allocator."; + + if (p == nullptr) return nullptr; + + static_cast(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index, + size, nullptr, nullptr); + + return static_cast(p)->data(); +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { +#ifndef PADDLE_ONLY_CPU + if (system_allocator_->UseGpu()) { + if ((total_used_ + total_free_) == 0) { + // Compute the maximum allocation size for the first allocation. + max_chunk_size_ = platform::GpuMaxChunkSize(); + } + } +#endif // PADDLE_ONLY_CPU + + // Allocate a new maximum sized block + size_t index = 0; + void* p = system_allocator_->Alloc(index, max_chunk_size_); + + if (p == nullptr) return pool_.end(); + + DLOG(INFO) << "Creating and inserting new block " << p + << " from system allocator"; + + static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, + max_chunk_size_, nullptr, nullptr); + + // gpu fallback allocation + if (system_allocator_->UseGpu() && + static_cast(p)->index(cache_) == 1) { + fallback_alloc_count_++; + } + + total_free_ += max_chunk_size_; + + // dump the block into pool + return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first; +} + +BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) { + size_t index = 0; + + while (1) { + auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr)); + + // no match chunk memory + if (it == pool_.end()) return it; + + if (std::get<0>(*it) > index) { + // find suitable one + if (std::get<1>(*it) >= size) { + return it; + } + // update and continue + index = std::get<0>(*it); + continue; + } + return it; + } +} + +void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, + size_t size) { + auto block = static_cast(std::get<2>(*it)); + pool_.erase(it); + + DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; + block->split(cache_, size); + + DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; + block->set_type(cache_, MemoryBlock::ARENA_CHUNK); + + // the rest of memory if exist + if (block->has_right_buddy(cache_)) { + if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { + DLOG(INFO) << "Insert right block (" << 
block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; + + pool_.insert( + IndexSizeAddress(block->right_buddy(cache_)->index(cache_), + block->right_buddy(cache_)->total_size(cache_), + block->right_buddy(cache_))); + } + } + + return block; +} + +void BuddyAllocator::CleanIdleFallBackAlloc() { + // If fallback allocation does not exist, return directly + if (!fallback_alloc_count_) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + // If no GPU fallback allocator, return + if (!system_allocator_->UseGpu() || block->index(cache_) == 0) { + return; + } + + DLOG(INFO) << "Return block " << block << " to fallback allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + fallback_alloc_count_--; + + // If no fall allocation exists, return directly + if (!fallback_alloc_count_) return; + } +} + +void BuddyAllocator::CleanIdleNormalAlloc() { + auto shall_free_alloc = [&]() -> bool { + // free all fallback allocations + if (fallback_alloc_count_ > 0) { + return true; + } + // keep 2x overhead if we haven't fallen back + if ((total_used_ + max_chunk_size_) * 2 < total_free_) { + return true; + } + return false; + }; + + if (!shall_free_alloc()) return; + + for (auto pool = pool_.rbegin(); pool != pool_.rend();) { + // If free memory block less than max_chunk_size_, return directly + if (std::get<1>(*pool) < max_chunk_size_) return; + + MemoryBlock* block = static_cast(std::get<2>(*pool)); + + DLOG(INFO) << "Return block " << block << " to base allocator."; + + system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); + cache_.invalidate(block); + + pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base())); + + total_free_ -= max_chunk_size_; + + if (!shall_free_alloc()) return; + } } } // namespace detail diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h index 82e6aaedc719966b4074449ce1ef7193c73dc265..4fa3fb0ee5f826d2b084c0ba184c505aee3acc48 100644 --- a/paddle/memory/detail/buddy_allocator.h +++ b/paddle/memory/detail/buddy_allocator.h @@ -14,9 +14,16 @@ #pragma once +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" #include +#include +#include #include namespace paddle { @@ -25,61 +32,80 @@ namespace detail { class BuddyAllocator { public: - BuddyAllocator(size_t pool_size, size_t max_pools, - SystemAllocator* system_allocator); + BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size, + size_t max_chunk_size); + ~BuddyAllocator(); - void* Alloc(size_t size); + public: + void* Alloc(size_t unaligned_size); void Free(void*); size_t Used(); + public: + // Disable copy and assignment + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; + private: - struct Block { - size_t size_; - Block* left_; // left buddy - Block* right_; // right buddy - }; + // Tuple (allocator index, memory size, memory address) + using IndexSizeAddress = std::tuple; + // Each 
element in PoolSet is a free allocation + using PoolSet = std::set<IndexSizeAddress>; - // Initially, there is only one pool. If a Alloc founds not enough - // memory from that pool, and there has not been max_num_pools_, - // create a new pool by calling system_allocator_.Alloc(pool_size_). - std::vector pools_; + /*! \brief Allocate fixed-size memory from the system */ + void* SystemAlloc(size_t size); - size_t pool_size_; // the size of each pool; - size_t max_num_pools_; // the size of all pools; + /*! \brief If no existing chunk is suitable, refill the pool */ + PoolSet::iterator RefillPool(); - SystemAllocator* system_allocator_; + /** + * \brief Find a suitable chunk in the existing pool and split + * it into left and right buddies + * + * \param it the iterator into the pool + * \param size the size of the allocation + * + * \return the left buddy address + */ + void* SplitToAlloc(PoolSet::iterator it, size_t size); - std::mutex mutex_; + /*! \brief Find an existing chunk that can serve the allocation */ + PoolSet::iterator FindExistChunk(size_t size); - // Disable copy and assignment. - BuddyAllocator(const BuddyAllocator&) = delete; - BuddyAllocator& operator=(const BuddyAllocator&) = delete; -}; + /*! \brief Clean up idle fallback allocations */ + void CleanIdleFallBackAlloc(); + + /*! \brief Clean up idle normal allocations */ + void CleanIdleNormalAlloc(); -BuddyAllocator* GetCPUBuddyAllocator() { - static BuddyAllocator* a = nullptr; - if (a == nullptr) { - a = new BuddyAllocator(); - } - return a; -} - -#ifndef PADDLE_ONLY_CPU // The following code are for CUDA. - -BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { - static BuddyAllocator** as = NULL; - if (as == NULL) { - int gpu_num = platform::GetDeviceCount(); - as = new BuddyAllocator*[gpu_num]; - for (int gpu = 0; gpu < gpu_num; gpu++) { - as[gpu] = new BuddyAllocator(); - } - } - return as[gpu_id]; -} - -#endif // PADDLE_ONLY_CPU + private: + size_t total_used_ = 0; // the total size of used memory + size_t total_free_ = 0; // the total size of free memory + + size_t min_chunk_size_; // the minimum size of each chunk + size_t max_chunk_size_; // the maximum size of each chunk + + private: + /** + * \brief The set of free chunks + * + * \note Only free chunks are stored in the pool + */ + PoolSet pool_; + + /*! Record the fallback allocation count for auto-scaling */ + size_t fallback_alloc_count_ = 0; + + private: + /*! Unify the metadata format between GPU and CPU allocations */ + MetadataCache cache_; + + private: + /*! Allocate CPU/GPU memory from the system */ + SystemAllocator* system_allocator_; + std::mutex mutex_; +}; } // namespace detail } // namespace memory
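Because PoolSet orders its (index, size, address) tuples lexicographically, FindExistChunk can locate a best-fit free chunk with a single lower_bound: nullptr sorts before any real address, so the probe tuple lands on the first chunk of the given index whose size is at least the request. A small self-contained sketch of that lookup (the pool contents here are made-up values):

```cpp
#include <cstdio>
#include <set>
#include <tuple>

using IndexSizeAddress = std::tuple<size_t, size_t, void*>;

int main() {
  // A toy pool: two free chunks from allocator index 0 plus one fallback
  // chunk (index 1), keyed the same way as BuddyAllocator's PoolSet.
  std::set<IndexSizeAddress> pool;
  pool.insert(IndexSizeAddress(0, 4096, reinterpret_cast<void*>(0x1000)));
  pool.insert(IndexSizeAddress(0, 16384, reinterpret_cast<void*>(0x2000)));
  pool.insert(IndexSizeAddress(1, 8192, reinterpret_cast<void*>(0x3000)));

  // Probe for 8192 bytes from allocator 0. Tuples compare lexicographically
  // and nullptr sorts first, so this lands on the 16384-byte chunk.
  auto it = pool.lower_bound(IndexSizeAddress(0, 8192, nullptr));
  std::printf("index %zu, size %zu\n", std::get<0>(*it), std::get<1>(*it));
  return 0;
}
```

diff --git a/paddle/memory/detail/memory_block.cc b/paddle/memory/detail/memory_block.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc40993208323f1f5d18103165c8835b5f829613 --- /dev/null +++ b/paddle/memory/detail/memory_block.cc @@ -0,0 +1,157 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.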
*/ + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/meta_data.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy) { + cache.store(this, Metadata(t, index, size - sizeof(Metadata), size, + static_cast(left_buddy), + static_cast(right_buddy))); +} + +MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const { + return cache.load(this).type; +} + +size_t MemoryBlock::size(MetadataCache& cache) const { + return cache.load(this).size; +} + +size_t MemoryBlock::total_size(MetadataCache& cache) const { + return cache.load(this).total_size; +} + +MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const { + return cache.load(this).left_buddy; +} + +MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const { + return cache.load(this).right_buddy; +} + +void MemoryBlock::split(MetadataCache& cache, size_t size) { + // make sure the split fits + PADDLE_ASSERT(total_size(cache) >= size); + + // bail out if there is no room for another partition + if (total_size(cache) - size <= sizeof(Metadata)) { + return; + } + + // find the position of the split + void* right_partition = reinterpret_cast(this) + size; + + size_t remaining_size = total_size(cache) - size; + + // Add the new block as a buddy + auto metadata = cache.load(this); + + // Write the metadata for the new block + auto new_block_right_buddy = metadata.right_buddy; + + cache.store( + static_cast(right_partition), + Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata), + remaining_size, this, new_block_right_buddy)); + + metadata.right_buddy = static_cast(right_partition); + metadata.size = size - sizeof(Metadata); + metadata.total_size = size; + + cache.store(this, metadata); + + // Write metadata for the new block's right buddy + if (new_block_right_buddy != nullptr) { + auto buddy_metadata = cache.load(new_block_right_buddy); + + buddy_metadata.left_buddy = static_cast(right_partition); + + cache.store(new_block_right_buddy, buddy_metadata); + } +} + +void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) { + // only free blocks can be merged + PADDLE_ASSERT(type(cache) == FREE_CHUNK); + PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK); + + auto metadata = cache.load(this); + + // link this->buddy's buddy + metadata.right_buddy = right_buddy->right_buddy(cache); + + // link buddy's buddy -> this + if (metadata.right_buddy != nullptr) { + auto buddy_metadata = cache.load(metadata.right_buddy); + + buddy_metadata.left_buddy = this; + + cache.store(metadata.right_buddy, buddy_metadata); + } + + metadata.size += right_buddy->total_size(cache); + metadata.total_size += right_buddy->total_size(cache); + + cache.store(this, metadata); + cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr)); +} + +void MemoryBlock::mark_as_free(MetadataCache& cache) { + // check for double free or corruption + PADDLE_ASSERT(type(cache) != FREE_CHUNK); + PADDLE_ASSERT(type(cache) != INVALID_CHUNK); + + set_type(cache, FREE_CHUNK); +} + +void MemoryBlock::set_type(MetadataCache& cache, Type t) { + auto metadata = cache.load(this); + + metadata.type = t; + + cache.store(this, metadata); +} + +bool MemoryBlock::has_left_buddy(MetadataCache& cache) const { + return left_buddy(cache) != nullptr; +} + +bool 
MemoryBlock::has_right_buddy(MetadataCache& cache) const { + return right_buddy(cache) != nullptr; +} + +size_t MemoryBlock::index(MetadataCache& cache) const { + return cache.load(this).index; +} + +void* MemoryBlock::data() const { + return const_cast(reinterpret_cast(this)) + 1; +} + +MemoryBlock* MemoryBlock::metadata() const { + return const_cast(reinterpret_cast( + reinterpret_cast(this) - 1)); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/memory_block.h b/paddle/memory/detail/memory_block.h new file mode 100644 index 0000000000000000000000000000000000000000..a5168b519f3a3747f34ef2ea7b87d72dce70064d --- /dev/null +++ b/paddle/memory/detail/memory_block.h @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace memory { +namespace detail { + +// Forward Declarations +class MetadataCache; + +/*! \brief A class used to interpret the contents of a memory block */ +class MemoryBlock { + public: + enum Type { + FREE_CHUNK, // memory is free and idle + ARENA_CHUNK, // memory is being occupied + HUGE_CHUNK, // memory is out of management + INVALID_CHUNK // memory is invalid + }; + + public: + void init(MetadataCache& cache, Type t, size_t index, size_t size, + void* left_buddy, void* right_buddy); + + public: + /*! \brief The type of the allocation */ + Type type(MetadataCache& cache) const; + + /*! \brief The size of the data region */ + size_t size(MetadataCache& cache) const; + + /*! \brief An index to track the allocator */ + size_t index(MetadataCache& cache) const; + + /*! \brief The total size of the block */ + size_t total_size(MetadataCache& cache) const; + + /*! \brief Check the left buddy of the block */ + bool has_left_buddy(MetadataCache& cache) const; + + /*! \brief Check the right buddy of the block */ + bool has_right_buddy(MetadataCache& cache) const; + + /*! \brief Get the left buddy */ + MemoryBlock* left_buddy(MetadataCache& cache) const; + + /*! \brief Get the right buddy */ + MemoryBlock* right_buddy(MetadataCache& cache) const; + + public: + /*! \brief Split the allocation into left/right blocks */ + void split(MetadataCache& cache, size_t size); + + /*! \brief Merge left and right blocks together */ + void merge(MetadataCache& cache, MemoryBlock* right_buddy); + + /*! \brief Mark the allocation as free */ + void mark_as_free(MetadataCache& cache); + + /*! \brief Change the type of the allocation */ + void set_type(MetadataCache& cache, Type t); + + public: + /*! \brief Get a pointer to the memory block's data */ + void* data() const; + + /*! 
\brief Get a pointer to the memory block's metadata */ + MemoryBlock* metadata() const; + + public: + static size_t overhead(); +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.cc b/paddle/memory/detail/meta_cache.cc new file mode 100644 index 0000000000000000000000000000000000000000..30ff80e7bac0b595fe60aeab0a3c59f4e23eae2d --- /dev/null +++ b/paddle/memory/detail/meta_cache.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_cache.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/platform/assert.h" + +namespace paddle { +namespace memory { +namespace detail { + +MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {} + +Metadata MetadataCache::load(const MemoryBlock* block) { + if (uses_gpu_) { + auto existing_metadata = cache_.find(block); + PADDLE_ASSERT(existing_metadata->second.check_guards()); + return existing_metadata->second; + } else { + PADDLE_ASSERT(reinterpret_cast(block)->check_guards()); + return *reinterpret_cast(block); + } +} + +void MetadataCache::store(MemoryBlock* block, + const Metadata& original_metadata) { + auto metadata = original_metadata; + + metadata.update_guards(); + + if (uses_gpu_) { + cache_[block] = metadata; + } else { + *reinterpret_cast(block) = metadata; + } +} + +void MetadataCache::invalidate(MemoryBlock* block) { + if (uses_gpu_) { + cache_.erase(block); + } +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_cache.h b/paddle/memory/detail/meta_cache.h new file mode 100644 index 0000000000000000000000000000000000000000..ca0789779e273fb71c3d6282c0a921cda2d776cc --- /dev/null +++ b/paddle/memory/detail/meta_cache.h @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +/** + * \brief A cache for accessing memory block meta-data that may be expensive + * to access directly. + * + * \note This class exists to unify the metadata format between GPU and CPU + * allocations. It should be removed when the CPU can access all GPU + * allocations directly via UVM. 
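+ * + *       Concretely (see meta_cache.cc above): metadata for CPU blocks is + *       stored in place at the head of each block, while metadata for GPU + *       blocks lives in this host-side map, since the host cannot cheaply + *       dereference device pointers.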
+ */ +class MetadataCache { + public: + MetadataCache(bool uses_gpu); + + public: + /*! \brief Load the associated metadata for the specified memory block. */ + Metadata load(const MemoryBlock*); + + /*! \brief Store the associated metadata for the specified memory block. */ + void store(MemoryBlock*, const Metadata&); + + /*! \brief Indicate that the specified metadata will no longer be used. */ + void invalidate(MemoryBlock*); + + public: + MetadataCache(const MetadataCache&) = delete; + MetadataCache& operator=(const MetadataCache&) = delete; + + private: + bool uses_gpu_; + + private: + typedef std::unordered_map MetadataMap; + + private: + MetadataMap cache_; +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.cc b/paddle/memory/detail/meta_data.cc new file mode 100644 index 0000000000000000000000000000000000000000..70c5c1f439e84ec33cf0507beae33f9cdfa51727 --- /dev/null +++ b/paddle/memory/detail/meta_data.cc @@ -0,0 +1,70 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/meta_data.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, + MemoryBlock* l, MemoryBlock* r) + : type(t), + index(i), + size(s), + total_size(ts), + left_buddy(l), + right_buddy(r) {} + +Metadata::Metadata() + : type(MemoryBlock::INVALID_CHUNK), + index(0), + size(0), + total_size(0), + left_buddy(nullptr), + right_buddy(nullptr) {} + +template +inline void hash_combine(std::size_t& seed, const T& v) { + std::hash hasher; + seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2); +} + +inline size_t hash(const Metadata* metadata, size_t initial_seed) { + size_t seed = initial_seed; + + hash_combine(seed, (size_t)metadata->type); + hash_combine(seed, metadata->index); + hash_combine(seed, metadata->size); + hash_combine(seed, metadata->total_size); + hash_combine(seed, metadata->left_buddy); + hash_combine(seed, metadata->right_buddy); + + return seed; +} + +void Metadata::update_guards() { + guard_begin = hash(this, 1); + guard_end = hash(this, 2); +} + +bool Metadata::check_guards() const { + return guard_begin == hash(this, 1) && guard_end == hash(this, 2); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/meta_data.h b/paddle/memory/detail/meta_data.h new file mode 100644 index 0000000000000000000000000000000000000000..628cf1f2e347e288d1bf34c14c7b2f13a28d3662 --- /dev/null +++ b/paddle/memory/detail/meta_data.h @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/memory_block.h" + +#include + +namespace paddle { +namespace memory { +namespace detail { + +class Metadata { + public: + Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l, + MemoryBlock* r); + Metadata(); + + public: + /*! \brief Update the guards when metadata is changed */ + void update_guards(); + + /*! \brief Check consistency to previous modification */ + bool check_guards() const; + + public: + // TODO(gangliao): compress this + // clang-format off + size_t guard_begin = 0; + MemoryBlock::Type type = MemoryBlock::INVALID_CHUNK; + size_t index = 0; + size_t size = 0; + size_t total_size = 0; + MemoryBlock* left_buddy = nullptr; + MemoryBlock* right_buddy = nullptr; + size_t guard_end = 0; + // clang-format on +}; + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc index 50bec926f83dee8a4343d0b16aeb088f9d2a4871..1579174b1a6ff08824629d833d01411cff651f48 100644 --- a/paddle/memory/detail/system_allocator.cc +++ b/paddle/memory/detail/system_allocator.cc @@ -13,76 +13,128 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #include // for malloc and free #include // for mlock and munlock #include "gflags/gflags.h" -#include "paddle/platform/assert.h" -#include "paddle/platform/cuda.h" // If use_pinned_memory is true, CPUAllocator calls mlock, which // returns pinned and locked memory as staging areas for data exchange // between host and device. Allocates too much would reduce the amount // of memory available to the system for paging. So, by default, we // should set false to use_pinned_memory. -DEFINE_bool(use_pinned_memory, false, - "If set, allocate cpu/gpu pinned memory."); +DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory."); namespace paddle { namespace memory { namespace detail { -void* CPUAllocator::Alloc(size_t size) { +void* CPUAllocator::Alloc(size_t& index, size_t size) { // According to http://www.cplusplus.com/reference/cstdlib/malloc/, // malloc might not return nullptr if size is zero, but the returned // pointer shall not be dereferenced -- so we make it nullptr. 
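 // Note: the `index` out-parameter records how a block was obtained (0 for + // plain malloc'ed memory, 1 for mlock'ed pages) so that Free() below can + // release it the same way; GPUAllocator reuses the convention to tell + // cudaMalloc'ed memory apart from its cudaMallocHost fallback.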
if (size <= 0) return nullptr; + index = 0; // unlock memory + void* p = malloc(size); - if (p != nullptr && FLAGS_use_pinned_memory) { - mlock(p, size); + + if (p != nullptr) { + if (FLAGS_use_pinned_memory) { + index = 1; + mlock(p, size); // lock memory + } } + return p; } -void CPUAllocator::Free(void* p, size_t size) { - if (p != nullptr && FLAGS_use_pinned_memory) { +void CPUAllocator::Free(void* p, size_t size, size_t index) { + if (p != nullptr && index == 1) { munlock(p, size); } free(p); } +bool CPUAllocator::UseGpu() const { return false; } + #ifndef PADDLE_ONLY_CPU -void* GPUAllocator::Alloc(size_t size) { +void* GPUAllocator::Alloc(size_t& index, size_t size) { // CUDA documentation doesn't explain if cudaMalloc returns nullptr // if size is 0. We just make sure it does. - if (size <= 0) { - return nullptr; - } + if (size <= 0) return nullptr; + size_t available = 0; + size_t capacity = 0; + paddle::platform::GpuMemoryUsage(available, capacity); + + // Reserve memory for page tables, etc. + size_t reserving = capacity - paddle::platform::GpuMaxAllocSize(); + size_t usable = available > reserving ? available - reserving : 0; + + // If remaining size no less than expected size, using general + // cudaMalloc to allocate GPU memory. void* p = 0; - cudaError_t result = - FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); - if (result != cudaSuccess) { - cudaGetLastError(); // clear error if there is any. + if (size <= usable) { + cudaError_t result = cudaMalloc(&p, size); + if (result == cudaSuccess) { + index = 0; + gpu_alloc_size_ += size; + return p; + } + } + + // If remaining size less than expected size or cudaMalloc failed, + // cudaMallocHost will be considered as a fallback allocator. + // + // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size + // of host fallback allocation. Allocates too much would reduce + // the amount of memory available to the underlying system for paging. + usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_; + + if (size > usable) return nullptr; + + cudaError_t result = cudaMallocHost(&p, size); + if (result == cudaSuccess) { + index = 1; + fallback_alloc_size_ += size; + return p; } - return result == cudaSuccess ? p : nullptr; + + return nullptr; } -void GPUAllocator::Free(void* p, size_t size) { +void GPUAllocator::Free(void* p, size_t size, size_t index) { + cudaError_t err; + + if (index == 0) { + PADDLE_ASSERT(gpu_alloc_size_ >= size); + gpu_alloc_size_ -= size; + err = cudaFree(p); + } else { + PADDLE_ASSERT(fallback_alloc_size_ >= size); + fallback_alloc_size_ -= size; + err = cudaFreeHost(p); + } + // Purposefully allow cudaErrorCudartUnloading, because // that is returned if you ever call cudaFree after the // driver has already shutdown. This happens only if the // process is terminating, in which case we don't care if // cudaFree succeeds. - cudaError_t err = FLAGS_use_pinned_memory ? 
cudaFreeHost(p) : cudaFree(p); if (err != cudaErrorCudartUnloading) { - platform::throw_on_error(err, "cudaFree{Host} failed"); + platform::throw_on_error(err, + "cudaFree{Host} failed in GPUAllocator::Free."); } } +bool GPUAllocator::UseGpu() const { return true; } + #endif // PADDLE_ONLY_CPU } // namespace detail diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h index 184b383f7f78244fa6632a3bffb1a0a78b3aa664..82ba322e057575c460b1d51d719c9b0fa459273e 100644 --- a/paddle/memory/detail/system_allocator.h +++ b/paddle/memory/detail/system_allocator.h @@ -20,31 +20,36 @@ namespace paddle { namespace memory { namespace detail { -// SystemAllocator is the parent class of CPUAllocator and -// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* -// pointing to the underlying system allocator. An alternative to -// this class hierarchy is to pass a system allocator class to -// BuddyAllocator as a template parameter. This approach makes -// BuddyAllocator a class template, and it's very complicated -// algorithm would make the buddy_allocator.h messy. +/** + * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator. + * A BuddyAllocator object uses a SystemAllocator* pointing to the + * underlying system allocator. + */ class SystemAllocator { public: virtual ~SystemAllocator() {} - virtual void* Alloc(size_t size) = 0; - virtual void Free(void* p, size_t size) = 0; + virtual void* Alloc(size_t& index, size_t size) = 0; + virtual void Free(void* p, size_t size, size_t index) = 0; + virtual bool UseGpu() const = 0; }; class CPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; }; #ifndef PADDLE_ONLY_CPU class GPUAllocator : public SystemAllocator { public: - virtual void* Alloc(size_t size); - virtual void Free(void* p, size_t size); + virtual void* Alloc(size_t& index, size_t size); + virtual void Free(void* p, size_t size, size_t index); + virtual bool UseGpu() const; + + private: + size_t gpu_alloc_size_ = 0; + size_t fallback_alloc_size_ = 0; }; #endif // PADDLE_ONLY_CPU diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc index 9bd5706a4e4d1546a8c879ebbac0f3349c9d59f6..ba44e06ddb68e92e4086a8006b868557b0c89b50 100644 --- a/paddle/memory/detail/system_allocator_test.cc +++ b/paddle/memory/detail/system_allocator_test.cc @@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory); void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { bool freed = false; { - void* p = a.Alloc(size); + size_t index; + void* p = a.Alloc(index, size); if (size > 0) { EXPECT_NE(p, nullptr); } else { @@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { int* i = static_cast(p); std::shared_ptr ptr(i, [&](void* p) { freed = true; - a.Free(p, size); + a.Free(p, size, index); }); } EXPECT_TRUE(freed); @@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) { } #ifndef PADDLE_ONLY_CPU -TEST(GPUAllocator, NoStaging) { - FLAGS_use_pinned_memory = false; - paddle::memory::detail::GPUAllocator a; - TestAllocator(a, 2048); - TestAllocator(a, 0); -} -TEST(GPUAllocator, Staging) { - FLAGS_use_pinned_memory = true; +TEST(GPUAllocator, Alloc) { paddle::memory::detail::GPUAllocator a; TestAllocator(a, 2048); TestAllocator(a, 0); diff --git 
a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 0d123d99e234a378ee64850eebacece223e2b121..df3d57d629184d28fd42130df9b020a7b52ade72 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -17,43 +17,67 @@ limitations under the License. */ #include "paddle/memory/detail/system_allocator.h" #include "paddle/platform/assert.h" -#include - namespace paddle { namespace memory { -void* Alloc(platform::Place pl, size_t size) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get<platform::GPUPlace>(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); +detail::BuddyAllocator* GetCPUBuddyAllocator() { + static detail::BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new detail::BuddyAllocator(new detail::CPUAllocator, + platform::CpuMinChunkSize(), + platform::CpuMaxChunkSize()); } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Alloc(size); + return a; } -void Free(paddle::platform::Place pl, void* p) { -#ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get<platform::GPUPlace>(pl).device; - detail::GetGPUBuddyAllocator(gpu_id)->Free(p); - } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - detail::GetCPUBuddyAllocator()->Free(p); +template <> +void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} + +template <> +void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) { + GetCPUBuddyAllocator()->Free(p); +} + +template <> +size_t Used<platform::CPUPlace>(platform::CPUPlace place) { + return GetCPUBuddyAllocator()->Used(); } -size_t Used(paddle::platform::Place pl) { #ifndef PADDLE_ONLY_CPU - if (paddle::platform::is_gpu_place(pl)) { - size_t gpu_id = boost::get<platform::GPUPlace>(pl).device; - return detail::GetGPUBuddyAllocator(gpu_id)->Used(); + +detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static detail::BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetDeviceCount(); + as = new detail::BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + platform::SetDeviceId(gpu); + as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator, + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); + } } -#endif // PADDLE_ONLY_CPU - PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); - return detail::GetCPUBuddyAllocator()->Used(); + return as[gpu_id]; +} + +template <> +void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) { + return GetGPUBuddyAllocator(place.device)->Alloc(size); +} + +template <> +void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); +} + +template <> +size_t Used<platform::GPUPlace>(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } +#endif // PADDLE_ONLY_CPU + } // namespace memory } // namespace paddle
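Together with the templated declarations in memory.h below, this gives a place-typed allocation API: the buddy allocators are created lazily on first use and are intentionally never destroyed. A hedged sketch of the intended call pattern, written against the signatures introduced in this patch (not compiled here):

```cpp
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"

int main() {
  paddle::platform::CPUPlace cpu;
  // The request is rounded up internally to the minimum chunk size and a
  // Metadata header is prepended, so Used() grows by more than 1024.
  void* p = paddle::memory::Alloc(cpu, 1024);
  size_t used = paddle::memory::Used(cpu);
  paddle::memory::Free(cpu, p);
  return (p != nullptr && used > 0) ? 0 : 1;
}
```

diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index a33092bade65e6df0faee226a8967c9fc9caa032..2d6f4fd2a08ee0039647d276476263d0f8d00329 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -19,9 +19,14 @@ limitations under the License.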
*/ namespace paddle { namespace memory { -void* Alloc(paddle::platform::Place, size_t); -void Free(paddle::platform::Place, void*); -size_t Used(paddle::platform::Place); +template <typename Place> +void* Alloc(Place, size_t); + +template <typename Place> +void Free(Place, void*); + +template <typename Place> +size_t Used(Place); } // namespace memory } // namespace paddle diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..53cc63a098d0802479e3a371717adb7596c249ed --- /dev/null +++ b/paddle/memory/memory_test.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" +#include "paddle/memory/detail/memory_block.h" +#include "paddle/memory/detail/meta_data.h" + +#include "paddle/platform/cpu_info.h" +#include "paddle/platform/gpu_info.h" +#include "paddle/platform/place.h" + +#include <gtest/gtest.h> +#include <unordered_map> + +inline bool is_aligned(void const *p) { + return 0 == (reinterpret_cast<size_t>(p) & 0x3); +} + +size_t align(size_t size, paddle::platform::CPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::CpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ? size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, CPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::CPUPlace cpu; + p = paddle::memory::Alloc(cpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(cpu, p); +} + +TEST(BuddyAllocator, CPUMultAlloc) { + paddle::platform::CPUPlace cpu; + + std::unordered_map<void *, size_t> ps; + + size_t total_size = paddle::memory::Used(cpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(cpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(size, cpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(cpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(cpu) == total_size) continue; + + size_t aligned_size = align(p.second, cpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(cpu)); + } +} + +#ifndef PADDLE_ONLY_CPU + +size_t align(size_t size, paddle::platform::GPUPlace place) { + size += sizeof(paddle::memory::detail::Metadata); + size_t alignment = paddle::platform::GpuMinChunkSize(); + size_t remaining = size % alignment; + return remaining == 0 ?
size : size + (alignment - remaining); +} + +TEST(BuddyAllocator, GPUAllocation) { + void *p = nullptr; + + EXPECT_EQ(p, nullptr); + + paddle::platform::GPUPlace gpu(0); + p = paddle::memory::Alloc(gpu, 4096); + + EXPECT_NE(p, nullptr); + + paddle::memory::Free(gpu, p); +} + +TEST(BuddyAllocator, GPUMultAlloc) { + paddle::platform::GPUPlace gpu; + + std::unordered_map ps; + + size_t total_size = paddle::memory::Used(gpu); + EXPECT_EQ(total_size, 0UL); + + for (auto size : + {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) { + ps[paddle::memory::Alloc(gpu, size)] = size; + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(size, gpu); + total_size += aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } + + for (auto p : ps) { + EXPECT_EQ(is_aligned(p.first), true); + paddle::memory::Free(gpu, p.first); + + // Buddy Allocator doesn't manage too large memory chunk + if (paddle::memory::Used(gpu) == total_size) continue; + + size_t aligned_size = align(p.second, gpu); + total_size -= aligned_size; + EXPECT_EQ(total_size, paddle::memory::Used(gpu)); + } +} + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f47c3a42083f289d6c99fe6df62e3478e0363e31 --- /dev/null +++ b/paddle/operators/CMakeLists.txt @@ -0,0 +1,49 @@ +function(op_library TARGET) + # op_library is a function to create op library. The interface is same as + # cc_library. But it handle split GPU/CPU code and link some common library + # for ops. + set(cc_srcs) + set(cu_srcs) + set(op_common_deps operator op_registry) + set(options "") + set(oneValueArgs "") + set(multiValueArgs SRCS DEPS) + cmake_parse_arguments(op_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + foreach(src ${op_library_SRCS}) + if (${src} MATCHES ".*\\.cu$") + list(APPEND cu_srcs ${src}) + elseif(${src} MATCHES ".*\\.cc$") + list(APPEND cc_srcs ${src}) + else() + message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu") + endif() + endforeach() + + list(LENGTH cc_srcs cc_srcs_len) + if (${cc_srcs_len} EQUAL 0) + message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file") + endif() + + list(LENGTH cu_srcs cu_srcs_len) + if (${cu_srcs_len} EQUAL 0) + message(WARNING "The op library ${TARGET} not support GPU!") + endif() + + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + else() + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) + endif() +endfunction() + +op_library(add_op SRCS add_op.cc add_op.cu) +cc_test(add_op_test SRCS add_op_test.cc DEPS add_op) + +op_library(mul_op SRCS mul_op.cc mul_op.cu) +op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc) +op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc) +op_library(softmax_op SRCS softmax_op.cc softmax_op.cu) diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..41d044cdb72b5fb2a7f8654e8ad103778e0857d1 --- /dev/null +++ b/paddle/operators/add_op.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+
+#include "paddle/operators/add_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+class AddOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two");
+    PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one");
+    PADDLE_ENFORCE(
+        inputs[0] != nullptr && inputs[1] != nullptr && outputs[0] != nullptr,
+        "Inputs/Outputs of AddOp must all be set");
+    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
+                   "The dimensions of the two inputs of AddOp must be the same.");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+
+class AddOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of add op");
+    AddInput("Y", "The second input of add op");
+    AddOutput("Out", "The output of add op");
+    AddComment(R"DOC(
+Two Element Add Operator.
+
+The equation is: Out = X + Y
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
+typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>
+    AddKernel_CPU_float;
+REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..0edf142ee4e5f359ea14be02dbf3f7f8855f6db1
--- /dev/null
+++ b/paddle/operators/add_op.cu
@@ -0,0 +1,6 @@
+#include "paddle/operators/add_op.h"
+#include "paddle/framework/op_registry.h"
+
+typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float;
+REGISTER_OP_GPU_KERNEL(add_two,
+                       AddKernel_GPU_float);
\ No newline at end of file
diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..e08b3fb18775e2536a13bc838f40472c5c3e7ff7
--- /dev/null
+++ b/paddle/operators/add_op.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
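The typedef before REGISTER_OP_CPU_KERNEL above is not cosmetic: the C preprocessor treats the comma inside AddKernel<CPUPlace, float> as an argument separator, so the template name cannot be passed to a macro directly. A minimal reproduction of the pitfall (the REGISTER macro here is a stand-in, not the framework's real one):

    #include <cstdio>

    // A stand-in for REGISTER_OP_CPU_KERNEL: any macro sees the comma
    // inside AddKernel<CPUPlace, float> as an argument separator.
    #define REGISTER(kernel) static int registered = (std::puts(#kernel), 0);

    template <typename Place, typename T> struct AddKernel {};
    struct CPUPlace {};

    // REGISTER(AddKernel<CPUPlace, float>)  // error: macro gets two arguments
    typedef AddKernel<CPUPlace, float> AddKernel_CPU_float;
    REGISTER(AddKernel_CPU_float)  // OK: a single token, hence the typedef

    int main() { return 0; }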
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class AddKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext& context) const override {
+    auto input0 = context.Input(0)->Get<framework::Tensor>();
+    auto input1 = context.Input(1)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+
+    output->mutable_data<T>(context.GetPlace());
+
+    output->flat<T>().device(*(context.GetEigenDevice<Place>())) =
+        input0.flat<T>() + input1.flat<T>();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/add_op_test.cc b/paddle/operators/add_op_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53b354fedcacf2176aed8b504daf2046bdf96bb6
--- /dev/null
+++ b/paddle/operators/add_op_test.cc
@@ -0,0 +1,23 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#define private public
+#include <paddle/framework/op_registry.h>
+USE_OP(add_two);
+TEST(AddOp, GetOpProto) {
+  auto& protos = paddle::framework::OpRegistry::protos();
+  auto it = protos.find("add_two");
+  ASSERT_NE(it, protos.end());
+}
\ No newline at end of file
diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..713b2a5dc83d8dd5a3d944101591d75cb19fe04f
--- /dev/null
+++ b/paddle/operators/mul_op.cc
@@ -0,0 +1,60 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
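AddKernel::Compute above is a thin wrapper over Eigen's tensor expressions: both inputs are viewed as flat one-dimensional tensors and the sum is evaluated on the device taken from the kernel context. The same computation with the framework plumbing stripped away (Eigen's unsupported Tensor module is the only dependency; DefaultDevice stands in for GetEigenDevice<Place>()):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<float, 1> x(4), y(4), out(4);
      x.setConstant(1.0f);
      y.setConstant(2.0f);

      // Equivalent of:
      //   output->flat<T>().device(dev) = input0.flat<T>() + input1.flat<T>();
      Eigen::DefaultDevice cpu;
      out.device(cpu) = x + y;

      assert(out(0) == 3.0f);
      return 0;
    }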
+
+#include <paddle/operators/mul_op.h>
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/tensor.h>
+
+namespace paddle {
+namespace operators {
+
+class MulOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs");
+    auto dim0 = inputs[0]->dims();
+    auto dim1 = inputs[1]->dims();
+    PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
+                   "The inputs of mul op must both be matrices");
+    PADDLE_ENFORCE(
+        dim0[1] == dim1[0],
+        "First matrix's width must be equal to second matrix's height.");
+    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
+    outputs[0]->set_dims({dim0[0], dim1[1]});
+  }
+};
+
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..201723df247993c5cc1650edbe4f74441e3217d4
--- /dev/null
+++ b/paddle/operators/mul_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/operators/mul_op.h>
+#include <paddle/framework/op_registry.h>
+
+REGISTER_OP_GPU_KERNEL(mul,
+                       paddle::operators::MulKernel<paddle::platform::GPUPlace>);
\ No newline at end of file
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..ce8a0169e0cbaafb7e90d2227c9597fff463883d
--- /dev/null
+++ b/paddle/operators/mul_op.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
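MulOp::InferShape above is ordinary matrix-multiplication shape algebra: an [M, K] input times a [K, N] input yields an [M, N] output, and the inner K dimensions must agree. A worked check of that rule on plain arrays (the helper name is illustrative, not part of the patch):

    #include <array>
    #include <cassert>

    std::array<int, 2> InferMulShape(std::array<int, 2> x,
                                     std::array<int, 2> y) {
      // First matrix's width must equal the second matrix's height.
      assert(x[1] == y[0]);
      return {x[0], y[1]};
    }

    int main() {
      auto out = InferMulShape({2, 3}, {3, 4});  // [2,3] x [3,4] -> [2,4]
      assert(out[0] == 2 && out[1] == 4);
      return 0;
    }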
+
+#pragma once
+
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+template <typename Place>
+class MulKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "Mul kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..414bafd0468033813d50d4d6723e68ee9347eaac
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.cc
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/operators/rowwise_add_op.h>
+#include <paddle/framework/op_registry.h>
+namespace paddle {
+namespace operators {
+
+class RowWiseAddOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs are needed by rowwise add");
+    auto dim0 = inputs[0]->dims();
+    auto dim1 = inputs[1]->dims();
+
+    PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be a matrix");
+    PADDLE_ENFORCE(dim1.size() == 1, "The second input must be a vector");
+    PADDLE_ENFORCE(dim0[1] == dim1[0],
+                   "The width of the matrix must equal the length of the vector");
+    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left input of row-wise add op, must be a matrix");
+    AddInput("b", "The right input of row-wise add op, must be a vector");
+    AddOutput("Out", "The output of row-wise add op");
+    AddComment(R"DOC(Row-wise Add operator
+
+for i in xrange(X.shape[0]):
+  Out = X[i] + b
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(rowwise_add,
+            paddle::operators::RowWiseAddOp,
+            paddle::operators::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..2c4bfbf93a1064a47a19c991fa6655b5d67e83cb
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.cu
@@ -0,0 +1,6 @@
+#include <paddle/operators/rowwise_add_op.h>
+#include <paddle/framework/op_registry.h>
+
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform::GPUPlace>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..35f43e6376be6239021e7a9bacb849b93d5226b5
--- /dev/null
+++ b/paddle/operators/rowwise_add_op.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
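The RowWiseAdd kernel shipped in this patch is, like MulKernel above, still a logging stub; the DOC string pins down the intended semantics: the vector b is added to every row of X. A self-contained Eigen sketch of that semantics (my own illustration, not the patch's eventual kernel):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <cassert>

    int main() {
      Eigen::Tensor<float, 2> X(2, 3);
      Eigen::Tensor<float, 1> b(3);
      X.setConstant(1.0f);
      b.setValues({10.0f, 20.0f, 30.0f});

      // for i in xrange(X.shape[0]): Out[i] = X[i] + b
      Eigen::Tensor<float, 2> Out(2, 3);
      for (Eigen::Index i = 0; i < X.dimension(0); ++i) {
        Out.chip(i, 0) = X.chip(i, 0) + b;
      }

      assert(Out(0, 1) == 21.0f);
      return 0;
    }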
+
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+template <typename Place>
+class RowWiseAddKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..45ae277c538ca90716febaf2f3d92b560149d147
--- /dev/null
+++ b/paddle/operators/sigmoid_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <paddle/operators/sigmoid_op.h>
+#include <paddle/framework/op_registry.h>
+namespace paddle {
+namespace operators {
+
+class SigmoidOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only has one input");
+    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only has one output");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "sigmoid input");
+    AddOutput("Y", "sigmoid output");
+    AddComment("Sigmoid function");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(sigmoid,
+            paddle::operators::SigmoidOp,
+            paddle::operators::SigmoidOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..79d5222348f610b1b016a2df06e8b1e0a4fac66c
--- /dev/null
+++ b/paddle/operators/sigmoid_op.cu
@@ -0,0 +1,5 @@
+#include <paddle/operators/sigmoid_op.h>
+#include <paddle/framework/op_registry.h>
+
+REGISTER_OP_GPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..42173343f3e364729ecd190fc554b8c45ecfca8d
--- /dev/null
+++ b/paddle/operators/sigmoid_op.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
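SigmoidKernel is also still a stub; the op it will eventually implement is the elementwise logistic function. A minimal numeric check of that function, independent of the framework:

    #include <cassert>
    #include <cmath>

    // Elementwise logistic sigmoid: y = 1 / (1 + exp(-x)).
    float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

    int main() {
      assert(sigmoid(0.0f) == 0.5f);      // symmetric around zero
      assert(sigmoid(10.0f) > 0.9999f);   // saturates toward 1
      assert(sigmoid(-10.0f) < 0.0001f);  // saturates toward 0
      return 0;
    }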
+
+#pragma once
+
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+
+namespace paddle {
+namespace operators {
+
+template <typename Place>
+class SigmoidKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4ca7be359e210d7a31aef94e498f37a1ad4879a2
--- /dev/null
+++ b/paddle/operators/softmax_op.cc
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+
+namespace paddle {
+namespace operators {
+
+class SoftmaxOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is needed for softmax");
+    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is needed for softmax");
+
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input of softmax");
+    AddOutput("Y", "output of softmax");
+    AddComment("Softmax Op");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
+REGISTER_OP_CPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::CPUPlace>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..903eef1b62231d65e2f9ec7a1f57fca0f4c4605c
--- /dev/null
+++ b/paddle/operators/softmax_op.cu
@@ -0,0 +1,5 @@
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+
+REGISTER_OP_GPU_KERNEL(
+    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..74e9e2786b11b9a87cd9700d8458d4e611a8d4bb
--- /dev/null
+++ b/paddle/operators/softmax_op.h
@@ -0,0 +1,31 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
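SoftmaxOp above only propagates shape for now; the function the kernel will compute is Y[i] = exp(X[i]) / sum_j exp(X[j]). A numerically stable reference version (subtracting the maximum first is a standard trick, not something this patch implements):

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <numeric>
    #include <vector>

    std::vector<float> softmax(std::vector<float> x) {
      float max = *std::max_element(x.begin(), x.end());
      float sum = 0.0f;
      for (auto& v : x) {
        v = std::exp(v - max);  // shift by the max for numerical stability
        sum += v;
      }
      for (auto& v : x) v /= sum;
      return x;
    }

    int main() {
      auto y = softmax({1.0f, 2.0f, 3.0f});
      float total = std::accumulate(y.begin(), y.end(), 0.0f);
      assert(std::fabs(total - 1.0f) < 1e-6f);  // probabilities sum to one
      return 0;
    }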
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include + +namespace paddle { +namespace operators { + +template +class SoftmaxKernel : public framework::OpKernel { +public: + void Compute(const framework::KernelContext &context) const override { + LOG(INFO) << "Softmax kernel in " << typeid(Place).name(); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/optimizer/optimizer.cc b/paddle/optimizer/optimizer.cc index 54662dc37891d3211950453b210db4b475837df4..eb7125adee769c97e16986cabf06ea389bf4c143 100644 --- a/paddle/optimizer/optimizer.cc +++ b/paddle/optimizer/optimizer.cc @@ -44,8 +44,8 @@ paddle_optimizer* paddle_create_optimizer(const unsigned char* config_proto, const int state_len) { paddle_optimizer* optimizer = new paddle_optimizer; std::string config(config_proto, config_proto + config_proto_len); - Tensor* parameter = - new Tensor(reinterpret_cast(param_buffer), num_bytes); + Tensor* parameter = new Tensor(reinterpret_cast(param_buffer), + num_bytes / sizeof(float)); optimizer->impl = ParameterOptimizer::Create(config, parameter); if (state != nullptr) { std::string s(state, state + state_len); @@ -65,7 +65,8 @@ int paddle_update_parameter(paddle_optimizer* o, int num_bytes) { // TOOD(zhihong): datatype not work. need to add the runtime datatype auto grad_type = reinterpret_cast(grad_buffer); - Tensor* gradient = new Tensor(const_cast(grad_type), num_bytes); + Tensor* gradient = + new Tensor(const_cast(grad_type), num_bytes / sizeof(float)); o->impl->Update(gradient); return PADDLE_SUCCESS; } diff --git a/paddle/optimizer/parameter_optimizer_test.cpp b/paddle/optimizer/parameter_optimizer_test.cpp index 4e6254d9e4dab48279b4a880695959526d30d70c..edf4ae37a9beee2911d23dd1ab23e67a18065b1b 100644 --- a/paddle/optimizer/parameter_optimizer_test.cpp +++ b/paddle/optimizer/parameter_optimizer_test.cpp @@ -1,3 +1,19 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
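The two optimizer.cc hunks above fix a unit mismatch: Tensor's constructor takes an element count, but callers were passing a byte count, so every tensor claimed four times as many floats as its buffer held. The corrected conversion is just:

    #include <cassert>
    #include <cstddef>

    int main() {
      const size_t num_bytes = 1024;  // size of the incoming parameter buffer
      const size_t num_elements = num_bytes / sizeof(float);
      assert(num_elements == 256);    // what Tensor must be told, not 1024
      return 0;
    }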
+*/ + #include "parameter_optimizer.h" #include #include @@ -5,21 +21,18 @@ #include "gtest/gtest.h" #include "lr_policy.h" -using namespace paddle; -using namespace paddle::optimizer; - -Tensor* FillTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FillTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = (float)rand() / (float)RAND_MAX; } return param; } -Tensor* FixedTensor(size_t size) { - Tensor* param = new Tensor(size); - Tensor& p = *param; +paddle::optimizer::Tensor* FixedTensor(size_t size) { + paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size); + paddle::optimizer::Tensor& p = *param; for (size_t i = 0; i < p.size(); ++i) { p[i] = i; } @@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) { class OptimizerTest : public testing::Test { public: - // init tensor shape + virtual ~OptimizerTest() {} + // init paddle::optimizer::Tensor shape const size_t kSize = 5; virtual void SetUp() { @@ -38,34 +52,36 @@ public: virtual void TearDown() {} void CreateSGD() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::SGD); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::SGD); config_.mutable_sgd()->set_momentum(0.0); config_.mutable_sgd()->set_decay(0.0); config_.mutable_sgd()->set_nesterov(false); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void CreateAdam() { - Tensor* parameter = FixedTensor(kSize); - config_.set_optimizer(OptimizerConfig::Adam); + paddle::optimizer::Tensor* parameter = FixedTensor(kSize); + config_.set_optimizer(paddle::OptimizerConfig::Adam); config_.mutable_adam()->set_beta_1(0.9); config_.mutable_adam()->set_beta_2(0.1); config_.mutable_adam()->set_epsilon(1e-3); config_.mutable_adam()->set_decay(0.0); - config_.set_lr_policy(OptimizerConfig::Const); + config_.set_lr_policy(paddle::OptimizerConfig::Const); config_.mutable_const_lr()->set_learning_rate(0.1); std::string str = config_.SerializeAsString(); - ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter); + paddle::optimizer::ParameterOptimizer* opt = + paddle::optimizer::ParameterOptimizer::Create(str, parameter); opts_.push_back(opt); } void TestGetWeight() { - Tensor* p = FixedTensor(kSize); + paddle::optimizer::Tensor* p = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { int s = 0; float* newp = (float*)opts_[i]->get_weight(&s); @@ -76,7 +92,7 @@ public: } void TestUpdate() { - Tensor* g = FixedTensor(kSize); + paddle::optimizer::Tensor* g = FixedTensor(kSize); for (size_t i = 0; i < opts_.size(); ++i) { opts_[i]->Update(g); } @@ -91,8 +107,8 @@ public: } private: - std::vector opts_; - OptimizerConfig config_; + std::vector opts_; + paddle::OptimizerConfig config_; }; TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } diff --git a/paddle/optimizer/serialization_test.cpp b/paddle/optimizer/serialization_test.cpp index d2454140dc243b40ed8348578360b30894213838..e4d97cbdba545c4ba5adf5b30efd3fc9f3f744ee 100644 --- 
a/paddle/optimizer/serialization_test.cpp +++ b/paddle/optimizer/serialization_test.cpp @@ -1,19 +1,32 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + #include "serialization.h" #include "gtest/gtest.h" -using namespace paddle; -using namespace paddle::optimizer; - TEST(TensorToProto, Case1) { - Tensor t(3), t1(3); + paddle::optimizer::Tensor t(3), t1(3); for (size_t i = 0; i < t.size(); ++i) { t[i] = i; t1[i] = 0; } - TensorProto proto; - TensorToProto(t, &proto); - ProtoToTensor(proto, &t1); + paddle::TensorProto proto; + paddle::optimizer::TensorToProto(t, &proto); + paddle::optimizer::ProtoToTensor(proto, &t1); for (size_t i = 0; i < t1.size(); ++i) { EXPECT_EQ(t1[i], t[i]); } diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt index ebacd5d6dc850ebde5212bba39aea4dde8e4eb03..6ac4035c0f863c5f63d17b523a7a8be668ff3da0 100644 --- a/paddle/platform/CMakeLists.txt +++ b/paddle/platform/CMakeLists.txt @@ -1,8 +1,18 @@ -add_subdirectory(dynload) +cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog) +cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) -nv_test(cuda_test SRCS cuda_test.cu) +nv_library(gpu_info SRCS gpu_info.cc DEPS gflags) cc_library(place SRCS place.cc) cc_test(place_test SRCS place_test.cc DEPS place glog gflags) -nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags) +add_subdirectory(dynload) + +IF(WITH_GPU) + set(GPU_CTX_DEPS dynload_cuda dynamic_loader) +ELSE() + set(GPU_CTX_DEPS) +ENDIF() + +cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS}) +nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info) diff --git a/paddle/platform/cpu_info.cc b/paddle/platform/cpu_info.cc new file mode 100644 index 0000000000000000000000000000000000000000..dfab391cfbe1f04bc2a998233f7e7909579ca72b --- /dev/null +++ b/paddle/platform/cpu_info.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/platform/cpu_info.h" + +#ifdef __APPLE__ +#include +#include +#else +#include +#endif + +#include "gflags/gflags.h" +#include "paddle/platform/error.h" + +DEFINE_double(fraction_of_cpu_memory_to_use, 1, + "Default use 100% of CPU memory for PaddlePaddle," + "reserve the rest for page tables, etc"); + +namespace paddle { +namespace platform { + +inline size_t CpuTotalPhysicalMemory() { +#ifdef __APPLE__ + int mib[2]; + mib[0] = CTL_HW; + mib[1] = HW_MEMSIZE; + int64_t size = 0; + size_t len = sizeof(size); + if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size; + return 0L; +#else + int64_t pages = sysconf(_SC_PHYS_PAGES); + int64_t page_size = sysconf(_SC_PAGE_SIZE); + return pages * page_size; +#endif +} + +size_t CpuMaxAllocSize() { + // For distributed systems, it requires configuring and limiting + // the fraction of memory to use. + return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory(); +} + +size_t CpuMinChunkSize() { + // Allow to allocate the minimum chunk size is 4 KB. + return 1 << 12; +} + +size_t CpuMaxChunkSize() { + // Allow to allocate the maximum chunk size is roughly 3% of CPU memory. + return CpuMaxAllocSize() / 32; +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info.h b/paddle/platform/cpu_info.h new file mode 100644 index 0000000000000000000000000000000000000000..8df7c7b4bca5bc88f6ed95d6ab82c81b73918e92 --- /dev/null +++ b/paddle/platform/cpu_info.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +namespace paddle { +namespace platform { + +//! Get the maximum allocation size for a machine. +size_t CpuMaxAllocSize(); + +//! Get the minimum chunk size for buddy allocator. +size_t CpuMinChunkSize(); + +//! Get the maximum chunk size for buddy allocator. 
+size_t CpuMaxChunkSize(); + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cpu_info_test.cc b/paddle/platform/cpu_info_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..8fb195aa7c0a41b7417ff5cf63394046e9c72267 --- /dev/null +++ b/paddle/platform/cpu_info_test.cc @@ -0,0 +1,21 @@ +#include "paddle/platform/cpu_info.h" +#include "paddle/string/printf.h" + +#include +#include + +#include "gflags/gflags.h" +#include "glog/logging.h" +#include "gtest/gtest.h" + +DECLARE_double(fraction_of_cpu_memory_to_use); + +TEST(CpuMemoryUsage, Print) { + std::stringstream ss; + size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024; + float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100; + + std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n", + use_percent, memory_size) + << std::endl; +} diff --git a/paddle/platform/cuda_test.cu b/paddle/platform/cuda_test.cu deleted file mode 100644 index 4067dda2f19f7661722d8a14a27c7b32ed6afc92..0000000000000000000000000000000000000000 --- a/paddle/platform/cuda_test.cu +++ /dev/null @@ -1,59 +0,0 @@ -#include -#include -#include "gtest/gtest.h" - -#define CHECK_ERR(x) \ - if (x != cudaSuccess) { \ - fprintf(stderr, \ - "%s in %s at line %d\n", \ - cudaGetErrorString(err), \ - __FILE__, \ - __LINE__); \ - exit(-1); \ - } - -__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) { - int i = blockDim.x * blockIdx.x + threadIdx.x; - if (i < n) { - d_C[i] = d_A[i] + d_B[i]; - } -} - -TEST(Cuda, Equality) { - int n = 10; - // Memory allocation for h_A, h_B and h_C (in the host) - float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0}; - float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0}; - float h_C[10]; - float *d_A, *d_B, *d_C; - cudaError_t err; - // Memory allocation for d_A, d_B and d_C (in the device) - err = cudaMalloc((void **)&d_A, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_B, sizeof(float) * n); - CHECK_ERR(err); - - err = cudaMalloc((void **)&d_C, sizeof(float) * n); - CHECK_ERR(err); - - // Copying memory to device - err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice); - CHECK_ERR(err); - - // Calling the kernel - vecAdd<<>>(d_A, d_B, d_C, n); - - // Copying results back to host - err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost); - CHECK_ERR(err); - - EXPECT_EQ(h_C[0], 1.0); - for (int i = 1; i < n - 1; ++i) { - EXPECT_EQ(h_C[i], 11.0); - } - EXPECT_EQ(h_C[9], 1.0); -} diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc new file mode 100644 index 0000000000000000000000000000000000000000..9c1d94e9e703caf2db92ca4a8eac975317e6b945 --- /dev/null +++ b/paddle/platform/device_context.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
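The three CPU sizing knobs above compose simply: with the default flag value of 1.0, CpuMaxAllocSize() is the whole physical memory, chunks start at 4 KB (1 << 12), and the largest chunk is capped at 1/32 (roughly 3%) of the allocatable total. Worked numbers for a hypothetical 16 GB machine:

    #include <cassert>
    #include <cstddef>

    int main() {
      const size_t total = 16UL << 30;  // assume 16 GB of physical memory
      const double fraction = 1.0;      // FLAGS_fraction_of_cpu_memory_to_use
      const size_t max_alloc = static_cast<size_t>(fraction * total);
      const size_t min_chunk = 1 << 12;        // 4 KB
      const size_t max_chunk = max_alloc / 32;  // 1/32 of allocatable memory
      assert(min_chunk == 4096);
      assert(max_chunk == (512UL << 20));  // 512 MB on this machine
      return 0;
    }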
*/ + +#include "paddle/platform/device_context.h" + +namespace paddle { +namespace platform { + +template <> +Eigen::DefaultDevice* DeviceContext::get_eigen_device() + const { + return reinterpret_cast(this)->eigen_device(); +} + +#ifndef PADDLE_ONLY_CPU +template <> +Eigen::GpuDevice* DeviceContext::get_eigen_device() const { + return reinterpret_cast(this)->eigen_device(); +} +#endif + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h index fcef0a5e3058f1c9d54f9e06a54a09286e2454fd..f226a75c20b7a75e5f884cd158d139ebb8b34e47 100644 --- a/paddle/platform/device_context.h +++ b/paddle/platform/device_context.h @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -16,12 +13,14 @@ limitations under the License. */ #include "paddle/framework/enforce.h" #ifndef PADDLE_ONLY_CPU -#include "paddle/platform/cuda.h" #include "paddle/platform/dynload/cublas.h" #include "paddle/platform/dynload/cudnn.h" #include "paddle/platform/dynload/curand.h" +#include "paddle/platform/error.h" +#include "paddle/platform/gpu_info.h" #define EIGEN_USE_GPU #endif +#include #include "paddle/platform/place.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -31,11 +30,29 @@ namespace platform { class DeviceContext { public: virtual ~DeviceContext() {} + virtual Place GetPlace() const = 0; + + template + DeviceType* get_eigen_device() const; }; -class CPUDeviceContext : public DeviceContext {}; +class CPUDeviceContext : public DeviceContext { + public: + CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); } + + Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); } + + Place GetPlace() const override { + Place retv = CPUPlace(); + return retv; + } + + private: + std::unique_ptr eigen_device_; +}; #ifndef PADDLE_ONLY_CPU + class GPUPlaceGuard { public: explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) { @@ -56,8 +73,13 @@ class CUDADeviceContext : public DeviceContext { GPUPlaceGuard guard(gpu_place_); paddle::platform::throw_on_error(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); - eigen_stream_ = new Eigen::CudaStreamDevice(&stream_); - eigen_device_ = new Eigen::GpuDevice(eigen_stream_); + eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_)); + eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); + } + + Place GetPlace() const override { + Place retv = GPUPlace(); + return retv; } void Wait() { @@ -67,7 +89,7 @@ class CUDADeviceContext : public DeviceContext { cudaStream_t stream() { return stream_; } - Eigen::GpuDevice eigen_device() { return *eigen_device_; } + Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } cublasHandle_t cublas_handle() { if (!blas_handle_) { @@ -132,10 +154,8 @@ class CUDADeviceContext : public DeviceContext { rand_generator_) == CURAND_STATUS_SUCCESS, "curandDestroyGenerator failed"); } - - delete eigen_stream_; - delete eigen_device_; - + eigen_stream_.reset(); + eigen_device_.reset(); paddle::platform::throw_on_error(cudaStreamDestroy(stream_), 
"cudaStreamDestroy failed"); } @@ -144,8 +164,8 @@ class CUDADeviceContext : public DeviceContext { GPUPlace gpu_place_; cudaStream_t stream_; - Eigen::CudaStreamDevice* eigen_stream_; - Eigen::GpuDevice* eigen_device_; + std::unique_ptr eigen_stream_; + std::unique_ptr eigen_device_; cublasHandle_t blas_handle_{nullptr}; @@ -154,6 +174,8 @@ class CUDADeviceContext : public DeviceContext { int random_seed_; curandGenerator_t rand_generator_{nullptr}; }; + #endif + } // namespace platform } // namespace paddle diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc index 61be4a307dbf073be7dff4564183240834cc7df6..af2ce17fc2238dda62e9888ebe9426edcd55d2bc 100644 --- a/paddle/platform/device_context_test.cc +++ b/paddle/platform/device_context_test.cc @@ -15,13 +15,26 @@ limitations under the License. */ #include "paddle/platform/device_context.h" #include "gtest/gtest.h" -TEST(CUDADeviceContext, Init) { +using DEVICE_GPU = Eigen::GpuDevice; +TEST(Device, Init) { + int count = paddle::platform::GetDeviceCount(); + for (int i = 0; i < count; i++) { + paddle::platform::DeviceContext* device_context = + new paddle::platform::CUDADeviceContext(i); + Eigen::GpuDevice* gpu_device = + device_context->template get_eigen_device(); + ASSERT_NE(nullptr, gpu_device); + delete device_context; + } +} + +TEST(Device, CUDADeviceContext) { int count = paddle::platform::GetDeviceCount(); for (int i = 0; i < count; i++) { paddle::platform::CUDADeviceContext* device_context = new paddle::platform::CUDADeviceContext(i); - Eigen::GpuDevice gpu_device = device_context->eigen_device(); - ASSERT_NE(nullptr, gpu_device.stream()); + Eigen::GpuDevice* gpu_device = device_context->eigen_device(); + ASSERT_NE(nullptr, gpu_device); cudnnHandle_t cudnn_handle = device_context->cudnn_handle(); ASSERT_NE(nullptr, cudnn_handle); cublasHandle_t cublas_handle = device_context->cublas_handle(); diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt index 9f829b70128655f018c59db32b95d3f1789da5fc..d205ead84598e04eea523be32139959a02e0dd83 100644 --- a/paddle/platform/dynload/CMakeLists.txt +++ b/paddle/platform/dynload/CMakeLists.txt @@ -1 +1,2 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags) +nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc) diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e3dfdaefb2348346e8f917b1f6c758bf6d91a1a --- /dev/null +++ b/paddle/platform/dynload/cublas.cc @@ -0,0 +1,15 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cublas_dso_flag; +void *cublas_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h index 258cc88031a71e9fee65b5445bd5537d6782e226..c44b7240a885c2ef71e550df645dbaded69f9944 100644 --- a/paddle/platform/dynload/cublas.h +++ b/paddle/platform/dynload/cublas.h @@ -23,8 +23,8 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cublas_dso_flag; -void *cublas_dso_handle = nullptr; +extern std::once_flag cublas_dso_flag; +extern void *cublas_dso_handle; /** * The following macro definition can generate structs @@ -34,10 +34,10 @@ void *cublas_dso_handle = nullptr; * note: default dynamic linked libs */ 
#ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ struct DynLoad__##__name { \ template \ - cublasStatus_t operator()(Args... args) { \ + inline cublasStatus_t operator()(Args... args) { \ typedef cublasStatus_t (*cublasFunc)(Args...); \ std::call_once(cublas_dso_flag, \ paddle::platform::dynload::GetCublasDsoHandle, \ @@ -45,62 +45,46 @@ void *cublas_dso_handle = nullptr; void *p_##__name = dlsym(cublas_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ - } __name; // struct DynLoad__##__name + }; \ + extern DynLoad__##__name __name #else -#define DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cublasStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ - } __name; // struct DynLoad__##__name +#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \ + struct DynLoad__##__name { \ + inline template \ + cublasStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name #endif -#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name) +#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ + DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) -// include all needed cublas functions in HPPL -// clang-format off #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasSgemv) \ - __macro(cublasDgemv) \ - __macro(cublasSgemm) \ - __macro(cublasDgemm) \ - __macro(cublasSgeam) \ - __macro(cublasDgeam) \ + __macro(cublasSgemv); \ + __macro(cublasDgemv); \ + __macro(cublasSgemm); \ + __macro(cublasDgemm); \ + __macro(cublasSgeam); \ + __macro(cublasDgeam); \ + __macro(cublasCreate_v2); \ + __macro(cublasDestroy_v2); \ + __macro(cublasSetStream_v2); \ + __macro(cublasSetPointerMode_v2); \ + __macro(cublasGetPointerMode_v2); \ + __macro(cublasSgemmBatched); \ + __macro(cublasDgemmBatched); \ + __macro(cublasCgemmBatched); \ + __macro(cublasZgemmBatched); \ + __macro(cublasSgetrfBatched); \ + __macro(cublasSgetriBatched); \ + __macro(cublasDgetrfBatched); \ + __macro(cublasDgetriBatched) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode) -DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched) -DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched) -CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) +CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP); -#undef DYNAMIC_LOAD_CUBLAS_WRAP -#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP -#undef CUBLAS_BLAS_ROUTINE_EACH - -// clang-format on -#ifndef PADDLE_TYPE_DOUBLE -#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam -#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv -#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm -#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched -#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched -#else -#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam -#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv -#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm -#define CUBLAS_GETRF 
paddle::platform::dynload::cublasDgetrfBatched -#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched -#endif +#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc new file mode 100644 index 0000000000000000000000000000000000000000..8b5e15b5efcdae6a1eed09f002eb2f4f2163035f --- /dev/null +++ b/paddle/platform/dynload/cudnn.cc @@ -0,0 +1,28 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { +std::once_flag cudnn_dso_flag; +void* cudnn_dso_handle = nullptr; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP); +CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP); + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); +#endif + +#ifdef CUDNN_DNN_ROUTINE_EACH_R5 +CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle \ No newline at end of file diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index 0a9562c573cdfe059ef7caa39ba62efa87225e41..ef0dd85b083dc2335dd5c70d3dc5f59eda25daeb 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -23,12 +23,12 @@ namespace paddle { namespace platform { namespace dynload { -std::once_flag cudnn_dso_flag; -void* cudnn_dso_handle = nullptr; +extern std::once_flag cudnn_dso_flag; +extern void* cudnn_dso_handle; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... args) -> decltype(__name(args...)) { \ @@ -39,17 +39,19 @@ void* cudnn_dso_handle = nullptr; void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ - } __name; /* struct DynLoad__##__name */ + }; \ + extern struct DynLoad__##__name __name #else -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ struct DynLoad__##__name { \ template \ auto operator()(Args... 
args) -> decltype(__name(args...)) { \ return __name(args...); \ } \ - } __name; /* struct DynLoad__##__name */ + }; \ + extern DynLoad__##__name __name #endif @@ -57,80 +59,73 @@ void* cudnn_dso_handle = nullptr; * include all needed cudnn functions in HPPL * different cudnn version has different interfaces **/ -// clang-format off -#define CUDNN_DNN_ROUTINE_EACH(__macro) \ - __macro(cudnnSetTensor4dDescriptor) \ - __macro(cudnnSetTensor4dDescriptorEx) \ - __macro(cudnnGetConvolutionNdForwardOutputDim) \ - __macro(cudnnGetConvolutionForwardAlgorithm) \ - __macro(cudnnCreateTensorDescriptor) \ - __macro(cudnnDestroyTensorDescriptor) \ - __macro(cudnnCreateFilterDescriptor) \ - __macro(cudnnSetFilter4dDescriptor) \ - __macro(cudnnSetPooling2dDescriptor) \ - __macro(cudnnDestroyFilterDescriptor) \ - __macro(cudnnCreateConvolutionDescriptor) \ - __macro(cudnnCreatePoolingDescriptor) \ - __macro(cudnnDestroyPoolingDescriptor) \ - __macro(cudnnSetConvolution2dDescriptor) \ - __macro(cudnnDestroyConvolutionDescriptor) \ - __macro(cudnnCreate) \ - __macro(cudnnDestroy) \ - __macro(cudnnSetStream) \ - __macro(cudnnActivationForward) \ - __macro(cudnnConvolutionForward) \ - __macro(cudnnConvolutionBackwardBias) \ - __macro(cudnnGetConvolutionForwardWorkspaceSize) \ - __macro(cudnnTransformTensor) \ - __macro(cudnnPoolingForward) \ - __macro(cudnnPoolingBackward) \ - __macro(cudnnSoftmaxBackward) \ - __macro(cudnnSoftmaxForward) \ - __macro(cudnnGetVersion) \ - __macro(cudnnGetErrorString) -CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) - -#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ - __macro(cudnnAddTensor) \ - __macro(cudnnConvolutionBackwardData) \ - __macro(cudnnConvolutionBackwardFilter) -CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP) +#define CUDNN_DNN_ROUTINE_EACH(__macro) \ + __macro(cudnnSetTensor4dDescriptor); \ + __macro(cudnnSetTensor4dDescriptorEx); \ + __macro(cudnnGetConvolutionNdForwardOutputDim); \ + __macro(cudnnGetConvolutionForwardAlgorithm); \ + __macro(cudnnCreateTensorDescriptor); \ + __macro(cudnnDestroyTensorDescriptor); \ + __macro(cudnnCreateFilterDescriptor); \ + __macro(cudnnSetFilter4dDescriptor); \ + __macro(cudnnSetPooling2dDescriptor); \ + __macro(cudnnDestroyFilterDescriptor); \ + __macro(cudnnCreateConvolutionDescriptor); \ + __macro(cudnnCreatePoolingDescriptor); \ + __macro(cudnnDestroyPoolingDescriptor); \ + __macro(cudnnSetConvolution2dDescriptor); \ + __macro(cudnnDestroyConvolutionDescriptor); \ + __macro(cudnnCreate); \ + __macro(cudnnDestroy); \ + __macro(cudnnSetStream); \ + __macro(cudnnActivationForward); \ + __macro(cudnnConvolutionForward); \ + __macro(cudnnConvolutionBackwardBias); \ + __macro(cudnnGetConvolutionForwardWorkspaceSize); \ + __macro(cudnnTransformTensor); \ + __macro(cudnnPoolingForward); \ + __macro(cudnnPoolingBackward); \ + __macro(cudnnSoftmaxBackward); \ + __macro(cudnnSoftmaxForward); \ + __macro(cudnnGetVersion); \ + __macro(cudnnGetErrorString); +CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) + +#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ + __macro(cudnnAddTensor); \ + __macro(cudnnConvolutionBackwardData); \ + __macro(cudnnConvolutionBackwardFilter); +CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) // APIs available after R3: #if CUDNN_VERSION >= 3000 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ - __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize) \ - __macro(cudnnGetConvolutionBackwardDataAlgorithm) \ - __macro(cudnnGetConvolutionBackwardFilterAlgorithm) \ - 
__macro(cudnnGetConvolutionBackwardDataWorkspaceSize) -CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro) \ + __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \ + __macro(cudnnGetConvolutionBackwardDataAlgorithm); \ + __macro(cudnnGetConvolutionBackwardFilterAlgorithm); \ + __macro(cudnnGetConvolutionBackwardDataWorkspaceSize); +CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif - // APIs available after R4: #if CUDNN_VERSION >= 4007 -#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ - __macro(cudnnBatchNormalizationForwardTraining) \ - __macro(cudnnBatchNormalizationForwardInference) \ - __macro(cudnnBatchNormalizationBackward) -CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4 +#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro) \ + __macro(cudnnBatchNormalizationForwardTraining); \ + __macro(cudnnBatchNormalizationForwardInference); \ + __macro(cudnnBatchNormalizationBackward); +CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif // APIs in R5 #if CUDNN_VERSION >= 5000 -#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ - __macro(cudnnCreateActivationDescriptor) \ - __macro(cudnnSetActivationDescriptor) \ - __macro(cudnnGetActivationDescriptor) \ - __macro(cudnnDestroyActivationDescriptor) -CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP) -#undef CUDNN_DNN_ROUTINE_EACH_R5 +#define CUDNN_DNN_ROUTINE_EACH_R5(__macro) \ + __macro(cudnnCreateActivationDescriptor); \ + __macro(cudnnSetActivationDescriptor); \ + __macro(cudnnGetActivationDescriptor); \ + __macro(cudnnDestroyActivationDescriptor); +CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif -#undef CUDNN_DNN_ROUTINE_EACH -// clang-format on } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc new file mode 100644 index 0000000000000000000000000000000000000000..5c1fab992c98569d4a95b6e699d97d428511e48e --- /dev/null +++ b/paddle/platform/dynload/curand.cc @@ -0,0 +1,15 @@ +#include + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag curand_dso_flag; +void *curand_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP); +} +} +} \ No newline at end of file diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h index 9dc0a25c0fbdc3f73f1dd82206e8940972b0b7f5..d8c46bc41e18d013a80cd0a9116a4b1a52bf5854 100644 --- a/paddle/platform/dynload/curand.h +++ b/paddle/platform/dynload/curand.h @@ -22,10 +22,10 @@ limitations under the License. */ namespace paddle { namespace platform { namespace dynload { -std::once_flag curand_dso_flag; -void *curand_dso_handle = nullptr; +extern std::once_flag curand_dso_flag; +extern void *curand_dso_handle; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ struct DynLoad__##__name { \ template \ curandStatus_t operator()(Args... args) { \ @@ -36,32 +36,29 @@ void *curand_dso_handle = nullptr; void *p_##__name = dlsym(curand_dso_handle, #__name); \ return reinterpret_cast(p_##__name)(args...); \ } \ - } __name; /* struct DynLoad__##__name */ + }; \ + extern DynLoad__##__name __name #else -#define DYNAMIC_LOAD_CURAND_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - curandStatus_t operator()(Args... 
args) { \ - return __name(args...); \ - } \ - } __name; /* struct DynLoad__##__name */ +#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + curandStatus_t operator()(Args... args) { \ + return __name(args...); \ + } \ + }; \ + extern DynLoad__##__name __name #endif -/* include all needed curand functions in HPPL */ -// clang-format off -#define CURAND_RAND_ROUTINE_EACH(__macro) \ - __macro(curandCreateGenerator) \ - __macro(curandSetStream) \ - __macro(curandSetPseudoRandomGeneratorSeed)\ - __macro(curandGenerateUniform) \ - __macro(curandGenerateUniformDouble) \ - __macro(curandDestroyGenerator) -// clang-format on +#define CURAND_RAND_ROUTINE_EACH(__macro) \ + __macro(curandCreateGenerator); \ + __macro(curandSetStream); \ + __macro(curandSetPseudoRandomGeneratorSeed); \ + __macro(curandGenerateUniform); \ + __macro(curandGenerateUniformDouble); \ + __macro(curandDestroyGenerator); -CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP) +CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP); -#undef CURAND_RAND_ROUTINE_EACH -#undef DYNAMIC_LOAD_CURAND_WRAP } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/error.h b/paddle/platform/error.h new file mode 100644 index 0000000000000000000000000000000000000000..93424bb61096503a4843da7942853a113f612e3b --- /dev/null +++ b/paddle/platform/error.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include + +#ifndef PADDLE_ONLY_CPU + +#include +#include +#include +#include +#include + +#endif // PADDLE_ONLY_CPU + +namespace paddle { +namespace platform { + +#ifndef PADDLE_ONLY_CPU + +inline void throw_on_error(cudaError_t e, const char* message) { + if (e) { + throw thrust::system_error(e, thrust::cuda_category(), message); + } +} + +inline void throw_on_error(curandStatus_t stat, const char* message) { + if (stat != CURAND_STATUS_SUCCESS) { + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + message); + } +} + +inline void throw_on_error(cudnnStatus_t stat, const char* message) { + std::stringstream ss; + if (stat == CUDNN_STATUS_SUCCESS) { + return; + } else { + ss << cudnnGetErrorString(stat); + ss << ", " << message; + throw std::runtime_error(ss.str()); + } +} + +inline void throw_on_error(cublasStatus_t stat, const char* message) { + std::stringstream ss; + if (stat == CUBLAS_STATUS_SUCCESS) { + return; + } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) { + ss << "CUBLAS: not initialized"; + } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) { + ss << "CUBLAS: alloc failed"; + } else if (stat == CUBLAS_STATUS_INVALID_VALUE) { + ss << "CUBLAS: invalid value"; + } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) { + ss << "CUBLAS: arch mismatch"; + } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) { + ss << "CUBLAS: mapping error"; + } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) { + ss << "CUBLAS: execution failed"; + } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) { + ss << "CUBLAS: internal error"; + } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) { + ss << "CUBLAS: not supported"; + } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { + ss << "CUBLAS: license error"; + } + ss << ", " << message; + throw std::runtime_error(ss.str()); +} + +inline void throw_on_error(cublasStatus_t stat) { + const char* message = ""; + throw_on_error(stat, message); +} + +#endif // PADDLE_ONLY_CPU + +inline void throw_on_error(int stat, const char* message) { + if (stat) { + throw std::runtime_error(message + (", stat = " + 
std::to_string(stat)));
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/gpu_info.cc b/paddle/platform/gpu_info.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a1383d3524aedf834c329425419b989d47668bea
--- /dev/null
+++ b/paddle/platform/gpu_info.cc
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/gpu_info.h"
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
+              "Default use 95% of GPU memory for PaddlePaddle, "
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+int GetDeviceCount() {
+  int count;
+  throw_on_error(
+      cudaGetDeviceCount(&count),
+      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
+  return count;
+}
+
+int GetCurrentDeviceId() {
+  int device_id;
+  throw_on_error(
+      cudaGetDevice(&device_id),
+      "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId");
+  return device_id;
+}
+
+void SetDeviceId(int id) {
+  throw_on_error(cudaSetDevice(id),
+                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
+}
+
+void GpuMemoryUsage(size_t& available, size_t& total) {
+  throw_on_error(cudaMemGetInfo(&available, &total),
+                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
+}
+
+size_t GpuMaxAllocSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserve the rest for page tables, etc.
+  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuMinChunkSize() {
+  // The minimum chunk size that can be allocated is 256 bytes.
+  return 1 << 8;
+}
+
+size_t GpuMaxChunkSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserve the rest of the memory for page tables, etc.
+  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+
+  // If available is less than the minimum chunk size, no usable memory exists.
+  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+
+  // If available is less than the reserved amount, no usable memory exists.
+  size_t usable = std::max(available, reserving) - reserving;
+
+  return usable;
+}
+
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/cuda.h b/paddle/platform/gpu_info.h
similarity index 55%
rename from paddle/platform/cuda.h
rename to paddle/platform/gpu_info.h
index 5ed36c0f02549e29fc680eba097c9d851fa346e5..79e71956bd32e8c253ac4192a04e5903bed1c94a 100644
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/gpu_info.h
@@ -16,33 +16,31 @@ limitations under the License.
*/ #ifndef PADDLE_ONLY_CPU -#include -#include +#include namespace paddle { namespace platform { -inline void throw_on_error(cudaError_t e, const char* message) { - if (e) { - throw thrust::system_error(e, thrust::cuda_category(), message); - } -} - -int GetDeviceCount(void) { - int count; - throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); - return count; -} - -int GetCurrentDeviceId(void) { - int device_id; - throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed"); - return device_id; -} - -void SetDeviceId(int device_id) { - throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed"); -} +//! Get the total number of GPU devices in system. +int GetDeviceCount(); + +//! Get the current GPU device id in system. +int GetCurrentDeviceId(); + +//! Set the GPU device id for next execution. +void SetDeviceId(int device_id); + +//!Get the memory usage of current GPU device. +void GpuMemoryUsage(size_t& available, size_t& total); + +//! Get the maximum allocation size of current GPU device. +size_t GpuMaxAllocSize(); + +//! Get the minimum chunk size for GPU buddy allocator. +size_t GpuMinChunkSize(); + +//! Get the maximum chunk size for GPU buddy allocator. +size_t GpuMaxChunkSize(); } // namespace platform } // namespace paddle diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 0704820aa05079401eb56814d689d6e280311edb..b31515e1f028acac885a506ff1c20479407a05e3 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -1,3 +1,17 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + #include "paddle/platform/place.h" namespace paddle { @@ -7,7 +21,7 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: - PlacePrinter(std::ostream &os) : os_(os) {} + explicit PlacePrinter(std::ostream &os) : os_(os) {} void operator()(const CPUPlace &) { os_ << "CPUPlace"; } void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..00b14a94321990baef6de35df547eed04b3da04f --- /dev/null +++ b/paddle/pybind/CMakeLists.txt @@ -0,0 +1,2 @@ +cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python + add_op mul_op rowwise_add_op sigmoid_op softmax_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc9c6544c3cbf5a804b2d052f738bd483d6bf41b --- /dev/null +++ b/paddle/pybind/pybind.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace py = pybind11; +namespace pd = paddle::framework; + +USE_OP(add_two); +USE_OP(softmax); +USE_OP(mul); +USE_OP(rowwise_add); +USE_OP(sigmoid); + +PYBIND11_PLUGIN(core) { + py::module m("core", "C++ core of Paddle Paddle"); + + py::class_(m, "Tensor", py::buffer_protocol()) + .def_buffer([](pd::Tensor& self) -> py::buffer_info { + return paddle::pybind::CastToPyBuffer(self); + }) + .def("get_dims", + [](const pd::Tensor& self) { return pd::vectorize(self.dims()); }) + .def("set_dims", + [](pd::Tensor& self, const std::vector& dim) { + self.set_dims(pd::make_ddim(dim)); + }) + .def("alloc_float", + [](pd::Tensor& self) { + self.mutable_data(paddle::platform::CPUPlace()); + }) + .def("alloc_int", + [](pd::Tensor& self) { + self.mutable_data(paddle::platform::CPUPlace()); + }) + .def("set", paddle::pybind::PyTensorSetFromArray) + .def("set", paddle::pybind::PyTensorSetFromArray); + + py::class_(m, "Variable", R"DOC(Variable Class. + +All parameter, weight, gradient are variables in Paddle. +)DOC") + .def("is_int", [](const pd::Variable& var) { return var.IsType(); }) + .def("set_int", + [](pd::Variable& var, int val) -> void { + *var.GetMutable() = val; + }) + .def("get_int", + [](const pd::Variable& var) -> int { return var.Get(); }) + .def("get_tensor", + [](pd::Variable& self) -> pd::Tensor* { + return self.GetMutable(); + }, + py::return_value_policy::reference); + + py::class_>(m, "Scope") + .def(py::init&>()) + .def("get_var", + &pd::Scope::GetVariable, + py::return_value_policy::reference) + .def("create_var", + &pd::Scope::CreateVariable, + py::return_value_policy::reference); + + //! @note: Be careful! PyBind will return std::string as an unicode, not + //! Python str. If you want a str object, you should cast them in Python. + m.def("get_all_op_protos", []() -> std::vector { + auto& protos = pd::OpRegistry::protos(); + std::vector ret_values; + for (auto it = protos.begin(); it != protos.end(); ++it) { + PADDLE_ENFORCE(it->second.IsInitialized(), + "OpProto must all be initialized"); + ret_values.emplace_back(); + PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()), + "Serialize OpProto Error. 
This could be a bug of Paddle.");
+    }
+    return ret_values;
+  });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable names in Paddle")
+      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+
+  py::class_(m, "Operator")
+      .def("__str__", &pd::OperatorBase::DebugString)
+      .def_static("create", [](const std::string& protobin) {
+        pd::OpDesc desc;
+        PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                       "Cannot parse user input to OpDesc");
+        PADDLE_ENFORCE(desc.IsInitialized(),
+                       "User OpDesc is not initialized, reason %s",
+                       desc.InitializationErrorString());
+        return pd::OpRegistry::CreateOp(desc);
+      });
+
+  return m.ptr();
+}
diff --git a/paddle/pybind/tensor_bind.h b/paddle/pybind/tensor_bind.h
new file mode 100644
index 0000000000000000000000000000000000000000..b96516643ab55b9615ccafdc41d3290590987d95
--- /dev/null
+++ b/paddle/pybind/tensor_bind.h
@@ -0,0 +1,95 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <paddle/framework/tensor.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+
+namespace paddle {
+
+namespace pybind {
+
+namespace details {
+
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<false, I, ARGS...> {
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    PADDLE_THROW("This type of tensor cannot be exposed to Python");
+    return py::buffer_info();
+  }
+};
+
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<true, I, ARGS...> {
+  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
+                   "Only CPU tensor can cast to numpy array");
+
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+      auto dim_vec = framework::vectorize(tensor.dims());
+      std::vector<size_t> dims_outside;
+      std::vector<size_t> strides;
+      dims_outside.resize(dim_vec.size());
+      strides.resize(dim_vec.size());
+
+      size_t prod = 1;
+      for (size_t i = dim_vec.size(); i != 0; --i) {
+        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
+        strides[i - 1] = sizeof(CUR_TYPE) * prod;
+        prod *= dims_outside[i - 1];
+      }
+
+      return py::buffer_info(
+          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          sizeof(CUR_TYPE),
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(tensor.dims()),
+          dims_outside,
+          strides);
+    } else {
+      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
+      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
+    }
+  }
+};
+}  // namespace details
+inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+  auto buffer_info = details::CastToPyBufferImpl<true, 0, float, int>()(tensor);
+  return buffer_info;
+}
+
+template <typename T>
+void PyTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+
+  self.set_dims(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  std::memcpy(dst, array.data(), sizeof(T)
* array.size()); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index ab60f1a38dd4cd1d9799c0019dccae5f1c7d4310..3860facb099950a5287d3f6b89c3de38f588f568 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -155,7 +155,8 @@ RUN apt-get update &&\ paddle version ${DOCKERFILE_CUDNN_DSO} ${DOCKERFILE_GPU_ENV} - +ADD go/cmd/pserver/pserver /usr/bin/ +ADD go/cmd/master/master /usr/bin/ # default command shows the paddle version and exit CMD ["paddle", "version"] EOF diff --git a/paddle/scripts/docker/build_android.sh b/paddle/scripts/docker/build_android.sh index bfa10c91553563bddac8c1b41bf21490fb89d3cf..56d290be4ab04a9f6974023159aa8571d27f8dd5 100644 --- a/paddle/scripts/docker/build_android.sh +++ b/paddle/scripts/docker/build_android.sh @@ -2,9 +2,9 @@ set -xe -mkdir -p /paddle/build -cd /paddle/build -rm -f /paddle/install 2>/dev/null || true +mkdir -p /paddle/build_android +cd /paddle/build_android +rm -rf /paddle/install 2>/dev/null || true cmake -DCMAKE_SYSTEM_NAME=Android \ -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ -DANDROID_ABI=armeabi-v7a \ @@ -21,6 +21,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \ .. make -j `nproc` make install - -export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH -paddle version diff --git a/paddle/scripts/travis/build_android.sh b/paddle/scripts/travis/build_android.sh new file mode 100755 index 0000000000000000000000000000000000000000..004067a8f55351509caaf2bbf6d5c349a4698a79 --- /dev/null +++ b/paddle/scripts/travis/build_android.sh @@ -0,0 +1,30 @@ +#!/bin/bash +set -e + +ANDROID_STANDALONE_TOOLCHAIN=$HOME/android-toolchain-gcc +TMP_DIR=$HOME/$JOB/tmp +mkdir -p $TMP_DIR +cd $TMP_DIR +wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip +unzip -q android-ndk-r14b-linux-x86_64.zip +chmod +x $TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh +$TMP_DIR/android-ndk-r14b/build/tools/make-standalone-toolchain.sh --force --arch=arm --platform=android-21 --install-dir=$ANDROID_STANDALONE_TOOLCHAIN +cd $HOME +rm -rf $TMP_DIR + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build_android +cd $TRAVIS_BUILD_DIR/build_android + +# Compile paddle binaries +cmake -DCMAKE_SYSTEM_NAME=Android \ + -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \ + -DANDROID_ABI=armeabi-v7a \ + -DANDROID_ARM_NEON=ON \ + -DANDROID_ARM_MODE=ON \ + -DWITH_C_API=ON \ + -DWITH_SWIG_PY=OFF \ + -DWITH_STYLE_CHECK=OFF \ + .. + +make -j `nproc` diff --git a/paddle/scripts/travis/check_style.sh b/paddle/scripts/travis/check_style.sh index 4754bdd4c80de9700d92b0e33ecfdfc582f42813..ec499a839ac6593bac788f4cca5e33afbed73010 100755 --- a/paddle/scripts/travis/check_style.sh +++ b/paddle/scripts/travis/check_style.sh @@ -1,7 +1,7 @@ #!/bin/bash function abort(){ echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 - echo "Please use pre-commit to reformat your code and git push again." 1>&2 + echo "Please use pre-commit to check what is wrong." 1>&2 exit 1 } @@ -13,8 +13,14 @@ export PATH=/usr/bin:$PATH pre-commit install clang-format --version +# set up go environment for running gometalinter +mkdir -p $GOPATH/src/github.com/PaddlePaddle/ +ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle +cd $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd - + if ! 
pre-commit run -a ; then
-    git diff --exit-code
+    git diff
+    exit 1
fi

trap : 0
diff --git a/paddle/string/piece.h b/paddle/string/piece.h
index db7c3e69804a6a8f0510ba376432fe560ae74442..0272529d1c9b2cb6000a26f1d4d80276d06bf27b 100644
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -35,7 +35,7 @@ public:
   // We provide non-explicit singleton constructors so users can
   // pass in a "const char*" or a "string" wherever a "Piece"
-  // is expected. These contructors ensure that if data_ is NULL,
+  // is expected. These constructors ensure that if data_ is NULL,
   // size_ is 0.
   Piece();
   Piece(const char* d, size_t n);
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
index f25ce2f7f06f6da0feab27da61b8e49689cbe213..a830ceba5772846cd9255a3eeb26e8d6a17dcfbc 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -22,11 +22,23 @@ DECLARE_string(save_dir);
 namespace paddle {
 NewRemoteParameterUpdater::NewRemoteParameterUpdater(
     const OptimizationConfig &config, const std::string pserverSpec)
-    : parameterClient_(-1),
+    : trainerConfig_(config),
+      parameterClient_(-1),
       newParameters_(nullptr),
       newGradients_(nullptr),
       pserverSpec_(pserverSpec) {}
 
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config,
+    const std::string pserverSpec,
+    const bool useEtcd)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec),
+      useEtcd_(useEtcd) {}
+
 void NewRemoteParameterUpdater::init(
     const std::vector<ParameterPtr> &parameters) {
   ParameterUpdater::init(parameters);
@@ -37,8 +49,13 @@ void NewRemoteParameterUpdater::init(
   }
 
   // create parameter server client.
-  parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
-                                               FLAGS_trainer_id == 0);
+  if (useEtcd_) {
+    parameterClient_ = paddle_new_etcd_pserver_client(
+        (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0);
+  } else {
+    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                                 FLAGS_trainer_id == 0);
+  }
 
   // init new parameter and gradient.
newParameters_ = initNewParameter(PARAMETER_VALUE); @@ -51,7 +68,22 @@ void NewRemoteParameterUpdater::init( LOG(INFO) << "paddle_begin_init_params start"; for (int i = 0; i < parameterSize(); ++i) { auto paramConfig = parameters_[i]->getConfig(); - std::string bytes = paramConfig.SerializeAsString(); + LOG(INFO) << "old param config: " << paramConfig.DebugString(); + // FIXME(typhoonzero): convert old paramConfig to optimizerConfig + OptimizerConfig optimizeConfigV2; + auto sgdConfigV2 = optimizeConfigV2.mutable_sgd(); + sgdConfigV2->set_momentum(paramConfig.momentum()); + sgdConfigV2->set_decay(paramConfig.decay_rate()); + optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); + auto constlr = optimizeConfigV2.mutable_const_lr(); + constlr->set_learning_rate(paramConfig.learning_rate()); + if (trainerConfig_.algorithm() == "sgd") { + optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + // FIXME: config all algorithms + } else { + optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); + } + std::string bytes = optimizeConfigV2.SerializeAsString(); const char *array = bytes.data(); int size = (int)bytes.size(); paddle_init_param( @@ -83,4 +115,4 @@ void NewRemoteParameterUpdater::finishBatch(real cost) { void NewRemoteParameterUpdater::startPass() {} bool NewRemoteParameterUpdater::finishPass() { return true; } -} +} // namespace paddle diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h index f735185f62b3491a63e34cfc4a2ef73dae12243e..6223ba427c9b94494c2bee8f0847442f1b0574c9 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.h +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "OptimizerConfig.pb.h" #include "ParameterUpdater.h" #include "libpaddle_pserver_cclient.h" #include "paddle/pserver/ParameterClient2.h" @@ -31,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater { public: NewRemoteParameterUpdater(const OptimizationConfig& config, const std::string pserverSpec); + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec, + const bool useEtcd); ~NewRemoteParameterUpdater() { releaseNewParameter(newParameters_); releaseNewParameter(newGradients_); @@ -101,6 +105,7 @@ private: } protected: + const OptimizationConfig& trainerConfig_; /// internal parameter client object for exchanging data with pserver paddle_pserver_client parameterClient_; /// the parameters for new pserver client @@ -109,6 +114,8 @@ protected: paddle_parameter** newGradients_; /// the specification of parameter server "host1:port,host1:port" std::string pserverSpec_; + /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr + bool useEtcd_; }; } // namespace paddle diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp index 60ac8459a12db801321da4a9d9c1d48ac8bd6d16..133e2be104c6fbfddefd8698d2b6aa8315c56c70 100644 --- a/paddle/trainer/TrainerConfigHelper.cpp +++ b/paddle/trainer/TrainerConfigHelper.cpp @@ -62,11 +62,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config) m->conf = config; } -TrainerConfigHelper::~TrainerConfigHelper() { - if (m) { - delete m; - } -} +TrainerConfigHelper::~TrainerConfigHelper() { delete m; } const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; } diff --git a/paddle/utils/DynamicLoader.h b/paddle/utils/DynamicLoader.h index 
9b5ad21724afd7176f958619e7e10d12dc08fa49..2e5ff76a06152b6a12818f06baaeaa6a69726ba8 100644 --- a/paddle/utils/DynamicLoader.h +++ b/paddle/utils/DynamicLoader.h @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifndef DYNAMIC_LOAD_H_ -#define DYNAMIC_LOAD_H_ +#pragma once #include #include @@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle); * */ void GetLapackDsoHandle(void** dso_handle); - -#endif // DYNAMIC_LOAD_H_ diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h index b5e2862546212041a774599ec664b95e56224a07..0a27b8b97b83a9066af23039a317c437ea56777a 100644 --- a/paddle/utils/ThreadLocal.h +++ b/paddle/utils/ThreadLocal.h @@ -51,7 +51,7 @@ template class ThreadLocal { public: ThreadLocal() { - CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0); + CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0); } ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); } @@ -65,7 +65,7 @@ public: if (!p && createLocal) { p = new T(); int ret = pthread_setspecific(threadSpecificKey_, p); - CHECK(ret == 0); + CHECK_EQ(ret, 0); } return p; } @@ -79,7 +79,7 @@ public: if (T* q = get(false)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); } /** @@ -112,7 +112,7 @@ private: template class ThreadLocalD { public: - ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); } + ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); } ~ThreadLocalD() { pthread_key_delete(threadSpecificKey_); for (auto t : threadMap_) { @@ -127,7 +127,7 @@ public: T* p = (T*)pthread_getspecific(threadSpecificKey_); if (!p) { p = new T(); - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } return p; @@ -141,7 +141,7 @@ public: if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) { dataDestructor(q); } - CHECK(pthread_setspecific(threadSpecificKey_, p) == 0); + CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0); updateMap(p); } diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 13a1802ee3790b1255fc11f5b2053e3342c61914..0171f9d8ccd6045cb876d57684269a2a49e77f96 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -26,10 +26,17 @@ endif(WITH_GOLANG) configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) + +add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so + COMMAND cmake -E copy $ ${PROJ_ROOT}/python/paddle/v2/framework/core.so + DEPENDS paddle_pybind) +add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so) + + add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp - DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) + DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) add_custom_target(paddle_python ALL DEPENDS ${OUTPUT_DIR}/.timestamp) diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fe06dd812edec228c56000a306334904b6786237..6e2f21823405c0187a86c363ed47e8d9017e889f 100644 --- 
a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1575,7 +1575,13 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 
 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
         super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
@@ -1592,6 +1598,8 @@ class FCLayer(LayerBase):
             self.create_input_parameter(input_index, psize, dims, sparse, format)
         self.create_bias_parameter(bias, self.config.size)
+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
 
 
 @config_layer('selective_fc')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 6980a31679b5a34293c3e0304159fc97de1827ec..1f5b9e999c7403b08746920d65f9d482113d54b1 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -127,6 +127,7 @@ __all__ = [
     'dropout_layer',
     'prelu_layer',
     'switch_order_layer',
+    'gated_unit_layer',
 ]
 
@@ -5864,7 +5865,7 @@ def prelu_layer(input,
     :rtype: LayerOutput
     """
 
-    assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input'
+    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
     assert isinstance(param_attr, ParameterAttribute)
 
     l = Layer(
@@ -5880,6 +5881,99 @@ def prelu_layer(input,
         size=l.config.size)
 
 
+@wrap_name_default()
+@layer_support(ERROR_CLIPPING, DROPOUT)
+@wrap_act_default(act=LinearActivation())
+def gated_unit_layer(input,
+                     size,
+                     act=None,
+                     name=None,
+                     gate_attr=None,
+                     gate_param_attr=None,
+                     gate_bias_attr=True,
+                     inproj_attr=None,
+                     inproj_param_attr=None,
+                     inproj_bias_attr=True,
+                     layer_attr=None):
+    """
+    The gated unit layer implements a simple gating mechanism over the input.
+    The input :math:`X` is first projected into a new space :math:`X'`, and
+    it is also used to produce a gate weight :math:`\sigma`. The element-wise
+    product between :math:`X'` and :math:`\sigma` is finally returned.
+
+    Reference:
+        Language Modeling with Gated Convolutional Networks
+        https://arxiv.org/abs/1612.08083
+
+    .. math::
+       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
+
+    The example usage is:
+
+    .. code-block:: python
+
+        gated_unit = gated_unit_layer(size=128, input=input_layer)
+
+    :param input: input for this layer.
+    :type input: LayerOutput
+    :param size: output size of the gated unit.
+    :type size: int
+    :param act: activation type of the projected input.
+    :type act: BaseActivation
+    :param name: name of this layer.
+    :type name: basestring
+    :param gate_attr: Attributes to tune the gate output, for example, error
+        clipping threshold, dropout and so on. See ExtraLayerAttribute for
+        more details.
+    :type gate_attr: ExtraLayerAttribute|None
+    :param gate_param_attr: Attributes to tune the learnable projected matrix
+        parameter of the gate.
+    :type gate_param_attr: ParameterAttribute|None
+    :param gate_bias_attr: Attributes to tune the learnable bias of the gate.
+    :type gate_bias_attr: ParameterAttribute|None
+    :param inproj_attr: Attributes to tune the projected input, for
+        example, error clipping threshold, dropout and so on. See
+        ExtraLayerAttribute for more details.
+ :type inproj_attr: ExtraLayerAttribute|None + :param inproj_param_attr: Attributes to tune the learnable parameter of + the projection of input. + :type inproj_param_attr: ParameterAttribute|None + :param inproj_bias_attr: Attributes to tune the learnable bias of + projection of the input. + :type inproj_bias_attr: ParameterAttribute|None + :param layer_attr: Attributes to tune the final output of the gated unit, + for example, error clipping threshold, dropout and so on. See + ExtraLayerAttribute for more details. + :type layer_attr: ExtraLayerAttribute|None + :return: LayerOutput object. + :rtype: LayerOutput + """ + + assert isinstance( + input, LayerOutput), 'The gated linear unit accepts only one input.' + + input_proj = fc_layer( + input=input, + name="%s_input_proj" % name, + size=size, + act=act, + layer_attr=inproj_attr, + param_attr=inproj_param_attr, + bias_attr=inproj_bias_attr) + + gate = fc_layer( + size=size, + name="%s_gate" % name, + act=SigmoidActivation(), + input=input, + layer_attr=gate_attr, + param_attr=gate_param_attr, + bias_attr=gate_bias_attr) + return mixed_layer( + name="%s_gated_act" % name, + input=dotmul_operator(input_proj, gate), + layer_attr=layer_attr) + + @layer_support() @wrap_name_default('switch_order') def switch_order_layer(input, name=None, reshape=None, layer_attr=None): diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index f0b6625dc3736852980c889232d17853290863ac..810bea913ec79b2df0eb63ed5a4fd411549ff2e9 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -1408,6 +1408,8 @@ def outputs(layers, *args): :return: """ + traveled = set() + def __dfs_travel__(layer, predicate=lambda x: x.layer_type == LayerType.DATA): """ @@ -1419,6 +1421,11 @@ def outputs(layers, *args): :type layer: LayerOutput :return: """ + if layer in traveled: + return [] + else: + traveled.add(layer) + assert isinstance(layer, LayerOutput), "layer is %s" % (layer) retv = [] if layer.parents is not None: diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index a939c41ad01922e421f7bcd93851df7447a6799f..cdf9b2eab733adb173cf33cd6a93ef7b5abefc50 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -6,6 +6,7 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer -test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer) +test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer +test_recursive_topology test_gated_unit_layer) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr index b7d74f85ab4ca3f434dfa45516dfee7227b6ceee..96fb1d4ebde08b1bca2ffd09e8db0895842cbfd3 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr +++ 
b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr @@ -131,6 +131,7 @@ input_layer_names: "weight" input_layer_names: "multi_class_label" output_layer_names: "__cost_0__" output_layer_names: "__mse_cost_0__" +output_layer_names: "__nce_layer_0__" evaluators { name: "classification_error_evaluator" type: "classification_error" @@ -154,6 +155,7 @@ sub_models { input_layer_names: "multi_class_label" output_layer_names: "__cost_0__" output_layer_names: "__mse_cost_0__" + output_layer_names: "__nce_layer_0__" evaluator_names: "classification_error_evaluator" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..f1e4d894a5fb0040f48bdb5a751c3f0d956c23bb --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr @@ -0,0 +1,106 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 256 + active_type: "" +} +layers { + name: "__gated_unit_layer_0___input_proj" + type: "fc" + size: 512 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___input_proj.w0" + } + bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gate" + type: "fc" + size: 512 + active_type: "sigmoid" + inputs { + input_layer_name: "input" + input_parameter_name: "___gated_unit_layer_0___gate.w0" + } + bias_parameter_name: "___gated_unit_layer_0___gate.wbias" + error_clipping_threshold: 100.0 +} +layers { + name: "__gated_unit_layer_0___gated_act" + type: "mixed" + size: 512 + active_type: "" + inputs { + input_layer_name: "__gated_unit_layer_0___input_proj" + } + inputs { + input_layer_name: "__gated_unit_layer_0___gate" + } + error_clipping_threshold: 100.0 + operator_confs { + type: "dot_mul" + input_indices: 0 + input_indices: 1 + input_sizes: 512 + input_sizes: 512 + output_size: 512 + dotmul_scale: 1 + } +} +parameters { + name: "___gated_unit_layer_0___input_proj.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___input_proj.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.w0" + size: 131072 + initial_mean: 0.0 + initial_std: 0.0001 + dims: 256 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gated_unit_layer_0___gate.wbias" + size: 512 + initial_mean: 0.0 + initial_std: 1 + dims: 1 + dims: 512 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +output_layer_names: "__gated_unit_layer_0___gated_act" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__gated_unit_layer_0___input_proj" + layer_names: "__gated_unit_layer_0___gate" + layer_names: "__gated_unit_layer_0___gated_act" + input_layer_names: "input" + output_layer_names: "__gated_unit_layer_0___gated_act" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr new file 
mode 100644 index 0000000000000000000000000000000000000000..8133aa9c8d3e7c6843d1b27b70e87d394a1e0e47 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr @@ -0,0 +1,497 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__addto_0__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "data" + } + inputs { + input_layer_name: "data" + } +} +layers { + name: "__addto_1__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_0__" + } + inputs { + input_layer_name: "__addto_0__" + } +} +layers { + name: "__addto_2__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_1__" + } + inputs { + input_layer_name: "__addto_1__" + } +} +layers { + name: "__addto_3__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_2__" + } + inputs { + input_layer_name: "__addto_2__" + } +} +layers { + name: "__addto_4__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_3__" + } + inputs { + input_layer_name: "__addto_3__" + } +} +layers { + name: "__addto_5__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_4__" + } + inputs { + input_layer_name: "__addto_4__" + } +} +layers { + name: "__addto_6__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_5__" + } + inputs { + input_layer_name: "__addto_5__" + } +} +layers { + name: "__addto_7__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_6__" + } + inputs { + input_layer_name: "__addto_6__" + } +} +layers { + name: "__addto_8__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_7__" + } + inputs { + input_layer_name: "__addto_7__" + } +} +layers { + name: "__addto_9__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_8__" + } + inputs { + input_layer_name: "__addto_8__" + } +} +layers { + name: "__addto_10__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_9__" + } + inputs { + input_layer_name: "__addto_9__" + } +} +layers { + name: "__addto_11__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_10__" + } + inputs { + input_layer_name: "__addto_10__" + } +} +layers { + name: "__addto_12__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_11__" + } + inputs { + input_layer_name: "__addto_11__" + } +} +layers { + name: "__addto_13__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_12__" + } + inputs { + input_layer_name: "__addto_12__" + } +} +layers { + name: "__addto_14__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_13__" + } + inputs { + input_layer_name: "__addto_13__" + } +} +layers { + name: "__addto_15__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_14__" + } + inputs { + input_layer_name: "__addto_14__" + } +} +layers { + name: "__addto_16__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_15__" + } + inputs { + input_layer_name: "__addto_15__" + } +} +layers { + name: "__addto_17__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_16__" + } + inputs { + input_layer_name: "__addto_16__" + } +} +layers 
{ + name: "__addto_18__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_17__" + } + inputs { + input_layer_name: "__addto_17__" + } +} +layers { + name: "__addto_19__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_18__" + } + inputs { + input_layer_name: "__addto_18__" + } +} +layers { + name: "__addto_20__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_19__" + } + inputs { + input_layer_name: "__addto_19__" + } +} +layers { + name: "__addto_21__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_20__" + } + inputs { + input_layer_name: "__addto_20__" + } +} +layers { + name: "__addto_22__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_21__" + } + inputs { + input_layer_name: "__addto_21__" + } +} +layers { + name: "__addto_23__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_22__" + } + inputs { + input_layer_name: "__addto_22__" + } +} +layers { + name: "__addto_24__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_23__" + } + inputs { + input_layer_name: "__addto_23__" + } +} +layers { + name: "__addto_25__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_24__" + } + inputs { + input_layer_name: "__addto_24__" + } +} +layers { + name: "__addto_26__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_25__" + } + inputs { + input_layer_name: "__addto_25__" + } +} +layers { + name: "__addto_27__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_26__" + } + inputs { + input_layer_name: "__addto_26__" + } +} +layers { + name: "__addto_28__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_27__" + } + inputs { + input_layer_name: "__addto_27__" + } +} +layers { + name: "__addto_29__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_28__" + } + inputs { + input_layer_name: "__addto_28__" + } +} +layers { + name: "__addto_30__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_29__" + } + inputs { + input_layer_name: "__addto_29__" + } +} +layers { + name: "__addto_31__" + type: "addto" + size: 100 + active_type: "" + inputs { + input_layer_name: "__addto_30__" + } + inputs { + input_layer_name: "__addto_30__" + } +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 32 + active_type: "relu" + inputs { + input_layer_name: "__addto_31__" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__fc_layer_1__" + type: "fc" + size: 10 + active_type: "softmax" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___fc_layer_1__.w0" + } + bias_parameter_name: "___fc_layer_1__.wbias" +} +parameters { + name: "___fc_layer_0__.w0" + size: 3200 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 32 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 32 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 32 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_1__.w0" + size: 320 + initial_mean: 0.0 + initial_std: 0.176776695297 + dims: 32 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_1__.wbias" + size: 
10 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 10 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__fc_layer_1__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__addto_0__" + layer_names: "__addto_1__" + layer_names: "__addto_2__" + layer_names: "__addto_3__" + layer_names: "__addto_4__" + layer_names: "__addto_5__" + layer_names: "__addto_6__" + layer_names: "__addto_7__" + layer_names: "__addto_8__" + layer_names: "__addto_9__" + layer_names: "__addto_10__" + layer_names: "__addto_11__" + layer_names: "__addto_12__" + layer_names: "__addto_13__" + layer_names: "__addto_14__" + layer_names: "__addto_15__" + layer_names: "__addto_16__" + layer_names: "__addto_17__" + layer_names: "__addto_18__" + layer_names: "__addto_19__" + layer_names: "__addto_20__" + layer_names: "__addto_21__" + layer_names: "__addto_22__" + layer_names: "__addto_23__" + layer_names: "__addto_24__" + layer_names: "__addto_25__" + layer_names: "__addto_26__" + layer_names: "__addto_27__" + layer_names: "__addto_28__" + layer_names: "__addto_29__" + layer_names: "__addto_30__" + layer_names: "__addto_31__" + layer_names: "__fc_layer_0__" + layer_names: "__fc_layer_1__" + input_layer_names: "data" + output_layer_names: "__fc_layer_1__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py new file mode 100644 index 0000000000000000000000000000000000000000..9dab45519c65b0ca686558ec7fe2064bb9ad8824 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='input', size=256) +glu = gated_unit_layer( + size=512, + input=data, + act=TanhActivation(), + gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + gate_param_attr=ParamAttr(initial_std=1e-4), + gate_bias_attr=ParamAttr(initial_std=1), + inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0), + inproj_param_attr=ParamAttr(initial_std=1e-4), + inproj_bias_attr=ParamAttr(initial_std=1), + layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0)) + +outputs(glu) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py new file mode 100644 index 0000000000000000000000000000000000000000..1a693f8dff06dec6e71eeb488da9c807c35e4c9b --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py @@ -0,0 +1,16 @@ +from paddle.trainer_config_helpers import * + +settings(batch_size=1000, learning_rate=1e-5) + +din = data_layer(name='data', size=100) + +enc = din +for i in range(32): + enc = addto_layer([enc, enc]) + +pred = fc_layer( + input=fc_layer( + input=enc, size=32, act=ReluActivation()), + size=10, + act=SoftmaxActivation()) +outputs(pred) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 3ba5c31871807027e452df5d889b3b403e1c6414..3c75ca4c3abf1e94fc00b87f3af51d1cbf6dc430 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -20,7 +20,6 @@ import trainer import event import data_type import topology -import data_feeder import networks import evaluator from . 
import dataset @@ -31,7 +30,6 @@ import op import pooling import inference import networks -import py_paddle.swig_paddle as api import minibatch import plot import image @@ -47,7 +45,6 @@ __all__ = [ 'data_type', 'attr', 'pooling', - 'data_feeder', 'dataset', 'reader', 'topology', @@ -61,6 +58,7 @@ __all__ = [ def init(**kwargs): + import py_paddle.swig_paddle as api args = [] args_dict = {} # NOTE: append arguments if they are in ENV diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py index 2698251b9e15046eb14f71c3f5b0546ecbb4a5dd..98dfb85a0ea57050bf8dd8d46fca9574801d8eb3 100644 --- a/python/paddle/v2/data_feeder.py +++ b/python/paddle/v2/data_feeder.py @@ -11,7 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - from py_paddle import DataProviderConverter import collections import paddle.trainer.PyDataProvider2 as pydp2 diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 2e4beb6882789249db09705f3f4d6c5c19e492cd..90830515c1e8e6f5260cfca631e02a3a52cedbe5 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -26,8 +26,9 @@ import sentiment import wmt14 import mq2007 import flowers +import voc2012 __all__ = [ 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment' - 'uci_housing', 'wmt14', 'mq2007', 'flowers' + 'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc2012' ] diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 4a2eb59c340f5d0d3818170e56d730330e0bab29..645f3cc0dce70752c20569523e4bab440861f6a1 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -22,6 +22,8 @@ import importlib import paddle.v2.dataset import cPickle import glob +import cPickle as pickle +import random __all__ = [ 'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader', @@ -170,8 +172,6 @@ def convert(output_path, name_prefix, max_lines_to_shuffle=1000): import recordio - import cPickle as pickle - import random """ Convert data from reader to recordio format files. 
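
The hunk below drops pickle.HIGHEST_PROTOCOL when serializing each record. A minimal sketch of the round-robin sharding that convert() performs, assuming plain writer objects with a write(bytes) method stand in for the recordio writers returned by open_writers():

    import cPickle as pickle
    import random

    def write_data(writers, lines):
        # Shuffle the buffered window, then spread the records across
        # the shards round-robin; each record is pickled individually.
        random.shuffle(lines)
        for i, d in enumerate(lines):
            o = pickle.dumps(d)  # default protocol, see the FIXME below
            writers[i % len(writers)].write(o)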
@@ -201,8 +201,10 @@ def convert(output_path,
     def write_data(w, lines):
         random.shuffle(lines)
         for i, d in enumerate(lines):
-            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
-            w[i % num_shards].write(d)
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            o = pickle.dumps(d)
+            w[i % num_shards].write(o)
 
     w = open_writers()
     lines = []
diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py
index fd71b341662ca6f540ce44a86348e782561a97d7..cffb319ad8f56ccddba3fef63e1b6ec68e5bac1e 100644
--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"):
         for j in range(i + 1, len(querylist)):
             query_right = querylist[j]
             if query_left.relevance_score > query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                 docpairs.append([
                     np.array(query_left.feature_vector),
                     np.array(query_right.feature_vector)
                 ])
             elif query_left.relevance_score < query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                 docpairs.append([
                     np.array(query_right.feature_vector),
                     np.array(query_left.feature_vector)
                 ])
     for label, pair in zip(labels, docpairs):
-        yield label, pair[0], pair[1]
+        yield np.array(label), pair[0], pair[1]
 
 
 def gen_list(querylist):
diff --git a/python/paddle/v2/dataset/tests/voc2012_test.py b/python/paddle/v2/dataset/tests/voc2012_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..31e72ebf5eac0508d12783f9ceaa6eef0fa6d353
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.voc2012
+import unittest
+
+
+class TestVOC(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            sum += 1
+        return sum
+
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/voc2012.py b/python/paddle/v2/dataset/voc2012.py
new file mode 100644
index 0000000000000000000000000000000000000000..617e212d67fbe37f9d9663e9c83c62045411fa77
--- /dev/null
+++ b/python/paddle/v2/dataset/voc2012.py
@@ -0,0 +1,85 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image dataset for segmentation. +The 2012 dataset contains images from 2008-2011 for which additional +segmentations have been prepared. As in previous years the assignment +to training/test sets has been maintained. The total number of images +with segmentation has been increased from 7,062 to 9,993. +""" + +import tarfile +import io +import numpy as np +from paddle.v2.dataset.common import download +from paddle.v2.image import * +from PIL import Image + +__all__ = ['train', 'test', 'val'] + +VOC_URL = 'http://host.robots.ox.ac.uk/pascal/VOC/voc2012/\ +VOCtrainval_11-May-2012.tar' + +VOC_MD5 = '6cd6e144f989b92b3379bac3b3de84fd' +SET_FILE = 'VOCdevkit/VOC2012/ImageSets/Segmentation/{}.txt' +DATA_FILE = 'VOCdevkit/VOC2012/JPEGImages/{}.jpg' +LABEL_FILE = 'VOCdevkit/VOC2012/SegmentationClass/{}.png' + +CACHE_DIR = 'voc2012' + + +def reader_creator(filename, sub_name): + + tarobject = tarfile.open(filename) + name2mem = {} + for ele in tarobject.getmembers(): + name2mem[ele.name] = ele + + def reader(): + set_file = SET_FILE.format(sub_name) + sets = tarobject.extractfile(name2mem[set_file]) + for line in sets: + line = line.strip() + data_file = DATA_FILE.format(line) + label_file = LABEL_FILE.format(line) + data = tarobject.extractfile(name2mem[data_file]).read() + label = tarobject.extractfile(name2mem[label_file]).read() + data = Image.open(io.BytesIO(data)) + label = Image.open(io.BytesIO(label)) + data = np.array(data) + label = np.array(label) + yield data, label + + return reader + + +def train(): + """ + Create a train dataset reader containing 2913 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') + + +def test(): + """ + Create a test dataset reader containing 1464 images in HWC order. + """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') + + +def val(): + """ + Create a val dataset reader containing 1449 images in HWC order. 
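+
+    The example usage is (a minimal sketch; the VOC2012 tarball is
+    downloaded on first use, and each sample is an (image, label) pair of
+    numpy arrays in HWC order):
+
+    .. code-block:: python
+
+        reader = paddle.v2.dataset.voc2012.val()
+        for image, label in reader():
+            print image.shape, label.shape
+            break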
+ """ + return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'val') diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py index fd6050fa339d280ad54e40128ea6bae25132c873..7589cc9917f26375d595e200245d5ba099bc38d7 100644 --- a/python/paddle/v2/event.py +++ b/python/paddle/v2/event.py @@ -9,8 +9,6 @@ There are: * BeginPass * EndPass """ -import py_paddle.swig_paddle as api - __all__ = [ 'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult' ] @@ -18,6 +16,7 @@ __all__ = [ class WithMetric(object): def __init__(self, evaluator): + import py_paddle.swig_paddle as api if not isinstance(evaluator, api.Evaluator): raise TypeError("Evaluator should be api.Evaluator type") self.__evaluator__ = evaluator diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py new file mode 100644 index 0000000000000000000000000000000000000000..c2a7ae7692b08762ffbc91726be7bfa90e8ddedb --- /dev/null +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -0,0 +1,246 @@ +import paddle.v2.framework.core as core +import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2 +import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2 +import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2 +import cStringIO + + +def get_all_op_protos(): + """ + Get all registered op proto from Paddle C++ + :return: list of OpProto + """ + protostrs = core.get_all_op_protos() + ret_values = [] + for pbstr in protostrs: + op_proto = op_proto_pb2.OpProto.FromString(str(pbstr)) + ret_values.append(op_proto) + return ret_values + + +class OpDescCreationMethod(object): + """ + A Functor object to convert user input(use key word args) to OpDesc based on + OpProto. + + :param op_proto: The OpProto object. + :type op_proto: op_proto_pb2.OpProto + """ + + def __init__(self, op_proto): + if not isinstance(op_proto, op_proto_pb2.OpProto): + raise TypeError("Argument should be OpProto") + self.__op_proto__ = op_proto + + def __call__(self, *args, **kwargs): + """ + Convert user input to OpDesc. Only key-word args are supported. 
+        :return: an OpDesc based on the user input
+        :rtype: op_desc_pb2.OpDesc
+        """
+        if len(args) != 0:
+            raise ValueError("Only keyword arguments are supported by Paddle")
+        op_desc = op_desc_pb2.OpDesc()
+
+        # Inputs
+        ipts, ipt_format, _ = OpDescCreationMethod.extract_input_or_output(
+            "input", kwargs, self.__op_proto__.inputs)
+        op_desc.inputs.extend(ipts)
+        if ipt_format is not None:
+            op_desc.attrs.extend([ipt_format])
+
+        # Outputs
+        outs, out_format, tmp_index = OpDescCreationMethod.extract_input_or_output(
+            "output", kwargs, self.__op_proto__.outputs)
+        op_desc.outputs.extend(outs)
+        if out_format is not None:
+            op_desc.attrs.extend([out_format])
+        if len(tmp_index) != 0:
+            tmp_index_attr = op_desc.attrs.add()
+            tmp_index_attr.type = attr_type_pb2.INTS
+            tmp_index_attr.name = "temporary_index"
+            tmp_index_attr.ints.extend(tmp_index)
+
+        # Types
+        op_desc.type = self.__op_proto__.type
+
+        # Attrs
+        for attr in self.__op_proto__.attrs:
+            if attr.generated:
+                continue
+            user_defined_attr = kwargs.get(attr.name, None)
+            if user_defined_attr is not None:
+                new_attr = op_desc.attrs.add()
+                new_attr.name = attr.name
+                new_attr.type = attr.type
+                if attr.type == attr_type_pb2.INT:
+                    new_attr.i = user_defined_attr
+                elif attr.type == attr_type_pb2.FLOAT:
+                    new_attr.f = user_defined_attr
+                elif attr.type == attr_type_pb2.STRING:
+                    new_attr.s = user_defined_attr
+                elif attr.type == attr_type_pb2.INTS:
+                    new_attr.ints.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.FLOATS:
+                    new_attr.floats.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.STRINGS:
+                    new_attr.strings.extend(user_defined_attr)
+                else:
+                    raise NotImplementedError("Unsupported attribute type " +
+                                              str(attr.type))
+
+        return op_desc
+
+    @staticmethod
+    def extract_input_or_output(in_out, kwargs, meta):
+        """
+        Extract input or output variable names from the keyword arguments,
+        based on a list of VarProto messages.
+
+        :param in_out: "input" or "output"
+        :param kwargs: keyword arguments that the user passed in.
+        :param meta: a list of VarProto
+        :return: Three objects are returned: the variable names; the
+            input_format or output_format attribute (None if the input or
+            output is not multiple); and the temporary variable index list.
+        """
+        multiple = OpDescCreationMethod.any_is_true((m.multiple for m in meta))
+        tmp_index = []
+        retv = []
+        if multiple:
+            var_format = op_desc_pb2.AttrDesc()
+            var_format.type = attr_type_pb2.INTS
+            var_format.name = "%s_format" % in_out
+            var_format.ints.append(0)
+
+            for var in meta:
+                var_name = var.name
+
+                if var.temporary:
+                    var_name = [core.var_names.temp()]
+                    tmp_index.append(len(retv))
+                else:
+                    var_name = kwargs.get(var_name, [])
+                if not isinstance(var_name, list):
+                    var_name = [var_name]
+                retv.extend(var_name)
+                var_format.ints.append(len(var_name) + var_format.ints[-1])
+            return retv, var_format, tmp_index
+        else:
+            for var in meta:
+                if var.temporary:
+                    retv.append(kwargs.get(var.name, core.var_names.temp()))
+                    tmp_index.append(len(retv) - 1)
+                else:
+                    retv.append(kwargs.get(var.name, core.var_names.empty()))
+            return retv, None, tmp_index
+
+    @staticmethod
+    def any_is_true(generator):
+        """
+        Reduce a bool generator to a single bool: if any element is True,
+        return True.
+        """
+        for flag in generator:
+            if flag:
+                return True
+        return False
+
+
+def get_docstring_from_op_proto(op_proto):
+    """
+    Generate a docstring from an OpProto.
+    :param op_proto: an OpProto instance.
+def get_docstring_from_op_proto(op_proto):
+    """
+    Generate a docstring from an OpProto.
+    :param op_proto: an OpProto instance.
+    :type op_proto: op_proto_pb2.OpProto
+    :return: docstring
+    """
+    if not isinstance(op_proto, op_proto_pb2.OpProto):
+        raise TypeError("Input must be OpProto")
+    f = cStringIO.StringIO()
+    f.write(op_proto.comment)
+    f.write("\n")
+
+    def __append_param__(name, comment, type):
+        # It might be better to replace these lines with a template engine.
+        f.write(":param ")
+        f.write(name)
+        f.write(": ")
+        f.write(comment)
+        f.write("\n")
+        f.write(":type ")
+        f.write(name)
+        f.write(": ")
+        f.write(type)
+        f.write("\n")
+
+    for ipt in op_proto.inputs:
+        __append_param__(ipt.name, ipt.comment, "list | basestr"
+                         if ipt.multiple else "basestr")
+
+    temp_var_prefix = \
+        "This is a temporary variable. It does not have to be set by the user. "
+    for opt in op_proto.outputs:
+        __append_param__(opt.name, opt.comment if not opt.temporary else
+                         temp_var_prefix + opt.comment, "list | basestr"
+                         if opt.multiple else "basestr")
+
+    for attr in op_proto.attrs:
+        attr_type = None
+        if attr.type == attr_type_pb2.INT:
+            attr_type = "int"
+        elif attr.type == attr_type_pb2.FLOAT:
+            attr_type = "float"
+        elif attr.type == attr_type_pb2.STRING:
+            attr_type = "basestr"
+        elif attr.type == attr_type_pb2.INTS:
+            attr_type = "list of int"
+        elif attr.type == attr_type_pb2.FLOATS:
+            attr_type = "list of float"
+        elif attr.type == attr_type_pb2.STRINGS:
+            attr_type = "list of basestr"
+
+        if attr_type is None:
+            raise RuntimeError("Unsupported attribute type " + str(attr.type))
+
+        __append_param__(attr.name, attr.comment, attr_type)
+
+    return f.getvalue()
+
+
+def create_op_creation_method(op_proto):
+    """
+    Generate an op creation method for an OpProto.
+    """
+    method = OpDescCreationMethod(op_proto)
+
+    def __impl__(*args, **kwargs):
+        opdesc = method(*args, **kwargs)
+        return core.Operator.create(opdesc.SerializeToString())
+
+    __impl__.__doc__ = get_docstring_from_op_proto(op_proto)
+    return __impl__
+
+
+class OpCreationsHolder(object):
+    """
+    An object that holds all op creation methods.
+
+    Use `op_creations.xxx_op` to access them.
+    """
+    pass
+
+
+op_creations = OpCreationsHolder()
+
+
+def __bootstrap__():
+    """
+    Bootstrap function for this module. It dynamically creates all op
+    creation methods at runtime.
+    """
+    for op_proto in get_all_op_protos():
+        func = create_op_creation_method(op_proto)
+        func.__name__ = str(op_proto.type)
+        setattr(op_creations, func.__name__, func)
+
+
+__bootstrap__()
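After __bootstrap__() runs at import time, every registered operator is reachable as an attribute of op_creations, carrying the generated docstring. A minimal usage sketch, assuming the C++ core registers an add_two op (as the tests below expect):

import paddle.v2.framework.create_op_creation_methods as creation

add_op = creation.op_creations.add_two(X="a", Y="b", Out="out")
print str(add_op)  # the C++ DebugString() of the created operator
print creation.op_creations.add_two.__doc__  # generated from the OpProto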
diff --git a/python/paddle/v2/framework/default_scope_funcs.py b/python/paddle/v2/framework/default_scope_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e772326c94b7ee44906c71f13e9420e078a1917
--- /dev/null
+++ b/python/paddle/v2/framework/default_scope_funcs.py
@@ -0,0 +1,83 @@
+"""
+Default scope functions.
+
+`Paddle` manages Scope like a programming language's scope: it is just a
+thread-local stack of Scopes. The top of the stack is the current scope,
+and the bottom of the stack is the parent of all scopes.
+
+Invoking `create_var/get_var` creates/gets a variable in the current scope.
+Invoking `enter_local_scope/leave_local_scope` creates or destroys a local
+scope.
+
+`scoped_function` takes a function as input and invokes it in a new local
+scope.
+"""
+
+import paddle.v2.framework.core
+import threading
+
+__tl_scope__ = threading.local()
+
+__all__ = [
+    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'create_var',
+    'get_var', 'scoped_function'
+]
+
+
+def get_cur_scope():
+    """
+    Get the current scope.
+    :rtype: paddle.v2.framework.core.Scope
+    """
+    cur_scope_stack = getattr(__tl_scope__, 'cur_scope', None)
+    if cur_scope_stack is None:
+        __tl_scope__.cur_scope = list()
+    if len(__tl_scope__.cur_scope) == 0:
+        __tl_scope__.cur_scope.append(paddle.v2.framework.core.Scope(None))
+    return __tl_scope__.cur_scope[-1]
+
+
+def enter_local_scope():
+    """
+    Enter a new local scope.
+    """
+    cur_scope = get_cur_scope()
+    new_scope = paddle.v2.framework.core.Scope(cur_scope)
+    __tl_scope__.cur_scope.append(new_scope)
+
+
+def leave_local_scope():
+    """
+    Leave the current local scope.
+    """
+    __tl_scope__.cur_scope.pop()
+
+
+def create_var(name):
+    """
+    Create a variable in the current scope.
+    """
+    return get_cur_scope().create_var(name)
+
+
+def get_var(name):
+    """
+    Get a variable in the current scope.
+    """
+    return get_cur_scope().get_var(name)
+
+
+def scoped_function(func):
+    """
+    Invoke `func` in a new local scope.
+
+    :param func: a callable that will be run in the new scope.
+    :type func: callable
+    """
+    enter_local_scope()
+    try:
+        func()
+    finally:
+        # leave_local_scope() runs even when func() raises, keeping the
+        # thread-local scope stack balanced.
+        leave_local_scope()
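A short usage sketch of the scope API above (the variable name and value are arbitrary):

from paddle.v2.framework.default_scope_funcs import *

def __work__():
    # runs inside a fresh local scope pushed by scoped_function
    v = create_var("tmp_var")
    v.set_int(42)
    assert get_var("tmp_var") is not None

scoped_function(__work__)
# the local scope has been popped here; outer scopes are untouched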
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 8cb0c5c3765a00b45177117925e320e61a1b609a..4ce2bef6fcc4b8ddf5a6de3809a1891bce590aab 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -1 +1,3 @@
-add_python_test(test_framework test_protobuf.py)
+add_python_test(test_framework test_protobuf.py test_scope.py
+    test_default_scope_funcs.py test_op_creation_methods.py
+    test_tensor.py)
diff --git a/python/paddle/v2/framework/tests/test_default_scope_funcs.py b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
new file mode 100644
index 0000000000000000000000000000000000000000..81033deb1546c81e2566ec5474f45dc56781644a
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
@@ -0,0 +1,33 @@
+from paddle.v2.framework.default_scope_funcs import *
+import unittest
+
+
+class TestDefaultScopeFuncs(unittest.TestCase):
+    def test_cur_scope(self):
+        self.assertIsNotNone(get_cur_scope())
+
+    def test_none_variable(self):
+        self.assertIsNone(get_var("test"))
+
+    def test_create_var_get_var(self):
+        var_a = create_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(get_cur_scope().get_var('var_a'))
+        enter_local_scope()
+        self.assertIsNotNone(get_cur_scope().get_var('var_a'))
+        leave_local_scope()
+
+    def test_var_get_int(self):
+        def __new_scope__():
+            i = create_var("var_i")
+            self.assertFalse(i.is_int())
+            i.set_int(10)
+            self.assertTrue(i.is_int())
+            self.assertEqual(10, i.get_int())
+
+        for _ in xrange(10):
+            scoped_function(__new_scope__)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_op_creation_methods.py b/python/paddle/v2/framework/tests/test_op_creation_methods.py
new file mode 100644
index 0000000000000000000000000000000000000000..41db7c0d535aa920b34d6cc346090a8c15bfb110
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
@@ -0,0 +1,254 @@
+import unittest
+import paddle.v2.framework.create_op_creation_methods as creation
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+
+
+class TestGetAllProtos(unittest.TestCase):
+    def test_all(self):
+        all_protos = creation.get_all_op_protos()
+        self.assertNotEqual(0, len(all_protos))
+
+        for each in all_protos:
+            self.assertTrue(each.IsInitialized())
+
+
+class TestOpDescCreationMethod(unittest.TestCase):
+    def test_plain_input_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = "does not matter"
+
+        ipt = op.inputs.add()
+        ipt.name = "Y"
+        ipt.comment = "does not matter"
+
+        opt = op.outputs.add()
+        opt.name = "Z"
+        opt.comment = "does not matter"
+
+        op.comment = "does not matter"
+
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+        output = method(X="a", Y="b", Z="c")
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(["a", "b"])
+        expected.outputs.append("c")
+        self.assertEqual(expected, output)
+
+    def test_multiple_input_plain_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "fc"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.multiple = True
+
+        ipt = op.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+
+        out = op.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = op_desc_pb2.OpDesc()
+        expected1.inputs.extend(['x', 'w', 'b'])
+        expected1.outputs.extend(['y'])
+        expected1.type = 'fc'
+        attr = expected1.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3])
+        self.assertEqual(expected1, generated1)
+
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = op_desc_pb2.OpDesc()
+        expected2.inputs.extend(['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'])
+        expected2.outputs.extend(['y'])
+        expected2.type = 'fc'
+        attr = expected2.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 3, 6, 7])
+        self.assertEqual(expected2, generated2)
+
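The input_format attribute asserted above stores, for each declared input slot, the offset of its first variable name in the flattened name list, ending with the total count. A standalone sketch of that bookkeeping for the second case (X, W and b are the slots from the fc test):

# offsets start at 0; each slot appends (previous offset + number of names)
names = [['x1', 'x2', 'x3'], ['w1', 'w2', 'w3'], ['b']]
offsets = [0]
for slot in names:
    offsets.append(offsets[-1] + len(slot))
assert offsets == [0, 3, 6, 7]  # matches the expected attribute above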
+    def test_attrs(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+
+        def __add_attr__(name, type):
+            attr = op.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+
+        __add_attr__("int_attr", attr_type_pb2.INT)
+        __add_attr__("float_attr", attr_type_pb2.FLOAT)
+        __add_attr__("string_attr", attr_type_pb2.STRING)
+        __add_attr__("ints_attr", attr_type_pb2.INTS)
+        __add_attr__("floats_attr", attr_type_pb2.FLOATS)
+        __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+
+        method = creation.OpDescCreationMethod(op)
+
+        generated = method(
+            X="a",
+            int_attr=10,
+            float_attr=3.2,
+            string_attr="test_str",
+            ints_attr=[0, 1, 2, 3, 4],
+            floats_attr=[0.2, 3.2, 4.5],
+            strings_attr=["a", "b", "c"])
+
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(['a'])
+        attr = expected.attrs.add()
+        attr.name = "int_attr"
+        attr.type = attr_type_pb2.INT
+        attr.i = 10
+
+        attr = expected.attrs.add()
+        attr.name = "float_attr"
+        attr.type = attr_type_pb2.FLOAT
+        attr.f = 3.2
+
+        attr = expected.attrs.add()
+        attr.name = "string_attr"
+        attr.type = attr_type_pb2.STRING
+        attr.s = "test_str"
+
+        attr = expected.attrs.add()
+        attr.name = "ints_attr"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3, 4])
+
+        attr = expected.attrs.add()
+        attr.name = 'floats_attr'
+        attr.type = attr_type_pb2.FLOATS
+        attr.floats.extend([0.2, 3.2, 4.5])
+
+        attr = expected.attrs.add()
+        attr.name = 'strings_attr'
+        attr.type = attr_type_pb2.STRINGS
+        attr.strings.extend(['a', 'b', 'c'])
+
+        self.assertEqual(expected, generated)
+
+    def test_input_temporary_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        out = op.outputs.add()
+        out.name = "OUT"
+        out.comment = ""
+
+        out = op.outputs.add()
+        out.name = "TMP"
+        out.comment = ""
+        out.temporary = True
+
+        out = op.outputs.add()
+        out.name = "OUT2"
+        out.comment = ""
+        op.comment = ""
+
+        method = creation.OpDescCreationMethod(op)
+        generated = method(OUT="a", OUT2="b")
+        desc = op_desc_pb2.OpDesc()
+        desc.outputs.extend(["a", core.var_names.temp(), "b"])
+        desc.type = "test"
+        attr = desc.attrs.add()
+        attr.name = "temporary_index"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.append(2)
+        self.assertEqual(generated, desc)
+
+
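For reference, the docstring generator can also be exercised directly on whatever protos the core registers; a minimal sketch:

import paddle.v2.framework.create_op_creation_methods as creation

for proto in creation.get_all_op_protos():
    # prints the op comment followed by :param/:type entries,
    # in the format asserted by the test below
    print creation.get_docstring_from_op_proto(proto)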
+class TestOpCreationDocStr(unittest.TestCase):
+    def test_all(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        op.comment = """Test Op.
+
+This op is used for unit test, not a real op.
+"""
+        a = op.inputs.add()
+        a.name = "a"
+        a.comment = "Input a for test op"
+        a.multiple = True
+
+        b = op.inputs.add()
+        b.name = "b"
+        b.comment = "Input b for test op"
+        self.assertTrue(op.IsInitialized())
+
+        o1 = op.outputs.add()
+        o1.name = "output"
+        o1.comment = "The output of test op"
+
+        o2 = op.outputs.add()
+        o2.name = "temp output"
+        o2.comment = "The temporary output of test op"
+        o2.temporary = True
+
+        test_str = op.attrs.add()
+        test_str.name = "str_attr"
+        test_str.type = attr_type_pb2.STRING
+        test_str.comment = "A string attribute for test op"
+
+        actual = creation.get_docstring_from_op_proto(op)
+        expected_docstring = '''Test Op.
+
+This op is used for unit test, not a real op.
+
+:param a: Input a for test op
+:type a: list | basestr
+:param b: Input b for test op
+:type b: basestr
+:param output: The output of test op
+:type output: basestr
+:param temp output: This is a temporary variable. It does not have to be set by the user. The temporary output of test op
+:type temp output: basestr
+:param str_attr: A string attribute for test op
+:type str_attr: basestr
+'''
+        self.assertEqual(expected_docstring, actual)
+
+
+class TestOpCreations(unittest.TestCase):
+    def test_all(self):
+        add_op = creation.op_creations.add_two(X="a", Y="b", Out="z")
+        self.assertIsNotNone(add_op)
+        # Invoke C++ DebugString()
+        self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).',
+                         str(add_op))
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py
index f0e60191991e2f24b0a1972afb0f6cbd3aaa4008..b8702477e64203e735bff05b115eafbb2a52172d 100644
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
@@ -24,3 +24,7 @@ class TestFrameworkProto(unittest.TestCase):
             attr.type = attr_type_lib.FLOAT
         op_proto.type = "cos"
         self.assertTrue(op_proto.IsInitialized())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0ee45cfc75e486c693a00d92a97ac0970195581
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -0,0 +1,37 @@
+import paddle.v2.framework.core
+import unittest
+
+
+class TestScope(unittest.TestCase):
+    def test_create_destroy(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNotNone(scope)
+        scope_with_parent = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope_with_parent)
+
+    def test_none_variable(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNone(scope.get_var("test"))
+
+    def test_create_var_get_var(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var_a = scope.create_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(scope.get_var('var_a'))
+        scope2 = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope2.get_var('var_a'))
+
+    def test_var_get_int(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var = scope.create_var("test_int")
+        var.set_int(10)
+        self.assertTrue(var.is_int())
+        self.assertEqual(10, var.get_int())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_tensor.py b/python/paddle/v2/framework/tests/test_tensor.py
new file mode 100644
index 0000000000000000000000000000000000000000..b72aff3b9cd16595c7e81856642196b2bb61a790
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_tensor.py
@@ -0,0 +1,45 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy
+
+
+class TestTensor(unittest.TestCase):
+    def test_int_tensor(self):
+        scope = core.Scope(None)
+        var = scope.create_var("test_tensor")
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_int()
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1
+        tensor_array[19, 11] = 2
+        tensor.set(tensor_array)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertEqual(1.0, tensor_array_2[3, 9])
+        self.assertEqual(2.0, tensor_array_2[19, 11])
+
+    def test_float_tensor(self):
+        scope = core.Scope(None)
+        var = scope.create_var("test_tensor")
+        tensor = var.get_tensor()
+
+        tensor.set_dims([1000, 784])
+        tensor.alloc_float()
+
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1.0
+        tensor_array[19, 11] = 2.0
+        tensor.set(tensor_array)
+
+        tensor_array_2 = numpy.array(tensor)
+        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
+        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 34b7308601390a4ccb0c19ef10d2c7a60b3fa576..40134a3270c3579fd2f6a891af66ff241050f60c 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -1,9 +1,7 @@
 import numpy
-import py_paddle.swig_paddle as api
 import collections
 import topology
 import minibatch
-from data_feeder import DataFeeder
 
 __all__ = ['infer', 'Inference']
 
@@ -28,6 +26,7 @@ class Inference(object):
     """
 
     def __init__(self, output_layer, parameters):
+        import py_paddle.swig_paddle as api
         topo = topology.Topology(output_layer)
         gm = api.GradientMachine.createFromConfigProto(
             topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
@@ -40,6 +39,7 @@ class Inference(object):
         self.__data_types__ = topo.data_type()
 
     def iter_infer(self, input, feeding=None):
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         batch_size = len(input)
 
diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py
index 70f9e43c9683033233d48a750668771a4c7ba045..4c041fb509903008a7a5648a112b2472ed856aea 100644
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -10,8 +10,9 @@ class client(object):
     """
     client is a client to the master server.
    """
 
-    def __init__(self, addr, buf_size):
-        self.c = lib.paddle_new_master_client(addr, buf_size)
+    def __init__(self, etcd_endpoints, timeout, buf_size):
+        self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout,
+                                                   buf_size)
 
     def close(self):
         lib.paddle_release_master_client(self.c)
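With the constructor change above, creating a master client now takes the etcd endpoints and a timeout instead of a single master address. A hedged sketch; the endpoint string, timeout value, and buffer size are illustrative, not prescribed by this patch:

from paddle.v2.master.client import client

# etcd endpoint(s) of the training cluster, a timeout, and the
# same record-buffer size as before
c = client("http://127.0.0.1:2379", 5, 64)
c.close()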
""" + import py_paddle.swig_paddle as swig_api tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__) assert isinstance(tmp, swig_api.ParameterOptimizer) return tmp.getParameterTypes() def __create_local_updater__(self): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__) def __create_remote_updater__(self, pass_num, use_sparse_updater): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createRemoteUpdater( self.__opt_conf__, pass_num, use_sparse_updater) - def __create_new_remote_updater__(self, pserver_spec): + def __create_new_remote_updater__(self, pserver_spec, use_etcd): + import py_paddle.swig_paddle as swig_api return swig_api.ParameterUpdater.createNewRemoteUpdater( - self.__opt_conf__, pserver_spec) + self.__opt_conf__, pserver_spec, use_etcd) def create_updater(self, is_local, num_passes, use_sparse_updater, - pserver_spec): + pserver_spec, use_etcd): """ create proper parameter_updater by configuration. :param is_local: create local or remote parameter updater @@ -66,6 +69,8 @@ class Optimizer(object): if use_sparse_remote_updater: gradient_machine.prefetch(in_args) parameter_updater.getParametersRemote() + + :param pserver_spec: pserver location, eg: localhost:3000 :return: parameter_updater """ if is_local: @@ -76,7 +81,7 @@ class Optimizer(object): num_passes, use_sparse_updater) else: parameter_updater = self.__create_new_remote_updater__( - pserver_spec) + pserver_spec, use_etcd) return parameter_updater @@ -266,6 +271,7 @@ ModelAverage = v1_optimizers.ModelAverage L2Regularization = v1_optimizers.L2Regularization if __name__ == '__main__': + import py_paddle.swig_paddle as swig_api swig_api.initPaddle('--use_gpu=false') for opt in [ Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(), diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index bbaf8bfa979fbbf460561ebf7077b75b9c41a11a..a9cba8ca0b1efd4149463f6c7bf2dcdfbea350c9 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -1,5 +1,4 @@ import numpy as np -import py_paddle.swig_paddle as api from paddle.proto.ParameterConfig_pb2 import ParameterConfig import paddle.trainer.config_parser as cp import struct @@ -124,6 +123,7 @@ class Parameters(object): :return: parameter value :rtype: np.ndarray """ + import py_paddle.swig_paddle as api shape = self.get_shape(key) if len(self.__gradient_machines__) == 0: @@ -223,7 +223,7 @@ class Parameters(object): :type gradient_machine: api.GradientMachine :return: """ - + import py_paddle.swig_paddle as api if not isinstance(gradient_machine, api.GradientMachine): raise ValueError("gradient_machine should be api.GradientMachine") @@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr): :return: :rtype: api.Parameter """ + import py_paddle.swig_paddle as api param = __get_parameter_in_gradient_machine__(gradient_machine, name) vec = param.getBuf(api.PARAMETER_VALUE) assert isinstance(vec, api.Vector) diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index f9658a8c5df9562073c8a187074a6cb3459ac5d9..76bae0bb12b6c33f88530386f9cc19ae9b59f457 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -2,12 +2,6 @@ Module Trainer """ import collections -import gzip -import os - -import py_paddle.swig_paddle as api - -from data_feeder import DataFeeder from topology import Topology from . import event as v2_event from . 
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index bbaf8bfa979fbbf460561ebf7077b75b9c41a11a..a9cba8ca0b1efd4149463f6c7bf2dcdfbea350c9 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,5 +1,4 @@
 import numpy as np
-import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import paddle.trainer.config_parser as cp
 import struct
@@ -124,6 +123,7 @@ class Parameters(object):
         :return: parameter value
         :rtype: np.ndarray
         """
+        import py_paddle.swig_paddle as api
         shape = self.get_shape(key)
 
         if len(self.__gradient_machines__) == 0:
@@ -223,7 +223,7 @@ class Parameters(object):
         :type gradient_machine: api.GradientMachine
         :return:
         """
-
+        import py_paddle.swig_paddle as api
         if not isinstance(gradient_machine, api.GradientMachine):
             raise ValueError("gradient_machine should be api.GradientMachine")
 
@@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
     :return:
     :rtype: api.Parameter
     """
+    import py_paddle.swig_paddle as api
     param = __get_parameter_in_gradient_machine__(gradient_machine, name)
     vec = param.getBuf(api.PARAMETER_VALUE)
     assert isinstance(vec, api.Vector)
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index f9658a8c5df9562073c8a187074a6cb3459ac5d9..76bae0bb12b6c33f88530386f9cc19ae9b59f457 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,12 +2,6 @@ Module Trainer
 """
 import collections
-import gzip
-import os
-
-import py_paddle.swig_paddle as api
-
-from data_feeder import DataFeeder
 from topology import Topology
 from . import event as v2_event
 from . import optimizer as v2_optimizer
@@ -41,6 +35,7 @@ class SGD(object):
     :type parameters: paddle.v2.parameters.Parameters
     :param extra_layers: Some layers in the neural network graph are not
                          in the path of cost layer.
     :type extra_layers: paddle.v2.config_base.Layer
+    :param pserver_spec: pserver location, e.g. localhost:3000
     """
 
@@ -50,7 +45,8 @@ class SGD(object):
                  update_equation,
                  extra_layers=None,
                  is_local=True,
-                 pserver_spec=None):
+                 pserver_spec=None,
+                 use_etcd=True):
 
         if not isinstance(parameters, v2_parameters.Parameters):
             raise TypeError('parameters should be parameters')
@@ -58,6 +54,7 @@ class SGD(object):
         if not isinstance(update_equation, v2_optimizer.Optimizer):
             raise TypeError("update equation parameter must be "
                             "paddle.v2.optimizer.Optimizer")
+        import py_paddle.swig_paddle as api
         topology = Topology(cost, extra_layers=extra_layers)
         self.__optimizer__ = update_equation
         self.__topology__ = topology
@@ -65,6 +62,7 @@ class SGD(object):
         self.__topology_in_proto__ = topology.proto()
         self.__is_local__ = is_local
         self.__pserver_spec__ = pserver_spec
+        self.__use_etcd__ = use_etcd
         self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
         #
         # In local mode, disable sparse_remote_update.
@@ -123,13 +121,15 @@ class SGD(object):
         :type feeding: dict|list
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         if event_handler is None:
             event_handler = default_event_handler
         __check_train_args__(**locals())
 
         self.__parameter_updater__ = self.__optimizer__.create_updater(
             self.__is_local__, num_passes, self.__use_sparse_updater__,
-            self.__pserver_spec__)
+            self.__pserver_spec__, self.__use_etcd__)
 
         self.__parameter_updater__.init(self.__gradient_machine__)
 
         self.__gradient_machine__.start()
@@ -186,6 +186,8 @@ class SGD(object):
         :type feeding: dict
         :return:
         """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
         feeder = DataFeeder(self.__data_types__, feeding)
         evaluator = self.__gradient_machine__.makeEvaluator()
         out_args = api.Arguments.createArguments(0)
diff --git a/python/setup.py.in b/python/setup.py.in
index a422b3832f4c9c60bc5406277f9ada7032f85f51..65a26940d4d703ea4fbb5022523a90716982ec10 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -19,7 +19,9 @@ setup_requires=["requests",
                 "recordio",
                 "matplotlib",
                 "rarfile",
-                "scipy>=0.19.0"]
+                "scipy>=0.19.0",
+                "Pillow",
+                "nltk"]
 
 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
     setup_requires+=["opencv-python"]
@@ -29,7 +31,9 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
-      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'],
+                    'paddle.v2.framework': ['core.so']
+                    },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.framework.proto will be generated while compiling.