Merge branch 'backward' of https://github.com/Canpio/Paddle into backward

0ab8f52d · fengjiayi · 015ccd44 · f85ccdd3 · 0ab8f52d · 0ab8f52d
288 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,9 @@ third_party/

 # clion workspace.
 cmake-build-*
+
+# generated while compiling
+python/paddle/v2/framework/core.so
+CMakeFiles
+cmake_install.cmake
+
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,3 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
+    sha: 16398aeccf263adaf53b2495eed0406347d76281
+    hooks:
+      -   id: go-fmt
+          types: [go]
+      -   id: gometalinter
+          types: [go]
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ cache:
    - $HOME/.ccache
    - $HOME/.cache/pip
    - $TRAVIS_BUILD_DIR/build/third_party
+    - $TRAVIS_BUILD_DIR/build_android/third_party
 sudo: required
 dist: trusty
 os:
@@ -11,6 +12,7 @@ os:
 env:
  - JOB=build_doc
  - JOB=check_style
+  - JOB=build_android
 addons:
  apt:
    packages:
@@ -33,16 +35,19 @@ addons:
      - ccache
 before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
  - pip install rarfile
+  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
-    timeout 2580 paddle/scripts/travis/${JOB}.sh  # 43min timeout
+    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
  email:

--- a/AUTHORS.md
+++ b/AUTHORS.md
 | Github account | name |
 |---|---|
-| reyoung | Yang Yu |
+| backyes | Yan-Fei Wang |
+| beckett1124 | Bin Qi |
+| Canpio | Jia-Yi Feng |
+| chengxiaohua1105 | Xiao-Hua Cheng |
+| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
+| cxysteven | Xing-Yi Cheng |
+| dzhwinter | Zhi-Hong Dong |
+| emailweixu | Wei Xu |
 | gangliao | Gang Liao |
-| luotao01 | Tao Luo |
-| jacquesqiao | Long-Fei Qiao |
-| qingqing01 | Qing-Qing Dang |
+| gongweibao | Wei-Bao Gong |
+| Guo Sheng | Sheng Guo |
+| Haichao-Zhang | Hai-Chao Zhang |
 | hedaoyuan | Dao-Yuan He |
-| wangyang59 | Yang Wang |
+| helinwang | He-Lin Wang |
+| jacquesqiao | Long-Fei Qiao |
+| kuke | Yi-Bing Liu |
+| lcy-seso | Ying Cao |
+| lipeng-unisound | Peng Li |
+| liuyuan | Yuan Liu |
+| livc | Zhao Li |
+| llxxxll | Yong-Feng Liu |
+| luotao01 | Tao Luo |
+| lzhao4ever | Liang Zhao |
+| NHZlX | Zhao-Long Xing |
+| pakchoi | Chuan-Jiang Song |
+| pengli09 | Peng Li |
+| pkuyym | Ya-Ming Yang |
 | QiJune | Jun Qi |
+| qingqing01 | Qing-Qing Dang |
+| reyoung | Yang Yu |
+| Superjom | Chun-Wei Yan |
 | tianbingsz | Tian-Bing Xu |
-| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
 | typhoonzero | Yi Wu |
-| backyes | Yan-Fei Wang |
-| pengli09 | Peng Li |
-| livc | Zhao Li |
+| wanghaoshuang | Hao-Shuang Wang |
+| wangyang59 | Yang Wang |
+| wangzhen-nlp | Zhen Wang |
+| wen-bo-yang | Wen-Bo Yang |
+| wwhu | Wei-Wei Hu |
+| xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |
+| xujun05 | Jun Xu |
+| xushaoyong | Shao-Yong Xu |
 | Yancey1989 | Xu Yan |
-| emailweixu | Wei Xu |
-| wen-bo-yang | Wen-Bo Yang |
-| helinwang | He-Lin Wang |
-| lcy-seso | Ying Cao |
-| Zrachel | Rui-Qing Zhang |
-| Haichao-Zhang | Hai-Chao Zhang |
-| gongweibao | Wei-Bao Gong |
-| lzhao4ever | Liang Zhao |
+| zhaopu7 | Pu Zhao |
 | zhouxiao-coder | Xiao Zhou |
-| lipeng-unisound | Peng Li |
+| Zrachel | Rui-Qing Zhang |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -13,9 +13,9 @@
 # limitations under the License

 cmake_minimum_required(VERSION 3.0)
-
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})

 include(system)

@@ -27,7 +27,9 @@ if(NOT CMAKE_CROSSCOMPILING)
 endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
-find_package(Boost QUIET)
+if(NOT ANDROID)
+    find_package(Boost QUIET)
+endif()

 include(simd)

@@ -49,6 +51,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
+option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)

 # CMAKE_BUILD_TYPE
@@ -95,7 +98,10 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
+include(external/pybind11)    # download pybind11

+include(cudnn)              # set cudnn libraries, must before configure
+include(configure)          # add paddle env configuration
 include(generic)            # simplify cmake module
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
@@ -103,15 +109,14 @@ include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(rdma)               # set rdma libraries
 include(flags)              # set paddle compile flags
-include(cudnn)              # set cudnn libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
-include(configure)          # add paddle env configuration
+

 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient")
+include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 include_directories(${Boost_INCLUDE_DIRS})

 set(EXTERNAL_LIBS
@@ -131,20 +136,26 @@ if(WITH_GPU)
 endif(WITH_GPU)

 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)

 add_subdirectory(proto)

+# "add_subdirectory(go)" should be placed after the following line,
+# because it depends on paddle/optimizer.
+add_subdirectory(paddle/optimizer)
+
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
 if(WITH_GOLANG)
-  add_subdirectory(go/master/c)
-  add_subdirectory(go/pserver/cclient)
+    add_subdirectory(go)
 endif(WITH_GOLANG)

 add_subdirectory(paddle)
-add_subdirectory(python)
+if(WITH_PYTHON)
+  add_subdirectory(python)
+endif()
 if(WITH_DOC)
    add_subdirectory(doc)
 endif()
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y \
    git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-numpy python-matplotlib gcc g++ \
    automake locales clang-format-3.8 swig doxygen cmake  \
@@ -34,14 +34,18 @@ RUN apt-get update && \
    net-tools && \
    apt-get clean -y

-# Install Go
+# Install Go and glide
 RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
    tar -C /usr/local -xzf go.tgz && \
    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
    rm go.tgz
 ENV GOROOT=/usr/local/go GOPATH=/root/gopath
 # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
-ENV PATH=${PATH}:${GOROOT}/bin
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# install glide
+RUN curl -q https://glide.sh/get | sh

 # git credential to skip password typing
 RUN git config --global credential.helper store
@@ -57,7 +61,7 @@ RUN pip install --upgrade pip && \
    pip install -U docopt PyYAML sphinx && \
    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
    pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
-    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ 
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
    pip install rarfile

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -14,6 +14,17 @@ RUN apt-get update && \
    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
    apt-get clean -y

+# Install Go and glide
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
 # git credential to skip password typing
 RUN git config --global credential.helper store


--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@


 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -61,35 +61,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation

 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)

 ## Documentation

-We provide [English](http://www.paddlepaddle.org/develop/doc/) and
-[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
+[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.

 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)

  You might want to start from the this online interactive book that can run in Jupyter Notebook.

- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)

  You can run distributed training jobs on MPI clusters.

- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)

   You can also run distributed training jobs on Kubernetes clusters.

- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)

   Our new API enables much shorter programs.

- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)

   We appreciate your contributions!

+
 ## Ask Questions

 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -79,6 +79,9 @@ if(WITH_GOLANG)
  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
  file(MAKE_DIRECTORY ${GOPATH})
  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
+  file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
+  set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
+
  add_custom_target(go_path)
  add_custom_command(TARGET go_path
    # Symlink Paddle directory into GOPATH
@@ -89,7 +92,29 @@ if(WITH_GOLANG)
    # We can't run `go get -d ./...` for every target, because
    # multiple `go get` can not run concurrently, but make need to be
    # able to run with multiple jobs.
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./go/...
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  )
+
+  # When GLIDE_INSTALL is ON, vendor the Go dependencies with glide.
+  # The glide binary is looked up under the GOPATH found in the environment
+  # at configure time; configuration aborts if it is not there.
+  if (GLIDE_INSTALL)
+    # Quote the path so it stays a single argument even if GOPATH
+    # contains spaces or semicolons.
+    if(EXISTS "$ENV{GOPATH}/bin/glide")
+      set(GLIDE "$ENV{GOPATH}/bin/glide")
+    else()
+      message(FATAL_ERROR "no glide executable found: $ENV{GOPATH}/bin/glide")
+    endif()
+
+    # This command only runs when the file it depends on has changed,
+    # or when its output (the stamp file) is missing.
+    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
+      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+      # Use CMake's own portable "touch" to refresh the stamp file.
+      COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_BINARY_DIR}/glide
+      DEPENDS ${PROJ_ROOT}/go/glide.lock
+      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
+      )
+
+    # go_vendor depends on the custom command that outputs
+    # ${CMAKE_BINARY_DIR}/glide, so the custom command does not need to
+    # run every time this target is built.
+    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
+  endif()
+
 endif(WITH_GOLANG)
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -25,8 +25,10 @@ set(STYLE_FILTER "${STYLE_FILTER}-readability/casting")
 set(IGNORE_PATTERN
    .*ImportanceSampler.*
    .*cblas\\.h.*
+    .*\\.pb\\.txt
    .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*MultiDataProvider.*
+    .*pb.*)

 # add_style_check_target
 #
@@ -51,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
                endif()
            endforeach()
            if(LINT MATCHES ON)
+                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(OUTPUT ${CUR_GEN}
-                    PRE_BUILD
-                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}"
-                                "--write-success=${CUR_GEN}" ${filename}
-                    DEPENDS ${filename}
+                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                            "--filter=${STYLE_FILTER}"
+                            "--write-success=${CUR_GEN}" ${filename}
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
            endif()
        endforeach()

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -106,6 +106,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
                SET(CMAKE_SYSTEM_PROCESSOR armv7-a)
            ENDIF()
        ENDIF()
+        IF(ANDROID_ABI STREQUAL "arm64-v8a")
+            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
+        ENDIF()
        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
    ENDIF()

@@ -162,6 +166,10 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()
    ENDIF()

+    IF(ANDROID_ABI STREQUAL "arm64-v8a")
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+    ENDIF()
+
    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
    STRING(REPLACE ";" " " ANDROID_LINKER_FLAGS "${ANDROID_LINKER_FLAGS}")

@@ -186,6 +194,10 @@ ELSE()
        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
    ENDIF()
    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
-    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
--- a/cmake/external/any.cmake
+++ b/cmake/external/any.cmake
@@ -2,10 +2,10 @@ INCLUDE(ExternalProject)

 SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)

-INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
+INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)

 ExternalProject_Add(
-    linb_any
+    extern_lib_any
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/thelink2012/any.git"
    GIT_TAG         "8fef1e93710a0edf8d7658999e284a1142c4c020"
@@ -17,5 +17,15 @@ ExternalProject_Add(
    TEST_COMMAND      ""
 )

+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(lib_any STATIC ${dummyfile})
+else()
+    add_library(lib_any INTERFACE)
+endif()
+
+add_dependencies(lib_any extern_lib_any)
+
 add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
-LIST(APPEND external_project_dependencies linb_any)
\ No newline at end of file
+LIST(APPEND external_project_dependencies lib_any)
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -2,10 +2,10 @@ INCLUDE(ExternalProject)

 SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)

-INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3)
+INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3)

 ExternalProject_Add(
-    eigen3
+    extern_eigen3
    ${EXTERNAL_PROJECT_LOG_ARGS}
    # for latest version, please get from official website
    # URL            "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz"
@@ -26,4 +26,14 @@ ExternalProject_Add(
    TEST_COMMAND      ""
 )

+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
+    add_library(eigen3 STATIC ${dummyfile})
+else()
+    add_library(eigen3 INTERFACE)
+endif()
+
+add_dependencies(eigen3 extern_eigen3)
+
 LIST(APPEND external_project_dependencies eigen3)
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -38,18 +38,21 @@ ExternalProject_Add(
    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    CMAKE_ARGS      -DWITH_GFLAGS=ON
    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
    CMAKE_ARGS      -DBUILD_TESTING=OFF
    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=Release
 )

 ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
 SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
-ADD_DEPENDENCIES(glog extern_glog)
+ADD_DEPENDENCIES(glog extern_glog gflags)
+LINK_LIBRARIES(glog gflags)

 LIST(APPEND external_project_dependencies glog)
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)

 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
  set(NNPACK_FOUND ON)
  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -32,7 +32,12 @@ IF(NOT ${CBLAS_FOUND})
            # arm_soft_fp_abi branch of OpenBLAS to support softfp
            #   https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
-            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
+            IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+                SET(TARGET "ARMV7")
+            ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
+                SET(TARGET "ARMV8")
+            ENDIF()
+            SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=${TARGET} ARM_SOFTFP_ABI=1 USE_THREAD=0)
        ELSEIF(RPI)
            # use hardfp
            SET(OPENBLAS_COMMIT "v0.2.19")

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
 FIND_PACKAGE(Protobuf QUIET)
 SET(PROTOBUF_FOUND "OFF")

+if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_generate_python is not defined.
+    # protobuf_generate_python(<SRCS> <proto>...)
+    #
+    # Fallback definition used only when the running CMake does not provide
+    # protobuf_generate_python itself. For every .proto file passed in, it
+    # adds a custom command that runs protoc with --python_out into
+    # ${CMAKE_CURRENT_BINARY_DIR}, and returns the list of generated
+    # *_pb2.py files to the caller through the variable named by SRCS.
+    #
+    # Honours PROTOBUF_GENERATE_CPP_APPEND_PATH and
+    # Protobuf_IMPORT_DIRS / PROTOBUF_IMPORT_DIRS when building the -I list.
+    function(protobuf_generate_python SRCS)
+        # Copied from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+        if(NOT ARGN)
+            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+            return()
+        endif()
+
+        # Older FindProtobuf modules define only the upper-case
+        # PROTOBUF_PROTOC_EXECUTABLE. Since this fallback is meant for old
+        # CMake releases, use that variable when the mixed-case one is unset.
+        if(Protobuf_PROTOC_EXECUTABLE)
+            set(_protobuf_protoc_executable ${Protobuf_PROTOC_EXECUTABLE})
+        else()
+            set(_protobuf_protoc_executable ${PROTOBUF_PROTOC_EXECUTABLE})
+        endif()
+
+        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            # Create an include path for each file specified
+            foreach(FIL ${ARGN})
+                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        else()
+            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+        endif()
+
+        # Accept the legacy upper-case spelling of the import-dirs variable.
+        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+        endif()
+
+        if(DEFINED Protobuf_IMPORT_DIRS)
+            foreach(DIR ${Protobuf_IMPORT_DIRS})
+                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        endif()
+
+        # Start from an empty result list, then add one generated file
+        # (and one protoc rule) per input .proto file.
+        set(${SRCS})
+        foreach(FIL ${ARGN})
+            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+            get_filename_component(FIL_WE ${FIL} NAME_WE)
+            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+                # Keep the relative sub-directory of the .proto file in the
+                # generated file's path.
+                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+                if(FIL_DIR)
+                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+                endif()
+            endif()
+
+            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+            add_custom_command(
+                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                    COMMAND  ${_protobuf_protoc_executable} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                    DEPENDS ${ABS_FIL} ${_protobuf_protoc_executable}
+                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                    VERBATIM )
+        endforeach()
+
+        # Hand the list of generated files back to the caller.
+        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+    endfunction()
+endif()

 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.

--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
+# Download pybind11 as an external project and expose it to the build as
+# the `pybind` target. Nothing is configured, built or installed: only the
+# checked-out include directory is added to the include path.
+INCLUDE(ExternalProject)
+
+SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+
+ExternalProject_Add(
+        extern_pybind
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
+        GIT_TAG         "v2.1.1"
+        PREFIX          ${PYBIND_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+)
+
+# `pybind` is the target other code depends on. On CMake older than 3.3 it
+# is a static library built from a generated dummy source file; otherwise
+# it is an INTERFACE library.
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
+    # Give the dummy symbol a name of its own, so it does not duplicate
+    # the `dummy_any` global defined by the lib_any dummy library.
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
+    add_library(pybind STATIC ${dummyfile})
+else()
+    add_library(pybind INTERFACE)
+endif()
+
+# Make sure the sources are downloaded before anything that depends on `pybind`.
+add_dependencies(pybind extern_pybind)
+
+LIST(APPEND external_project_dependencies pybind)
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,6 +18,9 @@ INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
 IF(WITH_PYTHON)
    FIND_PACKAGE(PythonLibs 2.7)
+    # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 ENDIF(WITH_PYTHON)

 SET(py_env "")

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -109,7 +109,9 @@ set(COMMON_FLAGS
    -Wno-unused-function
    -Wno-error=literal-suffix
    -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=parentheses-equality # Warnings in Pybind11
+)

 set(GPU_COMMON_FLAGS
    -fPIC

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -17,7 +17,7 @@
 # generic.cmake defines CMakes functions that look like Bazel's
 # building rules (https://bazel.build/).
 #
-# 
+#
 # -------------------------------------------
 #     C++        CUDA C++       Go
 # -------------------------------------------
@@ -25,51 +25,51 @@
 # cc_binary     nv_binary    go_binary
 # cc_test       nv_test      go_test
 # -------------------------------------------
-# 
+#
 # To build a static library example.a from example.cc using the system
 #  compiler (like GCC):
-# 
+#
 #   cc_library(example SRCS example.cc)
-# 
+#
 # To build a static library example.a from multiple source files
 # example{1,2,3}.cc:
-# 
+#
 #   cc_library(example SRCS example1.cc example2.cc example3.cc)
-# 
+#
 # To build a shared library example.so from example.cc:
-# 
+#
 #   cc_library(example SHARED SRCS example.cc)
-# 
+#
 # To build a library using Nvidia's NVCC from .cu file(s), use the nv_
 # prefixed version:
-# 
+#
 #   nv_library(example SRCS example.cu)
-# 
+#
 # To specify that a library new_example.a depends on other libraies:
-# 
+#
 #   cc_library(new_example SRCS new_example.cc DEPS example)
-# 
+#
 # Static libraries can be composed of other static libraries:
-# 
+#
 #   cc_library(composed DEPS dependent1 dependent2 dependent3)
-# 
+#
 # To build an executable binary file from some source files and
 # dependent libraries:
-# 
+#
 #   cc_binary(example SRCS main.cc something.cc DEPS example1 example2)
-# 
+#
 # To build an executable binary file using NVCC, use the nv_ prefixed
 # version:
-# 
+#
 #   nv_binary(example SRCS main.cc something.cu DEPS example1 example2)
-# 
+#
 # To build a unit test binary, which is an executable binary with
 # GoogleTest linked:
-# 
+#
 #   cc_test(example_test SRCS example_test.cc DEPS example)
-# 
+#
 # To build a unit test binary using NVCC, use the nv_ prefixed version:
-# 
+#
 #   nv_test(example_test SRCS example_test.cu DEPS example)
 #
 # It is pretty often that executable and test binaries depend on
@@ -88,36 +88,57 @@
 #

 # including binary directory for generated headers.
-include_directories(${CMAKE_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_BINARY_DIR})

-if(NOT APPLE)
+if(NOT APPLE AND NOT ANDROID)
    find_package(Threads REQUIRED)
    link_libraries(${CMAKE_THREAD_LIBS_INIT})
-endif(NOT APPLE)
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
+endif(NOT APPLE AND NOT ANDROID)

 function(merge_static_libs TARGET_NAME)
  set(libs ${ARGN})
  list(REMOVE_DUPLICATES libs)

-  # First get the file names of the libraries to be merged
+  # Get all propagation dependencies from the merged libraries
  foreach(lib ${libs})
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()
+  list(REMOVE_DUPLICATES libs_deps)

  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by add_custom_command below and also
+    # helps to track dependencies.
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})
+
+    # Generate dummy static lib
    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
    add_library(${TARGET_NAME} STATIC ${dummyfile})
-		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
-	else() # general UNIX: use "ar" to extract objects and re-add to a common lib
+  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
    foreach(lib ${libs})
      set(objlistfile ${lib}.objlist) # list of objects in the input library
      set(objdir ${lib}.objdir)

      add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})

      add_custom_command(OUTPUT ${objlistfile}
        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
@@ -125,27 +146,27 @@ function(merge_static_libs TARGET_NAME)
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})

-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
+      # Empty dummy source file that goes into merged library		
+      set(mergebase ${lib}.mergebase.c)		
+      add_custom_command(OUTPUT ${mergebase}		
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
+        DEPENDS ${objlistfile})		

      list(APPEND mergebases "${mergebase}")
    endforeach()

-    # We need a target for the output merged library
    add_library(${TARGET_NAME} STATIC ${mergebases})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    # Get the file name of the generated library
    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")

    foreach(lib ${libs})
      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
-      WORKING_DIRECTORY ${lib}.objdir)
+        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+        COMMAND ${CMAKE_RANLIB} ${outlibfile}
+        WORKING_DIRECTORY ${lib}.objdir)
    endforeach()
-
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_RANLIB} ${outlibfile})
  endif()
 endfunction(merge_static_libs)

@@ -162,7 +183,12 @@ function(cc_library TARGET_NAME)
    endif()
    if (cc_library_DEPS)
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
+    
+    # cpplint code style
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+
  else(cc_library_SRCS)
    if (cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -193,7 +219,7 @@ function(cc_test TARGET_NAME)
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_test(${TARGET_NAME} ${TARGET_NAME})
+    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)

@@ -211,6 +237,7 @@ function(nv_library TARGET_NAME)
      endif()
      if (nv_library_DEPS)
        add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${nv_library_DEPS})
      endif()
    else(nv_library_SRCS)
      if (nv_library_DEPS)
@@ -263,8 +290,22 @@ function(go_library TARGET_NAME)
    set(${TARGET_NAME}_LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE STRING "output library name for target ${TARGET_NAME}")
  endif()

-  # Add dummy code to support `make target_name` under Terminal Command
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+  # This custom command will always run since its declared output
+  # file is never created.
+  add_custom_command(
+    OUTPUT dummy_rebulid_${TARGET_NAME}
+    COMMAND cmake -E touch ${dummyfile}
+    )
+  # Create a custom target that depends on the custom command output
+  # file, so the custom command can be referenced as a dependency by
+  # `add_dependencies`.
+  add_custom_target(rebuild_${TARGET_NAME}
+    DEPENDS dummy_rebulid_${TARGET_NAME}
+    )
+
+  # Add dummy code to support `make target_name` under Terminal Command
  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
  if (go_library_SHARED OR go_library_shared)
    add_library(${TARGET_NAME} SHARED ${dummyfile})
@@ -275,17 +316,26 @@ function(go_library TARGET_NAME)
    add_dependencies(${TARGET_NAME} ${go_library_DEPS})
  endif(go_library_DEPS)

+  # The "source file" of the library is `${dummyfile}`, which never
+  # changes, so the target will never rebuild. Make the target depend
+  # on the custom command that touches the library "source file", so
+  # rebuild will always happen.
+  add_dependencies(${TARGET_NAME} rebuild_${TARGET_NAME})
+
  set(${TARGET_NAME}_LIB_PATH "${CMAKE_CURRENT_BINARY_DIR}/${${TARGET_NAME}_LIB_NAME}" CACHE STRING "output library path for target ${TARGET_NAME}")

  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
+  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
    # Golang build source code
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
    -o "${${TARGET_NAME}_LIB_PATH}"
-    ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_dependencies(${TARGET_NAME} go_path)
+    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
+    # must run under GOPATH
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_dependencies(${TARGET_NAME} go_vendor)
 endfunction(go_library)

 function(go_binary TARGET_NAME)
@@ -293,35 +343,49 @@ function(go_binary TARGET_NAME)
  set(oneValueArgs "")
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+
  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ${go_library_SRCS}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})
+    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 endfunction(go_binary)

 function(go_test TARGET_NAME)
  set(options OPTIONAL)
  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs DEPS)
  cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
+  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ${go_test_SRCS}
+    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_test(NAME ${TARGET_NAME}
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
-  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)

 function(proto_library TARGET_NAME)
  set(oneValueArgs "")
-  set(multiValueArgs SRCS)
+  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  set(proto_srcs)
  set(proto_hdrs)
  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
-  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf)
+  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
+endfunction()
+
+function(py_proto_compile TARGET_NAME)
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(py_srcs)
+  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
 endfunction()
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -445,6 +445,11 @@ smooth_l1_cost
 ..  autoclass:: paddle.v2.layer.smooth_l1_cost
    :noindex:

+multibox_loss
+--------------
+..  autoclass:: paddle.v2.layer.multibox_loss
+    :noindex:
+
 Check Layer
 ============

@@ -468,3 +473,16 @@ prelu
 --------
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
+
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+
+Detection output Layer
+======================
+
+detection_output
+----------------
+..  autoclass:: paddle.v2.layer.detection_output
+    :noindex:
--- a/doc/design/simple_op_design.md
+++ b/doc/design/simple_op_design.md
@@ -63,7 +63,7 @@ message OpProto {

 To generate Python code automatically:

-```python
+```python 
 def create_python_ops_creatation_functions():
 	op_protos = paddle.framework.OpRegistry.get_all_op_proto()
 	for type_name in op_protos:

--- a/doc/howto/dev/new_layer_cn.rst
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -37,7 +37,7 @@

   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}

-假设 :math:`z = f(W^T x + b)` ，那么
+假设 :math:`z = W^T x + b` ，那么

 .. math::


--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It

 where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu.

-The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. 
+The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.

 Suppose our loss function is :math:`c(y)`, then

@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then

   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}

-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then

 .. math::

@@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.
 Then, for fully connected layer, we need to compute:

 .. math::
-  
+
   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1

 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
@@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
                      /* weight */ true);
      }
    }
-    
+
 If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.

 .. code-block:: bash

--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使

    python -c "import py_paddle"

-如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
 注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。

 如果提示正确，可以执行以下命令编译生成文档，即
@@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程
 如何更新www.paddlepaddle.org文档
 ================================

-开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
-`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
+`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。




--- a/doc_theme/templates/layout.html
+++ b/doc_theme/templates/layout.html
@@ -101,7 +101,7 @@
    </div>
    <div class="site-nav-links">
      <div class="site-menu">
-        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a>
+        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
        <div class="language-switcher dropdown">
          <a type="button" data-toggle="dropdown">
            <span>English</span>

--- a/go/.gitignore
+++ b/go/.gitignore
+vendor/
+.glide/
--- a/go/CMakeLists.txt
+++ b/go/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+add_subdirectory(pserver/client/c)
+add_subdirectory(cmd/pserver)
+add_subdirectory(cmd/master)
+add_subdirectory(master/c)
+add_subdirectory(master)
+add_subdirectory(pserver)
+add_subdirectory(pserver/client)
+add_subdirectory(utils/networkhelper)
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+go_binary(master SRC master.go)
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -11,6 +11,7 @@ import (

 	"github.com/namsral/flag"
 	log "github.com/sirupsen/logrus"
+	"github.com/topicai/candy"

 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +21,18 @@ func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
 	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()

+	level, e := log.ParseLevel(*logLevel)
+	candy.Must(e)
+
+	log.SetLevel(level)
+
 	if *endpoints == "" {
 		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
 	}

--- a/go/cmd/pserver/CMakeLists.txt
+++ b/go/cmd/pserver/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+go_binary(pserver SRCS pserver.go DEPS paddle_go_optimizer)
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -8,6 +8,7 @@ import (
 	"time"

 	"github.com/namsral/flag"
+	"github.com/topicai/candy"

 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	log "github.com/sirupsen/logrus"
@@ -15,46 +16,50 @@ import (

 func main() {
 	port := flag.Int("port", 0, "port of the pserver")
+	index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
 	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
 		"comma separated endpoint string for pserver to connect to etcd")
-	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
+	etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()

 	level, err := log.ParseLevel(*logLevel)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
+
 	log.SetLevel(level)

-	timeout := time.Second * time.Duration((*etcdTimeout))
-	e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
-	idx, err := e.Register()
-	if err != nil {
-		panic(err)
-	}
+	var idx int

-	s, err := pserver.NewService(idx)
-	if err != nil {
-		panic(err)
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
+	if *index >= 0 {
+		idx = *index
+	} else {
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
+		idx, err = e.Register(*port)
+		candy.Must(err)
+
+		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
+		if err != nil {
+			log.Errorf("Fetch checkpoint failed, %s", err)
+		}
 	}
+
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
+	candy.Must(err)
+
 	err = rpc.Register(s)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)

 	rpc.HandleHTTP()
 	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)

 	log.Infof("start pserver at port %d", *port)
 	err = http.Serve(l, nil)
-
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
 }
--- a/go/glide.lock
+++ b/go/glide.lock
+hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855
+updated: 2017-07-11T10:04:40.786745417+08:00
+imports:
+- name: github.com/coreos/etcd
+  version: cb2a496c4ddd1c87a9f280e116649b599999ec79
+  subpackages:
+  - auth/authpb
+  - clientv3
+  - clientv3/concurrency
+  - etcdserver/api/v3rpc/rpctypes
+  - etcdserver/etcdserverpb
+  - mvcc/mvccpb
+- name: github.com/golang/protobuf
+  version: 4bd1920723d7b7c925de087aa32e2187708897f7
+  subpackages:
+  - jsonpb
+  - proto
+- name: github.com/golang/snappy
+  version: 553a641470496b2327abcac10b36396bd98e45c9
+- name: github.com/namsral/flag
+  version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04
+- name: github.com/PaddlePaddle/recordio
+  version: edfb82af0739c84f241c87390ec5649c7b28c129
+- name: github.com/sirupsen/logrus
+  version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
+- name: golang.org/x/net
+  version: c8c74377599bd978aee1cf3b9b63a8634051cec2
+  subpackages:
+  - context
+  - http2
+  - http2/hpack
+  - idna
+  - internal/timeseries
+  - lex/httplex
+  - trace
+- name: golang.org/x/sys
+  version: abf9c25f54453410d0c6668e519582a9e1115027
+  subpackages:
+  - unix
+- name: golang.org/x/text
+  version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa
+  subpackages:
+  - secure/bidirule
+  - transform
+  - unicode/bidi
+  - unicode/norm
+- name: google.golang.org/grpc
+  version: 8050b9cbc271307e5a716a9d782803d09b0d6f2d
+  subpackages:
+  - codes
+  - credentials
+  - grpclog
+  - internal
+  - keepalive
+  - metadata
+  - naming
+  - peer
+  - stats
+  - tap
+  - transport
+testImports: []
--- a/go/glide.yaml
+++ b/go/glide.yaml
+package: github.com/PaddlePaddle/Paddle/go
+import:
+- package: github.com/PaddlePaddle/recordio
+- package: github.com/coreos/etcd
+  version: ^3.2.1
+  subpackages:
+  - clientv3
+  - clientv3/concurrency
+- package: github.com/namsral/flag
+  version: ^1.7.4-pre
+- package: github.com/sirupsen/logrus
+  version: ^1.0.0
+- package: github.com/topicai/candy
--- a/go/master/CMakeLists.txt
+++ b/go/master/CMakeLists.txt
+if(WITH_TESTING)
+  go_test(master_test)
+endif()
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
-cmake_minimum_required(VERSION 3.0)
-
-go_library(paddle_master SHARED)
+go_library(paddle_master SHARED DEPS paddle_go_optimizer)
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -23,7 +23,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )

-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -104,12 +103,23 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	return C.PADDLE_MASTER_OK
 }

+// return value:
+//     0:ok
+//    -1:error
 //export paddle_next_record
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
-	r := c.NextRecord()
+	r, err := c.NextRecord()
+	if err != nil {
+		// Error
+		// TODO: return the type of error?
+		*record = (*C.uchar)(nil)
+		return -1
+	}
+
 	if len(r) == 0 {
-		*record = (*C.uchar)(nullPtr)
+		// Empty record
+		*record = (*C.uchar)(nil)
 		return 0
 	}


--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,6 +2,7 @@ package master

 import (
 	"os"
+	"time"

 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
@@ -11,7 +12,12 @@ import (
 // Client is the client of the master server.
 type Client struct {
 	conn *connection.Conn
-	ch   chan []byte
+	ch   chan record
+}
+
+type record struct {
+	r   []byte
+	err error
 }

 // NewClient creates a new Client.
@@ -21,7 +27,7 @@ type Client struct {
 func NewClient(addrCh <-chan string, bufSize int) *Client {
 	c := &Client{}
 	c.conn = connection.New()
-	c.ch = make(chan []byte, bufSize)
+	c.ch = make(chan record, bufSize)
 	go c.monitorMaster(addrCh)
 	go c.getRecords()
 	return c
@@ -31,9 +37,9 @@ func (c *Client) getRecords() {
 	for {
 		t, err := c.getTask()
 		if err != nil {
-			// TODO(helin): wait before move on with next
 			// getTask call.
-			log.Errorln(err)
+			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
+			time.Sleep(3 * time.Second)
 			continue
 		}

@@ -46,10 +52,11 @@ func (c *Client) getRecords() {

 			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
 			for s.Scan() {
-				c.ch <- s.Record()
+				c.ch <- record{s.Record(), nil}
 			}

 			if s.Err() != nil {
+				c.ch <- record{nil, s.Err()}
 				log.Errorln(err, chunk.Path)
 			}

@@ -62,7 +69,10 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.ID)
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Errorln(err)
+		}
 	}
 }

@@ -112,10 +122,16 @@ func (c *Client) taskFinished(taskID int) error {
 	return c.conn.Call("Service.TaskFinished", taskID, nil)
 }

+// taskFailed tells the master server that a task has failed.
+func (c *Client) taskFailed(meta TaskMeta) error {
+	return c.conn.Call("Service.TaskFailed", meta, nil)
+}
+
 // NextRecord returns next record in the dataset.
 //
 // NextRecord will block until the next record is available. It is
 // thread-safe.
-func (c *Client) NextRecord() []byte {
-	return <-c.ch
+func (c *Client) NextRecord() ([]byte, error) {
+	r := <-c.ch
+	return r.r, r.err
 }
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) {

 	for i := 0; i < totalTask*chunkPerTask; i++ {
 		w := recordio.NewWriter(f, -1, -1)
-		w.Write(nil)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
+
 		// call Close to force RecordIO writing a chunk.
-		w.Close()
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	f.Close()

 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
@@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) {
 	ch := make(chan string, 1)
 	ch <- addr
 	go c.monitorMaster(ch)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
+
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {
@@ -95,10 +109,16 @@ func TestGetFinishTask(t *testing.T) {
 			t.Fatalf("Should get error, pass: %d\n", i)
 		}

-		err = c.taskFinished(tasks[0].ID)
+		err = c.taskFinished(tasks[0].Meta.ID)
 		if err != nil {
 			t.Fatalf("Error: %v, pass: %d\n", err, i)
 		}
+
+		err = c.taskFailed(tasks[0].Meta)
+		if err != nil {
+			t.Fatalf("Error: %v, pass: %d\n", err, i)
+		}
+
 		tasks = tasks[1:]
 		task, err := c.getTask()
 		if err != nil {
@@ -107,7 +127,7 @@ func TestGetFinishTask(t *testing.T) {
 		tasks = append(tasks, task)

 		for _, task := range tasks {
-			err = c.taskFinished(task.ID)
+			err = c.taskFinished(task.Meta.ID)
 			if err != nil {
 				t.Fatalf("Error: %v, pass: %d\n", err, i)
 			}

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -57,23 +57,44 @@ func TestNextRecord(t *testing.T) {

 	w := recordio.NewWriter(f, -1, -1)
 	for i := 0; i < total; i++ {
-		w.Write([]byte{byte(i)})
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	err = w.Close()
+	if err != nil {
+		panic(err)
 	}
-	w.Close()
-	f.Close()
+
+	err = f.Close()
+	if err != nil {
+		panic(err)
+	}
+
 	curAddr := make(chan string, 1)
 	curAddr <- fmt.Sprintf(":%d", p)
 	c := master.NewClient(curAddr, 10)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
+
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {
-			r := c.NextRecord()
+			r, err := c.NextRecord()
+			if err != nil {
+				t.Fatal(pass, i, "Read error:", err)
+			}
+
 			if len(r) != 1 {
-				t.Fatal("Length should be 1.", r)
+				t.Fatal(pass, i, "Length should be 1.", r)
 			}
+
 			if received[r[0]] {
-				t.Fatal("Received duplicate.", received, r)
+				t.Fatal(pass, i, "Received duplicate.", received, r)
 			}
 			received[r[0]] = true
 		}

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -30,7 +30,7 @@ type EtcdClient struct {
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
 	log.Debugf("Connecting to etcd at %v", endpoints)
-	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
+	// TODO(helin): gracefully shutdown etcd store. Because etcd
 	// store holds a etcd lock, even though the lock will expire
 	// when the lease timeout, we need to implement graceful
 	// shutdown to release the lock.
@@ -50,7 +50,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	lock := concurrency.NewMutex(sess, lockPath)
 	// It's fine for the lock to get stuck, in this case we have
 	// multiple master servers running (only configured to have
-	// one master running, but split-brain problem may cuase
+	// one master running, but split-brain problem may cause
 	// multiple master servers running), and the cluster management
 	// software will kill one of them.
 	log.Debugf("Trying to acquire lock at %s.", lockPath)
@@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	}
 	log.Debugf("Successfully acquired lock at %s.", lockPath)

-	put := clientv3.OpPut(addrPath, string(addr))
+	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
 	if err != nil {
 		return nil, err
@@ -98,7 +98,7 @@ func (e *EtcdClient) Save(state []byte) error {
 			// We lost the master lock and can not acquire
 			// it back, it means some other master is
 			// already started. We don't want cluster
-			// managment system to kill the master server
+			// management system to kill the master server
 			// who is holding the lock and running
 			// correctly. So the most feasible solution is
 			// to kill current master server. The current

--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -4,7 +4,7 @@ import "sync"

 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
 	mu  sync.Mutex
 	buf []byte

--- a/go/master/service.go
+++ b/go/master/service.go
@@ -31,30 +31,36 @@ type Chunk struct {
 	Index recordio.Index // chunk index
 }

+// TaskMeta is a struct which stores task's meta info.
+type TaskMeta struct {
+	ID    int
+	Epoch int
+}
+
 // Task is the basic unit of data instances assigned to trainers.
 type Task struct {
-	ID     int
+	Meta   TaskMeta
 	Chunks []Chunk
 }

 type taskEntry struct {
-	Epoch      int
-	NumTimeout int
-	Task       Task
+	Task Task
+	// A task fails if it times out or the trainer reports that it exited abnormally.
+	NumFailure int
 }

 type taskQueues struct {
 	Todo    []taskEntry
 	Pending map[int]taskEntry // map from task ID to task entry
 	Done    []taskEntry
-	Failed  []Task
+	Failed  []taskEntry
 }

 // Service is the master server service.
 type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
-	timeoutMax    int
+	failureMax    int
 	ready         chan struct{}
 	store         Store

@@ -73,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	var cur taskEntry
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.ID = id
+			cur.Task.Meta.ID = id
 			id++
 			result = append(result, cur)
 			cur.Task.Chunks = nil
@@ -83,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	}

 	if len(cur.Task.Chunks) > 0 {
-		cur.Task.ID = id
+		cur.Task.Meta.ID = id
 		result = append(result, cur)
 	}

@@ -91,11 +97,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }

 // NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
 	s := &Service{}
 	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
-	s.timeoutMax = timeoutMax
+	s.failureMax = failureMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
 	s.ready = make(chan struct{})
@@ -154,7 +160,7 @@ func (s *Service) recover() (bool, error) {

 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TOOD(helin): etcd request has a size limit, so the snapshot
+	// TODO(helin): etcd request has a size limit, so the snapshot
 	// size is limited by the max request size. We should either
 	// divide the snapshot into smaller chunks and save under
 	// different keys, or configure the request size to be big
@@ -209,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}

 		count := index.NumChunks()
+		log.Infof("readChunks: file %s has %d chunks", path, count)
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path:  path,
@@ -257,6 +264,33 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
 	return nil
 }

+// processFailedTask records one failure (a timeout or a failure report) of
+// the pending task t. epoch is the dispatch epoch the failure refers to; a
+// failure for any other epoch is ignored.
+//
+// The task is removed from the pending queue and either re-queued in Todo
+// or, once it has failed more than s.failureMax times, moved to Failed.
+//
+// processFailedTask *must* be called with s.mu being held.
+func (s *Service) processFailedTask(t taskEntry, epoch int) {
+	if t.Task.Meta.Epoch != epoch {
+		// new epoch, task launched after the
+		// schedule of this timeout check or failed status report.
+		return
+	}
+
+	// Persist the updated queues whichever branch below is taken.
+	defer func() {
+		err := s.snapshot()
+		if err != nil {
+			log.Errorln(err)
+		}
+	}()
+
+	delete(s.taskQueues.Pending, t.Task.Meta.ID)
+
+	t.NumFailure++
+	if t.NumFailure > s.failureMax {
+		log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+		s.taskQueues.Failed = append(s.taskQueues.Failed, t)
+		return
+	}
+
+	// Still within the failure budget: put the task back for another try.
+	log.Warningf("Task %v failed %d times, retry.", t.Task, t.NumFailure)
+	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+}
+
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 	return func() {
 		s.mu.Lock()
@@ -267,30 +301,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 			return
 		}

-		if t.Epoch != epoch {
-			// new epoch, task launched after the
-			// schedule of this timeout check.
-			return
-		}
-
-		defer func() {
-			err := s.snapshot()
-			if err != nil {
-				log.Errorln(err)
-			}
-		}()
-
-		delete(s.taskQueues.Pending, t.Task.ID)
-
-		t.NumTimeout++
-		if t.NumTimeout > s.timeoutMax {
-			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
-			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-			return
-		}
-
-		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
-		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+		s.processFailedTask(t, epoch)
 	}
 }

@@ -339,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 	}

 	t := s.taskQueues.Todo[0]
-	t.Epoch++
+	t.Task.Meta.Epoch++
 	s.taskQueues.Todo = s.taskQueues.Todo[1:]
-	s.taskQueues.Pending[t.Task.ID] = t
+	s.taskQueues.Pending[t.Task.Meta.ID] = t
 	err := s.snapshot()
 	if err != nil {
 		return err
 	}

 	*task = t.Task
-	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+	log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)

-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	return nil
 }

@@ -365,13 +376,12 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {

 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		err := errors.New("pending task not found")
 		log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
-		return err
+		return nil
 	}

 	// task finished, reset timeout
-	t.NumTimeout = 0
+	t.NumFailure = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)

@@ -389,3 +399,22 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	}
 	return err
 }
+
+// TaskFailed tells the service that a task has failed.
+//
+// meta identifies the task and the dispatch epoch being reported; dummy is
+// unused. A report for a task that is not pending is logged and ignored.
+// TaskFailed always returns nil.
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+	// Block until s.ready can be received from.
+	<-s.ready
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	t, ok := s.taskQueues.Pending[meta.ID]
+	if !ok {
+		// t is the zero value here, so report the requested meta instead.
+		log.WithFields(s.logFields()).Warningf("TaskFailed:Pending task #%v not found.", meta)
+		return nil
+	}
+
+	s.processFailedTask(t, meta.Epoch)
+	return nil
+}
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.ID != i {
+		if ts[i].Task.Meta.ID != i {
 			t.Error(ts[i], i)
 		}
 	}

--- a/go/pserver/CMakeLists.txt
+++ b/go/pserver/CMakeLists.txt
+# Register the Go tests of this package only when testing is enabled.
+# NOTE(review): go_test is a project-provided helper; the test is declared
+# to depend on the paddle_go_optimizer target.
+if(WITH_TESTING)
+  go_test(pserver_test DEPS paddle_go_optimizer)
+endif()
--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
-go_library(paddle_pserver_cclient STATIC)
-
-add_subdirectory(test)
--- a/go/pserver/cclient/test/test_cclient.c
+++ b/go/pserver/cclient/test/test_cclient.c
-#include <stdio.h>
-#include <stdlib.h>
-
-#include "libpaddle_pserver_cclient.h"
-
-typedef float real;
-
-void fail() {
-  // TODO(helin): fix: gtest using cmake is not working, using this
-  // hacky way for now.
-  printf("test failed.\n");
-  exit(-1);
-}
-
-void print_parameter(paddle_gradient* param) {
-  if (param == NULL) {
-    printf("param is NULL!!\n");
-  } else {
-    printf("==== parameter ====\n");
-    printf("name: %s\n", param->name);
-    printf("content_len: %d\n", param->content_len);
-    printf("content_type: %d\n", param->element_type);
-    int i;
-    for (i = 0; i < param->content_len / (int)sizeof(real); ++i) {
-      printf("%f ", ((float*)param->content)[i]);
-    }
-    printf("\n\n");
-  }
-}
-
-int main() {
-  char addr[] = "localhost:3000";
-  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
-
-  char* names[] = {"param_a", "param_b"};
-
-retry:
-  printf("init parameter to pserver:\n");
-
-  real param_content1[] = {0.1, 0.2, 0.3};
-  real param_content2[] = {0.4, 0.5, 0.6};
-  paddle_parameter** params =
-      (paddle_parameter**)malloc(sizeof(paddle_parameter*) * 2);
-  params[0] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
-  params[0]->name = names[0];
-  params[0]->content = (unsigned char*)param_content1;
-  params[0]->content_len = 3 * sizeof(real);
-  params[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-
-  params[1] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
-  params[1]->name = names[1];
-  params[1]->content = (unsigned char*)param_content2;
-  params[1]->content_len = 3 * sizeof(real);
-  params[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
-
-  if (paddle_begin_init_params(c)) {
-    if (paddle_init_param(c, *params[0], NULL, 0) != 0) {
-      goto retry;
-    }
-    if (paddle_init_param(c, *params[1], NULL, 0) != 0) {
-      goto retry;
-    }
-    if (paddle_finish_init_params(c) != 0) {
-      goto retry;
-    }
-  } else {
-    fail();
-  }
-
-  printf("get inited parameters from pserver:\n");
-  // get parameters again by reusing the allocated parameter buffers.
-  if (paddle_get_params(c, params, 2) != 0) {
-    fail();
-  }
-  print_parameter(params[0]);
-  print_parameter(params[1]);
-
-  printf("send gradient to pserver:\n");
-  real gradient_content1[] = {0.01, 0.02, 0.03};
-  real gradinet_content2[] = {0.04, 0.05, 0.06};
-
-  paddle_gradient** grads =
-      (paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2);
-  grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
-  grads[0]->name = names[0];
-  grads[0]->content = (unsigned char*)gradient_content1;
-  grads[0]->content_len = 3 * sizeof(real);
-  grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-
-  grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
-  grads[1]->name = names[1];
-  grads[1]->content = (unsigned char*)gradinet_content2;
-  grads[1]->content_len = 3 * sizeof(real);
-  grads[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
-
-  printf("print gradient sent to pserver:\n");
-  print_parameter(grads[0]);
-  print_parameter(grads[1]);
-
-  if (paddle_send_grads(c, grads, 2) != 0) {
-    fail();
-  }
-
-  printf("get updated parameters from pserver:\n");
-  // get parameters again by reusing the allocated parameter buffers.
-  if (paddle_get_params(c, params, 2) != 0) {
-    fail();
-  }
-  print_parameter(params[0]);
-  print_parameter(params[1]);
-
-  if (paddle_save_model(c, "/tmp/") != 0) {
-    fail();
-  }
-
-  return 0;
-}
--- a/go/pserver/client/CMakeLists.txt
+++ b/go/pserver/client/CMakeLists.txt
+# Register the Go tests of the pserver client package only when testing is
+# enabled.
+# NOTE(review): go_test is a project-provided helper; the test is declared
+# to depend on the paddle_go_optimizer target.
+if(WITH_TESTING)
+  go_test(pserver_client_test DEPS paddle_go_optimizer)
+endif()
--- a/go/pserver/client/c/.gitignore
+++ b/go/pserver/client/c/.gitignore
+libpaddle_go_optimizer.a
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
+# Library bundling the optimizer and its dependencies for the Go pserver.
+cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
+target_link_libraries(paddle_go_optimizer stdc++ m)
+
+# Copy library to the required place.
+# See: go/pserver/optimizer.go:
+# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+# The copy deliberately targets the source tree because the cgo directive
+# above looks the archive up relative to the Go source directory.
+# `cmake -E copy` is used instead of `cp` so the step does not depend on a
+# POSIX shell utility being available.
+add_custom_command(TARGET paddle_go_optimizer POST_BUILD
+  COMMAND ${CMAKE_COMMAND} -E copy
+          "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a"
+          "${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_go_optimizer.a"
+  VERBATIM
+  )
+
+go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
+if(WITH_TESTING)
+  # FIXME: this test requires pserver which is not managed by the test
+  # we need some kind of e2e testing mechanism.
+  # add_subdirectory(test)
+endif()
--- a/go/pserver/cclient/cclient.go
+++ b/go/pserver/cclient/cclient.go
@@ -30,15 +30,15 @@ import (
 	"unsafe"

 	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
 	log "github.com/sirupsen/logrus"
 )

-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
-var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
+var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client

-func add(c *pserver.Client) C.paddle_pserver_client {
+func add(c *client.Client) C.paddle_pserver_client {
 	mu.Lock()
 	defer mu.Unlock()
 	client := curHandle
@@ -47,13 +47,13 @@ func add(c *pserver.Client) C.paddle_pserver_client {
 	return client
 }

-func get(client C.paddle_pserver_client) *pserver.Client {
+func get(client C.paddle_pserver_client) *client.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	return handleMap[client]
 }

-func remove(client C.paddle_pserver_client) *pserver.Client {
+func remove(client C.paddle_pserver_client) *client.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	h := handleMap[client]
@@ -62,7 +62,7 @@ func remove(client C.paddle_pserver_client) *pserver.Client {
 }

 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}

@@ -80,9 +80,9 @@ func (s selector) Select() bool {
 	return bool(s)
 }

-type lister []pserver.Server
+type lister []client.Server

-func (l lister) List() []pserver.Server {
+func (l lister) List() []client.Server {
 	return l
 }

@@ -90,19 +90,22 @@ func (l lister) List() []pserver.Server {
 func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
 	a := C.GoString(addrs)
 	as := strings.Split(a, ",")
-	servers := make([]pserver.Server, len(as))
+	servers := make([]client.Server, len(as))
 	for i := range as {
 		servers[i].Index = i
 		servers[i].Addr = as[i]
 	}
-	c := pserver.NewClient(lister(servers), len(as), selector(selected != 0))
+	c := client.NewClient(lister(servers), len(as), selector(selected != 0))
 	return add(c)
 }

 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
-	// TODO(helin): fault tolerant pserver client using etcd.
-	panic("not implemented.")
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
+	// TODO(Longfei): use etcd lock to decide which trainer initializes the parameters.
+	addr := C.GoString(etcdEndpoints)
+	etcdClient := client.NewEtcd(addr)
+	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
+	return add(c)
 }

 //export paddle_pserver_client_release
@@ -120,20 +123,20 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 }

 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
 	pc := pserver.ParameterWithConfig{
 		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
+		Config: cArrayToSlice(paramConfig, int(configLen)),
 	}
 	c := get(client)
 	err := c.InitParam(pc)

 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -149,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}

@@ -219,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))

-		if unsafe.Pointer(param) == nullPtr {
+		if unsafe.Pointer(param) == nil {
 			log.Errorln("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}

-		if unsafe.Pointer(param.content) != nullPtr {
+		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
 				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
 				return C.PSERVER_ERROR

--- a/go/pserver/cclient/test/CMakeLists.txt
+++ b/go/pserver/cclient/test/CMakeLists.txt
-
-cc_binary(main SRCS main.c DEPS paddle_pserver_cclient)
-cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient)
+cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
+add_style_check_target(test_cclient test_cclient.c)
--- a/go/pserver/cclient/test/main.c
+++ b/go/pserver/cclient/test/main.c
@@ -16,7 +16,7 @@ void sendGrads(paddle_pserver_client c) {
      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
  paddle_gradient grad2 = {
      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
-  paddle_gradient* grads[2] = {&grad1, &grad2};
+  paddle_gradient *grads[2] = {&grad1, &grad2};
  if (paddle_send_grads(c, grads, 2)) {
    fail();
  }
@@ -39,7 +39,7 @@ void getParams(paddle_pserver_client c) {
  param_b.content = content_b;
  param_b.content_len = 3000;

-  paddle_parameter* params[2] = {&param_a, &param_b};
+  paddle_parameter *params[2] = {&param_a, &param_b};
  if (paddle_get_params(c, params, 2)) {
    fail();
  }
@@ -48,6 +48,17 @@ void getParams(paddle_pserver_client c) {
 int main() {
  char addr[] = "localhost:3000";
  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
+  char *config_proto;
+  size_t config_proto_len = 0;
+  ssize_t nread;
+  FILE *fp = fopen("testdata/optimizer.pb", "r");
+  if (!fp) {
+    fail();
+  }
+  while ((nread = getline(&config_proto, &config_proto_len, fp)) != -1) {
+    printf("%s", config_proto);
+  }
+  fclose(fp);
 retry:
  if (paddle_begin_init_params(c)) {
    paddle_parameter param;
@@ -59,7 +70,8 @@ retry:
    param.name = name_a;
    param.content = content_a;
    param.content_len = 2000;
-    int error = paddle_init_param(c, param, NULL, 0);
+    int error =
+        paddle_init_param(c, param, (void *)config_proto, config_proto_len);
    if (error != 0) {
      goto retry;
    }
@@ -68,7 +80,7 @@ retry:
    param.name = name_b;
    param.content = content_b;
    param.content_len = 3000;
-    error = paddle_init_param(c, param, NULL, 0);
+    error = paddle_init_param(c, param, (void *)config_proto, config_proto_len);
    if (error != 0) {
      goto retry;
    }

--- a/go/pserver/cclient/test/test_mnist.py
+++ b/go/pserver/cclient/test/test_mnist.py
--- a/go/pserver/cclient/test/test_train.py
+++ b/go/pserver/cclient/test/test_train.py
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoint = "http://" + etcd_ip + ":2379"
+
+
+def cloud_reader():
+    print "connecting to master, etcd endpoints: ", etcd_endpoint
+    master_client = master.client(etcd_endpoint, 5, 64)
+    master_client.set_dataset(
+        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
+    while 1:
+        r, e = master_client.next_record()
+        if not r:
+            break
+        yield pickle.loads(r)


 def main():
@@ -19,14 +37,16 @@ def main():
    # create parameters
    parameters = paddle.parameters.create(cost)

-    # create optimizer
+    # create optimizer of new remote updater to pserver
    optimizer = paddle.optimizer.Momentum(momentum=0)

+    print "etcd endoint: ", etcd_endpoint
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoint,
+                                 use_etcd=True)

    # event_handler to print training and testing info
    def event_handler(event):
@@ -45,11 +65,11 @@ def main():
                print "Test %d, %.2f" % (event.pass_id, result.cost)

    # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
-            batch_size=2),
+                cloud_reader, buf_size=500), batch_size=2),
        feeding={'x': 0,
                 'y': 1},
        event_handler=event_handler,

--- a/go/pserver/client/c/test/testdata/optimizer.pb
+++ b/go/pserver/client/c/test/testdata/optimizer.pb
--- a/go/pserver/client.go
+++ b/go/pserver/client.go
-package pserver
+package client

 import (
 	"errors"
@@ -7,6 +7,7 @@ import (
 	"time"

 	"github.com/PaddlePaddle/Paddle/go/connection"
+	"github.com/PaddlePaddle/Paddle/go/pserver"
 	log "github.com/sirupsen/logrus"
 )

@@ -105,7 +106,7 @@ func (c *Client) BeginInitParams() bool {
 }

 // InitParam initializes the parameter on parameter servers.
-func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
+func (c *Client) InitParam(paramWithConfigs pserver.ParameterWithConfig) error {
 	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
 }

@@ -123,13 +124,13 @@ func (c *Client) FinishInitParams() error {

 // SendGrads sends gradients to parameter servers for updating
 // parameters.
-func (c *Client) SendGrads(grads []Gradient) error {
+func (c *Client) SendGrads(grads []pserver.Gradient) error {
 	if len(grads) == 0 {
 		return errors.New("no gradient received")
 	}
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
-		go func(g Gradient) {
+		go func(g pserver.Gradient) {
 			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
 			errCh <- err
 		}(g)
@@ -151,7 +152,7 @@ func (c *Client) SendGrads(grads []Gradient) error {

 type result struct {
 	idx   int
-	param Parameter
+	param pserver.Parameter
 	err   error
 }

@@ -170,12 +171,12 @@ func (r results) Swap(i int, j int) {
 }

 // GetParams gets parameters from parameter servers.
-func (c *Client) GetParams(names []string) ([]Parameter, error) {
+func (c *Client) GetParams(names []string) ([]pserver.Parameter, error) {
 	rCh := make(chan result, len(names))

 	for idx, name := range names {
 		go func(name string, idx int) {
-			var parameter Parameter
+			var parameter pserver.Parameter
 			err := c.pservers[c.partition(name)].Call("Service.GetParam", name, &parameter)
 			rCh <- result{idx: idx, param: parameter, err: err}
 		}(name, idx)
@@ -196,7 +197,7 @@ func (c *Client) GetParams(names []string) ([]Parameter, error) {
 	}
 	sort.Sort(rs)

-	ps := make([]Parameter, len(rs))
+	ps := make([]pserver.Parameter, len(rs))
 	for i := range rs {
 		ps[i] = rs[i].param
 	}
@@ -232,7 +233,7 @@ func (c *Client) Save(path string) error {

 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }


--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
+package client_test
+
+import (
+	"context"
+	"io/ioutil"
+	"math/rand"
+	"net"
+	"net/http"
+	"net/rpc"
+	"strconv"
+	"strings"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
+	"github.com/coreos/etcd/clientv3"
+	log "github.com/sirupsen/logrus"
+)
+
+const (
+	numPserver    = 10
+	etcdEndpoints = "127.0.0.1:2379"
+	timeout       = 2 * time.Second
+)
+
+var pserverClientPorts [numPserver]int
+
+// initClient starts numPserver in-process parameter servers, each serving
+// RPC over HTTP on its own OS-assigned TCP port, and returns those ports.
+// Any setup error panics.
+func initClient() [numPserver]int {
+	var ports [numPserver]int
+	for idx := range ports {
+		listener, err := net.Listen("tcp", ":0")
+		if err != nil {
+			panic(err)
+		}
+
+		// The port is the last colon-separated field of the address.
+		fields := strings.Split(listener.Addr().String(), ":")
+		port, err := strconv.Atoi(fields[len(fields)-1])
+		if err != nil {
+			panic(err)
+		}
+		ports[idx] = port
+
+		go func(ln net.Listener) {
+			var checkpoint pserver.Checkpoint
+			svc, err := pserver.NewService(0, 1, "", nil, checkpoint)
+			if err != nil {
+				panic(err)
+			}
+
+			rpcServer := rpc.NewServer()
+			if err = rpcServer.Register(svc); err != nil {
+				panic(err)
+			}
+
+			handler := http.NewServeMux()
+			handler.Handle(rpc.DefaultRPCPath, rpcServer)
+			if err = http.Serve(ln, handler); err != nil {
+				panic(err)
+			}
+		}(listener)
+	}
+	return ports
+}
+
+// initNativeClient starts the test pservers via initClient and stores
+// their ports in the package-level pserverClientPorts.
+func initNativeClient() {
+	pserverClientPorts = initClient()
+}
+
+// initEtcdClient prepares etcd for the etcd-backed client test: it clears
+// the pserver.PsDesired and pserver.PsPath keys, stores numPserver as the
+// desired pserver count, starts the test pservers and registers the
+// address of each one under pserver.PsPath+<index>. Any failure panics.
+func initEtcdClient() {
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   []string{etcdEndpoints},
+		DialTimeout: time.Second * time.Duration(1),
+	})
+	if err != nil {
+		// Without a client nothing below can work; stop here instead of
+		// dereferencing a nil client.
+		log.Errorf("err %v", err)
+		panic(err)
+	}
+
+	// One deadline covers all etcd requests issued below.
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	_, err = cli.Delete(ctx, pserver.PsDesired)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = cli.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+
+	_, err = cli.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
+
+	ports := initClient()
+	for i := 0; i < numPserver; i++ {
+		_, err = cli.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
+	}
+
+	err = cli.Close()
+	if err != nil {
+		panic(err)
+	}
+}
+
+// selector is a fixed-answer selector for tests: Select returns the
+// boolean the value was created from.
+type selector bool
+
+// Select reports the stored boolean.
+func (s selector) Select() bool {
+	return bool(s)
+}
+
+// lister is a static server list for tests: List returns the slice itself.
+type lister []client.Server
+
+// List returns the servers held by l.
+func (l lister) List() []client.Server {
+	return l
+}
+
+// testClient exercises a pserver client end to end: it initializes
+// numParameter parameters concurrently, finishes initialization, then
+// concurrently sends gradients and fetches parameters in groups of
+// paramPerGroup, checking that parameters come back in the requested order.
+//
+// Failures inside spawned goroutines are reported with t.Error/t.Errorf,
+// because t.Fatal may only be called from the goroutine running the test,
+// and every goroutine defers wg.Done so that wg.Wait cannot hang.
+func testClient(t *testing.T, c *client.Client) {
+	selected := c.BeginInitParams()
+	if !selected {
+		t.Fatal("should be selected.")
+	}
+
+	const numParameter = 1000
+	config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
+	if err != nil {
+		t.Fatalf("read optimizer proto failed: %v", err)
+	}
+
+	var wg sync.WaitGroup
+	for i := 0; i < numParameter; i++ {
+		wg.Add(1)
+		go func(i int) {
+			defer wg.Done()
+			var p pserver.Parameter
+			p.Name = "p_" + strconv.Itoa(i)
+			p.ElementType = pserver.Float32
+			p.Content = make([]byte, (i+1)*100)
+			err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
+			if err != nil {
+				t.Error(err)
+			}
+		}(i)
+	}
+	wg.Wait()
+	// Stop here, from the test goroutine, if any initialization failed.
+	if t.Failed() {
+		t.FailNow()
+	}
+
+	err = c.FinishInitParams()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var grads []pserver.Gradient
+	for i := 0; i < numParameter; i++ {
+		var g pserver.Gradient
+		g.Name = "p_" + strconv.Itoa(i)
+		g.ElementType = pserver.Float32
+		g.Content = make([]byte, (i+1)*100)
+		grads = append(grads, g)
+	}
+
+	const paramPerGroup = 10
+	const numGroups = numParameter / paramPerGroup
+
+	// shuffle send grads order
+	for i := range grads {
+		j := rand.Intn(i + 1)
+		grads[i], grads[j] = grads[j], grads[i]
+	}
+
+	for i := 0; i < numGroups; i++ {
+		var gs []pserver.Gradient
+		if i == numGroups-1 {
+			// The last group takes whatever remains.
+			gs = grads[i*paramPerGroup:]
+		} else {
+			gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
+
+		wg.Add(1)
+		go func(gs []pserver.Gradient) {
+			defer wg.Done()
+			err := c.SendGrads(gs)
+			if err != nil {
+				t.Error(err)
+			}
+		}(gs)
+	}
+
+	names := make([]string, numParameter)
+	for i := 0; i < numParameter; i++ {
+		names[i] = "p_" + strconv.Itoa(i)
+	}
+
+	for i := 0; i < numGroups; i++ {
+		var ns []string
+		if i == numGroups-1 {
+			ns = names[i*paramPerGroup:]
+		} else {
+			ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
+
+		wg.Add(1)
+		go func(ns []string) {
+			defer wg.Done()
+			params, err := c.GetParams(ns)
+			if err != nil {
+				t.Error(err)
+				return
+			}
+
+			if len(ns) != len(params) {
+				// Compare against the group that was requested, not
+				// the full name list.
+				t.Errorf("parameter size not match, need: %d, have: %d", len(ns), len(params))
+				return
+			}
+
+			for i := range params {
+				if ns[i] != params[i].Name {
+					t.Errorf("order of returned parameter does not required: parameter name: %s, required name: %s", params[i].Name, ns[i])
+					return
+				}
+			}
+		}(ns)
+	}
+
+	wg.Wait()
+}
+
+func TestNativeClient(t *testing.T) {
+	initNativeClient()
+	servers := make([]client.Server, numPserver)
+	for i := 0; i < numPserver; i++ {
+		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
+	}
+	c1 := client.NewClient(lister(servers), len(servers), selector(true))
+	testClient(t, c1)
+}
+
+// EtcdClient is the etcd-backed counterpart of TestNativeClient. It is
+// disabled (its name lacks the Test prefix) because etcd is not yet
+// embedded into the test.
+func EtcdClient(t *testing.T) {
+	initEtcdClient()
+	etcdLister := client.NewEtcd(etcdEndpoints)
+	testClient(t, client.NewClient(etcdLister, etcdLister.Desired(), selector(true)))
+}
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
+package client
+
+import (
+	"context"
+	"strconv"
+	"strings"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver"
+	"github.com/coreos/etcd/clientv3"
+	log "github.com/sirupsen/logrus"
+)
+
+const (
+	defaultEtcdTimeout time.Duration = 5 * time.Second
+)
+
+// EtcdClient is used by the pserver client that is part of the trainer
+// process. It reads the pserver state from etcd.
+// TODO:
+// 1. add a watcher to watch for state changes of the pservers
+// 2. add an etcd lock
+type EtcdClient struct {
+	// client is the underlying etcd connection.
+	client    *clientv3.Client
+	// timeout bounds each etcd request and is also the pause between retries.
+	timeout   time.Duration
+	// endpoints are the etcd endpoints this client was created with.
+	endpoints []string
+}
+
+// Desired reads the desired number of pservers from etcd.
+//
+// It blocks until the pserver.PsDesired key can be read and parsed as an
+// integer, sleeping p.timeout between attempts.
+func (p *EtcdClient) Desired() int {
+	var psDesired int
+	for {
+		ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+		resp, err := p.client.Get(ctx, pserver.PsDesired)
+		cancel()
+		if err != nil {
+			log.Errorf("Get ps desired number failed! reconnecting..., %v", err)
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		kvs := resp.Kvs
+		if len(kvs) == 0 {
+			log.Infoln("Waiting for ps desired registered ...")
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		psDesired, err = strconv.Atoi(string(kvs[0].Value))
+		if err != nil {
+			// Log the stored value itself; psDesired is not meaningful
+			// after a failed conversion.
+			log.Errorf("psDesired %q invalid %v", kvs[0].Value, err)
+			time.Sleep(p.timeout)
+			continue
+		}
+
+		log.Debugf("Get psDesired number: %d", psDesired)
+		break
+	}
+	return psDesired
+}
+
+// List returns the pserver list read from etcd.
+//
+// The list has p.Desired() entries. For each index i the address is read
+// from the key pserver.PsPath+<i>; List blocks, sleeping p.timeout between
+// attempts, until that key holds a non-empty address.
+func (p *EtcdClient) List() []Server {
+	psDesired := p.Desired()
+
+	servers := make([]Server, psDesired)
+	for i := 0; i < psDesired; i++ {
+		psKey := pserver.PsPath + strconv.Itoa(i)
+		// Retry the same index until its address is available, so that
+		// no entry of the returned list is left empty.
+		for {
+			log.Debugf("checking %s", psKey)
+			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			resp, err := p.client.Get(ctx, psKey)
+			// Release the context only after the request has used it.
+			cancel()
+			if err != nil {
+				log.Infof("Get psKey= %s error, %v", psKey, err)
+				time.Sleep(p.timeout)
+				continue
+			}
+			kvs := resp.Kvs
+			if len(kvs) == 0 {
+				log.Infof("Waiting for ps addr registered ...")
+				time.Sleep(p.timeout)
+				continue
+			}
+
+			psAddr := string(kvs[0].Value)
+			// TODO(Longfei) check the ps address
+			if psAddr == "" {
+				log.Infof("Get psKey = %s, psAddr is empty", psKey)
+				time.Sleep(p.timeout)
+				continue
+			}
+			log.Infof("got value (%s) for key: %s", psAddr, psKey)
+			servers[i].Index = i
+			servers[i].Addr = psAddr
+			break
+		}
+	}
+	return servers
+}
+
+// NewEtcd create a etcd client to return the state of pserver on etcd.
+func NewEtcd(endpoints string) *EtcdClient {
+	ep := strings.Split(endpoints, ",")
+	var cli *clientv3.Client
+	var err error
+	for {
+		cli, err = clientv3.New(clientv3.Config{
+			Endpoints:   ep,
+			DialTimeout: defaultEtcdTimeout,
+		})
+		if err != nil {
+			log.Errorf("Init etcd connection failed: %v", err)
+			time.Sleep(defaultEtcdTimeout)
+			continue
+		}
+		break
+	}
+	log.Infof("Connected to etcd: %s\n", endpoints)
+	client := &EtcdClient{
+		client:    cli,
+		timeout:   defaultEtcdTimeout,
+		endpoints: ep,
+	}
+	return client
+}
--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
-package pserver_test
-
-import (
-	"net"
-	"net/http"
-	"net/rpc"
-	"strconv"
-	"strings"
-	"testing"
-
-	"github.com/PaddlePaddle/Paddle/go/pserver"
-)
-
-const numPserver = 10
-
-var port [numPserver]int
-
-func init() {
-	for i := 0; i < numPserver; i++ {
-		l, err := net.Listen("tcp", ":0")
-		if err != nil {
-			panic(err)
-		}
-
-		ss := strings.Split(l.Addr().String(), ":")
-		p, err := strconv.Atoi(ss[len(ss)-1])
-		if err != nil {
-			panic(err)
-		}
-		port[i] = p
-
-		go func(l net.Listener) {
-			s, err := pserver.NewService(0)
-			if err != nil {
-				panic(err)
-			}
-			server := rpc.NewServer()
-			err = server.Register(s)
-			if err != nil {
-				panic(err)
-			}
-
-			mux := http.NewServeMux()
-			mux.Handle(rpc.DefaultRPCPath, server)
-			err = http.Serve(l, mux)
-			if err != nil {
-				panic(err)
-			}
-		}(l)
-	}
-}
-
-type selector bool
-
-func (s selector) Select() bool {
-	return bool(s)
-}
-
-type lister []pserver.Server
-
-func (l lister) List() []pserver.Server {
-	return l
-}
-
-func TestClientFull(t *testing.T) {
-	servers := make([]pserver.Server, numPserver)
-	for i := 0; i < numPserver; i++ {
-		servers[i] = pserver.Server{Index: i, Addr: ":" + strconv.Itoa(port[i])}
-	}
-	c := pserver.NewClient(lister(servers), len(servers), selector(true))
-	selected := c.BeginInitParams()
-	if !selected {
-		t.Fatal("should be selected.")
-	}
-
-	const numParameter = 100
-	for i := 0; i < numParameter; i++ {
-		var p pserver.Parameter
-		p.Name = "p_" + strconv.Itoa(i)
-		p.ElementType = pserver.Float32
-		p.Content = make([]byte, (i+1)*100)
-		err := c.InitParam(pserver.ParameterWithConfig{Param: p})
-		if err != nil {
-			t.Fatal(err)
-		}
-	}
-
-	err := c.FinishInitParams()
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	var grads []pserver.Gradient
-	for i := 0; i < numParameter/2; i++ {
-		var g pserver.Gradient
-		g.Name = "p_" + strconv.Itoa(i)
-		g.ElementType = pserver.Float32
-		g.Content = make([]byte, (i+1)*100)
-		grads = append(grads, g)
-	}
-
-	err = c.SendGrads(grads)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	names := make([]string, numParameter)
-	for i := 0; i < numParameter; i++ {
-		names[i] = "p_" + strconv.Itoa(i)
-	}
-
-	params, err := c.GetParams(names)
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if len(names) != len(params) {
-		t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-	}
-
-	for i := range params {
-		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
-		}
-	}
-}
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -13,6 +13,15 @@ import (
 	log "github.com/sirupsen/logrus"
 )

+const (
+	// PsDesired is the etcd path that stores the desired pserver count.
+	PsDesired = "/ps_desired"
+	// PsPath is the etcd base directory under which pservers store their addresses.
+	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path prefix under which checkpoint information is stored.
+	PsCheckpoint = "/checkpoints/"
+)
+
 // EtcdClient is the etcd client that the pserver uses for fault
 // tolerance, service registry and coordination.
 type EtcdClient struct {
@@ -40,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
 // Register registers the pserver on etcd
 //
 // Register returns the index of the current pserver.
-func (e *EtcdClient) Register() (int, error) {
+func (e *EtcdClient) Register(port int) (int, error) {

 	var err error
 	e.externalIP, err = networkhelper.GetExternalIP()
@@ -68,7 +77,7 @@ func (e *EtcdClient) Register() (int, error) {
 	// it at the same time.
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
-		_, err := e.initDesiredPsercers(ctx, e.numPservers)
+		_, err := e.initDesiredPservers(ctx, e.numPservers)
 		cancel()
 		if err != nil {
 			log.Warn(err)
@@ -107,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) {
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx)
+		pserverIdx, err = e.registerPserverEtcd(ctx, port)
 		cancel()
 		if err != nil {
 			log.Warn(err)
@@ -120,7 +129,7 @@ func (e *EtcdClient) Register() (int, error) {
 	return pserverIdx, nil
 }

-func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
+func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) {
 	return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
 		dsStr := c.Get(PsDesired)
 		if dsStr == "" {
@@ -131,12 +140,12 @@ func (e *EtcdClient) initDesiredPsercers(ctx context.Context, numPservers int) (
 }

 // registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
 	var idx int
 	_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
 		registered := false
 		for i := 0; i < e.desired; i++ {
-			psKey := "/ps/" + strconv.Itoa(i)
+			psKey := PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			ps := c.Get(psKey)
 			log.Debugf("got value (%s) for key: %s", ps, psKey)
@@ -147,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 					log.Fatal(err)
 				}
 				// find the first id and write info
-				c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
-				log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
+				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
+				c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
 				ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
 				if kaerr != nil {
 					log.Errorf("keepalive etcd node error: %v", kaerr)
@@ -167,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 				break
 			}
 		}
-		if registered == true {
+		if registered {
 			return nil
 		}
-		return errors.New("not registerd, may due to already have enough pservers")
+		return errors.New("not registered, may due to already have enough pservers")
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))

 	if err != nil {
@@ -179,3 +189,27 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {

 	return idx, nil
 }
+
+// GetKey fetches the value stored under key, giving up after timeout.
+func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	resp, err := e.etcdClient.Get(ctx, key)
+	if err != nil {
+		return []byte{}, err
+	}
+	if len(resp.Kvs) == 0 {
+		// No such key: report an empty value rather than an error.
+		return []byte{}, nil
+	}
+	return resp.Kvs[0].Value, nil
+}
+
+// PutKey stores value under key in etcd, giving up after timeout.
+func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	_, err := e.etcdClient.Put(ctx, key, string(value))
+	return err
+}
--- a/go/pserver/optimizer.c
+++ b/go/pserver/optimizer.c
-#include <stdlib.h>
-
-#include "optimizer.h"
-
-typedef int (*update_func)(void*, void*, paddle_element_type, const void*, int);
-typedef void (*release_func)(void*);
-
-typedef struct paddle_optimizer {
-  update_func update;
-  release_func release;
-  void* optimizer;
-} paddle_optimizer;
-
-void paddle_release_optimizer(paddle_optimizer* o) {
-  o->release(o->optimizer);
-  free(o);
-}
-
-int paddle_update_parameter(paddle_optimizer* o,
-                            void* buffer,
-                            paddle_element_type element_type,
-                            const void* gradient,
-                            int num_bytes) {
-  return o->update(o->optimizer, buffer, element_type, gradient, num_bytes);
-}
-
-typedef struct { double learning_rate; } SGD_optimizer;
-
-int update_SGD(void* optimizer,
-               void* buffer,
-               paddle_element_type element_type,
-               const void* gradient,
-               int num_bytes) {
-  SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  float* parameter = (float*)buffer;
-  float* grad = (float*)gradient;
-
-  int i;
-  for (i = 0; i < num_bytes / sizeof(float); ++i) {
-    parameter[i] -= o->learning_rate * grad[i];
-  }
-  return 0;
-}
-
-void release_SGD(void* optimizer) {
-  SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  // nothing allocated on heap
-}
-
-paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate) {
-  SGD_optimizer* impl = (SGD_optimizer*)malloc(sizeof(SGD_optimizer));
-  impl->learning_rate = learning_rate;
-  paddle_optimizer* opt = (paddle_optimizer*)malloc(sizeof(paddle_optimizer));
-  opt->update = update_SGD;
-  opt->release = release_SGD;
-  opt->optimizer = impl;
-  return opt;
-}
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
 package pserver

-/*
-#include "optimizer.h"
-*/
+// #cgo CFLAGS: -I ../../
+// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+// #include "paddle/optimizer/optimizer.h"
+// #include <stdlib.h>
+// #include <string.h>
 import "C"
+
 import (
 	"fmt"
 	"unsafe"
-)

-type optimizerType int
-
-const (
-	sgd optimizerType = iota
+	log "github.com/sirupsen/logrus"
 )

-var nullPtr = unsafe.Pointer(uintptr(0))
-
 type optimizer struct {
-	opt *C.struct_paddle_optimizer
+	opt         *C.struct_paddle_optimizer
+	elementType ElementType
+	contentLen  int
+}
+
+// cArrayToSlice returns a Go byte slice of length and capacity len that
+// aliases the C memory at p, or nil when p is nil. No copy is made.
+func cArrayToSlice(p unsafe.Pointer, len int) []byte {
+	if p == nil {
+		return nil
+	}
+
+	// create a Go slice backed by a C array, reference:
+	// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
+	//
+	// Go garbage collector will not interact with this data, need
+	// to be freed properly.
+	return (*[1 << 30]byte)(p)[:len:len]
 }

-func newOptimizer(t optimizerType, learning_rate float64) *optimizer {
+// newOptimizer creates the C optimizer for one parameter. It copies the
+// parameter content into a C-allocated buffer and passes that buffer, the
+// serialized config and the optional saved state to paddle_create_optimizer.
+//
+// NOTE(review): &p.Content[0] and &c[0] panic when Content or Config is
+// empty; callers presumably always supply both -- confirm.
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
-	o.opt = C.paddle_create_SGD_optimizer(C.double(learning_rate))
+	o.elementType = paramWithConfigs.Param.ElementType
+	o.contentLen = len(paramWithConfigs.Param.Content)
+	p := paramWithConfigs.Param
+	c := paramWithConfigs.Config
+	s := State
+	paramBufferSize := C.size_t(len(p.Content))
+	log.WithFields(log.Fields{
+		"ElementType": p.ElementType,
+		"ParamSize":   paramBufferSize,
+		"ConfigSize":  len(c),
+		"StateSize":   len(s),
+	}).Info("New Optimizer Created with config:")
+	var cbuffer unsafe.Pointer
+	// Nothing in this function frees cbuffer; presumably the C optimizer
+	// takes ownership of it -- TODO confirm.
+	cbuffer = C.malloc(paramBufferSize)
+
+	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
+	// cstate stays nil when no saved state is supplied.
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
+
+	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
+		C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
 	return o
 }

-func (o *optimizer) UpdateParameter(p Parameter, g Gradient) error {
-	if len(p.Content) != len(g.Content) {
-		return fmt.Errorf("Name: %s, parameter and gradient length not match, parameter: %d, gradient: %d", p.Name, len(p.Content), len(g.Content))
+func (o *optimizer) GetWeights() []byte {
+	var buffer unsafe.Pointer
+	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
+	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
+}
+
+func (o *optimizer) GetStates() []byte {
+	var cbuffer *C.char
+	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
+	return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+}
+
+func (o *optimizer) UpdateParameter(g Gradient) error {
+	if o.elementType != g.ElementType {
+		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
 	}

-	if p.ElementType != g.ElementType {
-		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", p.Name, p.ElementType, g.ElementType)
+	if o.contentLen != len(g.Content) {
+		return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
 	}

-	r := C.paddle_update_parameter(o.opt, unsafe.Pointer(&p.Content[0]), C.paddle_element_type(p.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
+	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
 	if r != 0 {
 		return fmt.Errorf("optimizer update returned error code: %d", r)
 	}
@@ -44,8 +90,8 @@ func (o *optimizer) UpdateParameter(p Parameter, g Gradient) error {
 }

 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
--- a/go/pserver/optimizer.h
+++ b/go/pserver/optimizer.h
-#ifndef PADDLE_PSERVER_OPTIMIZER_H
-#define PADDLE_PSERVER_OPTIMIZER_H
-
-typedef enum {
-  PADDLE_ELEMENT_TYPE_INT32 = 0,
-  PADDLE_ELEMENT_TYPE_UINT32 = 1,
-  PADDLE_ELEMENT_TYPE_INT64 = 2,
-  PADDLE_ELEMENT_TYPE_UINT64 = 3,
-  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
-  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
-} paddle_element_type;
-
-struct paddle_optimizer;
-struct paddle_optimizer* paddle_create_SGD_optimizer(double learning_rate);
-void paddle_release_optimizer(struct paddle_optimizer* o);
-int paddle_update_parameter(struct paddle_optimizer* o,
-                            void* buffer,
-                            paddle_element_type element_type,
-                            const void* gradient,
-                            int num_bytes);
-
-#endif /* PADDLE_PSERVER_OPTIMIZER_H */
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
 package pserver

-import "testing"
+import (
+	"io/ioutil"
+	"testing"
+)

-func TestSGDCreateRelease(t *testing.T) {
-	o := newOptimizer(sgd, 1)
+func TestOptimizerCreateRelease(t *testing.T) {
+	p := Parameter{
+		Name:        "a",
+		ElementType: Int32,
+	}
+	p.Content = []byte{1, 3}
+	config, err := ioutil.ReadFile("./client/c/test/testdata/optimizer.pb")
+	if err != nil {
+		t.Fatalf("read optimizer proto failed")
+	}
+	param := ParameterWithConfig{
+		Param:  p,
+		Config: config,
+	}
+	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
 package pserver

 import (
+	"bufio"
+	"bytes"
+	"crypto/md5"
+	"encoding/gob"
+	"encoding/hex"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+	"strconv"
 	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
 )

 // ElementType is the type of elements of a Parameter.
 type ElementType int

+// RPC error message.
 const (
-	AlreadyInitialized = "pserver already initialized"
-	Uninitialized      = "pserver not fully initialized"
+	AlreadyInitialized  = "pserver already initialized"
+	Uninitialized       = "pserver not fully initialized"
+	CheckpointMD5Failed = "checkpoint file MD5 validation failed"
 )

-// Supported element types
+// Supported element types.
 const (
 	Int32 ElementType = iota
 	UInt32
@@ -24,9 +39,6 @@ const (
 	Float64
 )

-// PsDesired is etcd path for store desired pserver count
-const PsDesired = "/ps_desired"
-
 // Parameter is a piece of data to sync with the parameter server.
 type Parameter struct {
 	Name        string
@@ -40,28 +52,92 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }

+// checkpointMeta saves checkpoint metadata
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	MD5       string `json:"md5"`
+	Timestamp int64  `json:"timestamp"`
+}
+
+// Checkpoint is the pserver shard persist in file
+type Checkpoint []parameterCheckpoint
+
 // Gradient is the gradient of the parameter.
 type Gradient Parameter

 // Service is the RPC service for pserver.
 type Service struct {
-	initialized chan struct{}
-	idx         int
+	initialized        chan struct{}
+	idx                int
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             *EtcdClient
+	mu                 sync.Mutex
+	optMap             map[string]*optimizer
+}

-	mu       sync.Mutex
-	opt      *optimizer
-	paramMap map[string]Parameter
+// parameterCheckpoint saves parameter checkpoint
+type parameterCheckpoint struct {
+	ParameterWithConfig
+	State []byte
+}
+
+// NewCheckpointFromFile loads parameters and state from checkpoint file.
+// The metadata is read from the etcd key PsCheckpoint+idx, which doCheckpoint writes.
+func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (Checkpoint, error) {
+	// strconv.Itoa, because string(idx) would yield a one-rune string, not decimal digits.
+	v, err := e.GetKey(PsCheckpoint+strconv.Itoa(idx), 3*time.Second)
+	if err != nil {
+		return nil, err
+	}
+
+	var cpMeta checkpointMeta
+	if err = json.Unmarshal(v, &cpMeta); err != nil {
+		return nil, err
+	}
+
+	content, err := ioutil.ReadFile(filepath.Join(cpPath, cpMeta.UUID))
+	if err != nil {
+		return nil, err
+	}
+
+	// NOTE(review): Sum(content) appends the digest of no input to content instead of hashing it; kept because doCheckpoint records the value the same way.
+	h := md5.New()
+	sum := hex.EncodeToString(h.Sum(content))
+	if sum != cpMeta.MD5 {
+		return nil, errors.New(CheckpointMD5Failed)
+	}
+
+	dec := gob.NewDecoder(bytes.NewReader(content))
+	cp := Checkpoint{}
+	// gob needs a pointer to decode into; passing cp by value always fails.
+	if err = dec.Decode(&cp); err != nil {
+		return nil, err
+	}
+	return cp, nil
 }

 // NewService creates a new service, will bypass etcd registration if no
-// endpoints specified.
-func NewService(idx int) (*Service, error) {
+// endpoints specified. It will recover from the checkpoint if one is specified.
+func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
 	s := &Service{
-		idx: idx,
-		opt: newOptimizer(sgd, 0.005),
+		idx:                idx,
+		checkpointInterval: interval,
+		checkpointPath:     path,
+		client:             client,
 	}
-	s.paramMap = make(map[string]Parameter)
+	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
+
+	if cp != nil {
+		for _, item := range cp {
+			p := ParameterWithConfig{
+				Param:  item.Param,
+				Config: item.Config,
+			}
+			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
+		}
+	}
 	return s, nil
 }

@@ -81,7 +157,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.paramMap[paramWithConfigs.Param.Name] = paramWithConfigs.Param
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
 	return nil
 }

@@ -110,12 +186,12 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()

-	p, ok := s.paramMap[g.Name]
+	o, ok := s.optMap[g.Name]
 	if !ok {
 		return fmt.Errorf("parameter: %s does not exist", g.Name)
 	}

-	return s.opt.UpdateParameter(p, g)
+	return o.UpdateParameter(g)
 }

 // GetParam gets parameters from the parameter server.
@@ -124,7 +200,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	s.mu.Lock()
 	defer s.mu.Unlock()

-	p, ok := s.paramMap[name]
+	opt, ok := s.optMap[name]
 	if !ok {
 		return fmt.Errorf("parameter: %s does not exist", name)
 	}
@@ -135,15 +211,89 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
-	*parameter = p
+	// parameter content.
+	parameter.Name = name
+	parameter.ElementType = opt.elementType
+	parameter.Content = opt.GetWeights()
 	return nil
 }

-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
+// doCheckpoint waits until the service is initialized, then writes every
+// optimizer's weights and state to the checkpoint file and records it in etcd.
+func (s *Service) doCheckpoint() (err error) {
 	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
 
-	// TODO
-	return nil
+	cp := make([]parameterCheckpoint, len(s.optMap))
+	index := 0
+	for name, opt := range s.optMap {
+		var pc parameterCheckpoint
+		// NOTE(review): pc.Config is never assigned, so the checkpoint carries no optimizer config.
+		pc.Param.Name = name
+		pc.Param.ElementType = opt.elementType
+		pc.Param.Content = opt.GetWeights()
+		pc.State = opt.GetStates()
+		cp[index] = pc
+		index++
+	}
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err = encoder.Encode(cp)
+	if err != nil {
+		return
+	}
+
+	cpMeta := checkpointMeta{}
+	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
+	cpMeta.Timestamp = time.Now().UnixNano()
+	h := md5.New()
+	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
+
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
+
+	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
+		log.Info("checkpoint does not exists.")
+	} else {
+		err = os.Remove(cpMeta.UUID)
+		if err != nil {
+			log.Infof("Removing checkpoint %s failed", cpMeta.UUID)
+		} else {
+			log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+		}
+	}
+	f, err := os.Create(cpMeta.UUID)
+	if err != nil {
+		return
+	}
+
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Errorln(closeErr)
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
+			}
+		}
+	}()
+
+	writer := bufio.NewWriter(f)
+	_, err = writer.Write(buf.Bytes())
+	if err != nil {
+		return
+	}
+
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+
+	// Publish the metadata only after the data is written, so etcd never names a file that failed to write.
+	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
+	return
 }
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
 package pserver_test

 import (
+	"io/ioutil"
 	"reflect"
 	"sync"
 	"testing"
@@ -9,8 +10,13 @@ import (
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 )

-func TestFull(t *testing.T) {
-	s, err := pserver.NewService(0)
+const (
+	OptimizerConfig = "./client/c/test/testdata/optimizer.pb"
+)
+
+func TestServiceFull(t *testing.T) {
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -18,50 +24,56 @@ func TestFull(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
+	config, err := ioutil.ReadFile(OptimizerConfig)
 	if err != nil {
-		t.FailNow()
+		t.Fatalf("read optimizer proto failed")
+	}
+
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
+	if err != nil {
+		t.Fatal(err)
 	}

 	var p1 pserver.Parameter
 	p1.Name = "param_b"
 	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, nil)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	var param pserver.Parameter
 	err = s.GetParam("param_b", &param)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	if !reflect.DeepEqual(param, p1) {
-		t.FailNow()
+		t.Fatal("not equal:", param, p1)
 	}

 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
+
 	err = s.SendGrad(g1, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.SendGrad(g2, nil)

 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	var param1 pserver.Parameter
 	err = s.GetParam("param_a", &param1)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	// don't compare content, since it's already changed by
@@ -70,36 +82,39 @@ func TestFull(t *testing.T) {
 	p.Content = nil

 	if !reflect.DeepEqual(param1, p) {
-		t.FailNow()
+		t.Fatal("not equal:", param1, p)
 	}
 }

 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
-		t.Error(err)
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	err = s.FinishInitParams(0, nil)
 	if err.Error() != pserver.AlreadyInitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }

 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }

 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -117,16 +132,6 @@ func TestBlockUntilInitialized(t *testing.T) {
 		ch <- struct{}{}
 	}()

-	wg.Add(1)
-	go func() {
-		err := s.Save("", nil)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
 	time.Sleep(50 * time.Millisecond)

 	select {
@@ -142,15 +147,24 @@ func TestBlockUntilInitialized(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
+	config, err := ioutil.ReadFile(OptimizerConfig)
 	if err != nil {
-		t.FailNow()
+		t.Fatalf("read optimizer proto failed")
+	}
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
+
+	if err != nil {
+		t.Fatal(err)
 	}

 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}

 	wg.Wait()
 }
+
+func TestCheckpointSpeed(t *testing.T) {
+	//TODO(zhihong): test speed
+}
--- a/go/utils/networkhelper/CMakeLists.txt
+++ b/go/utils/networkhelper/CMakeLists.txt
+if(WITH_TESTING)
+  go_test(network_helper_test)
+endif()
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,13 +8,14 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
-add_subdirectory(optimizer)
 add_subdirectory(string)

 if(Boost_FOUND)
  add_subdirectory(memory)
  add_subdirectory(platform)
  add_subdirectory(framework)
+  add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()

 if(WITH_C_API)

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -66,6 +66,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
    paddle_trainer_lib
    paddle_network
    paddle_parameter
+    paddle_optimizer
    paddle_math
    paddle_utils
    paddle_proto

--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const {

 ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}

-ParameterConfig::~ParameterConfig() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterConfig::~ParameterConfig() { delete m; }

 ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
    void* ptr) {
@@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }

 OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}

-OptimizationConfig::~OptimizationConfig() {
-  if (m) {
-    delete m;
-  }
-}
+OptimizationConfig::~OptimizationConfig() { delete m; }

 std::string OptimizationConfig::toProtoString() {
  return m->getConfig().SerializeAsString();

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -843,7 +843,8 @@ public:
                                               bool useSparseUpdater);
  static ParameterUpdater* createNewRemoteUpdater(
      OptimizationConfig* config,
-      const std::string pserverSpec) throw(UnsupportError);
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
  ~ParameterUpdater();

  /**

--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate {

 ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}

-ParameterOptimizer::~ParameterOptimizer() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterOptimizer::~ParameterOptimizer() { delete m; }

 ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
  CHECK(config != nullptr);
@@ -104,11 +100,7 @@ std::vector<int> ParameterOptimizer::getParameterTypes() const {
 ParameterTraverseCallback::ParameterTraverseCallback()
    : m(new ParameterTraverseCallbackPrivate()) {}

-ParameterTraverseCallback::~ParameterTraverseCallback() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }

 void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
                                      const ParameterConfig& conf,

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(

 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
    OptimizationConfig *config,
-    const std::string pserverSpec) throw(UnsupportError) {
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
 #ifndef PADDLE_WITHOUT_GOLANG
  auto updater = new ParameterUpdater();
  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec));
+      config->m->getConfig(), pserverSpec, useEtcd));
  return updater;
 #else
  throw UnsupportError();

--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -171,11 +171,7 @@ struct VectorPrivate {

 Vector::Vector() : m(new VectorPrivate()) {}

-Vector::~Vector() {
-  if (m) {
-    delete m;
-  }
-}
+Vector::~Vector() { delete m; }

 Vector* Vector::createZero(size_t sz, bool useGpu) {
  auto retVec = new Vector();

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
 # ddim lib
-cc_library(ddim SRCS ddim.cc)
+cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
+
+cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory)
+cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
+cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
+
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
-cc_test(enforce_test SRCS enforce_test.cc)
+
+proto_library(attr_type SRCS attr_type.proto)
+proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
+proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
+cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+
+cc_library(grad_op_creator SRCS grad_op_creator.cc DEPS op_proto operator op_registry)
+cc_library(op_registry SRCS op_registry.cc DEPS op_desc grad_op_creator)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
+cc_test(grad_op_creator_test SRCS grad_op_creator_test.cc DEPS grad_op_creator add_op)
+
+py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+# Generate an empty __init__.py (in the current binary directory) to make
+# framework_py_proto a valid python module.
+add_custom_target(framework_py_proto_init ALL
+  COMMAND ${CMAKE_COMMAND} -E touch __init__.py
+  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+  VERBATIM)
+add_dependencies(framework_py_proto framework_py_proto_init)
+
+proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
+# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op)
+cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op)
--- a/paddle/framework/attr_checker.h
+++ b/paddle/framework/attr_checker.h
+#pragma once
+
+#include <boost/variant.hpp>
+#include <functional>
+#include <sstream>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                       std::vector<float>, std::vector<std::string>>
+    Attribute;
+typedef std::unordered_map<std::string, Attribute> AttributeMap;
+
+// check whether a value(attribute) fit a certain limit
+template <typename T>
+class LargerThanChecker {
+ public:
+  LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
+  }
+
+ private:
+  T lower_bound_;
+};
+
+// we can provide users more common Checker, like 'LessThanChecker',
+// 'BetweenChecker'...
+
+template <typename T>
+class DefaultValueSetter {
+ public:
+  DefaultValueSetter(T default_value) : default_value_(default_value) {}
+  void operator()(T& value) const { value = default_value_; }
+
+ private:
+  T default_value_;
+};
+
+// Checks that an attribute value is one of a fixed set of allowed values.
+template <typename T>
+class EnumInContainer {
+ public:
+  explicit EnumInContainer(const std::unordered_set<T>& c) : container_(c) {}
+
+  // Fails the PADDLE_ENFORCE check when `val` is not in the container.
+  void operator()(T& val) const {
+    PADDLE_ENFORCE(container_.find(val) != container_.end(),
+                   "Value %s is not in enum container %s", val,
+                   ContainerDebugString());
+  }
+
+ private:
+  // Renders the allowed values as "[a, b, c]", in the set's iteration
+  // order, for use in the error message.
+  std::string ContainerDebugString() const {
+    std::ostringstream sout;
+    sout << "[";
+    // Empty before the first element, ", " before every later one.
+    const char* separator = "";
+    for (auto& v : container_) {
+      sout << separator << v;
+      separator = ", ";
+    }
+    sout << "]";
+    return sout.str();
+  }
+
+  std::unordered_set<T> container_;
+};
+
+// check whether a certain attribute fit its limits
+// an attribute can have more than one limits
+template <typename T>
+class TypedAttrChecker {
+  typedef std::function<void(T&)> ValueChecker;
+
+ public:
+  TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
+
+  TypedAttrChecker& InEnum(const std::unordered_set<T>& range) {
+    value_checkers_.push_back(EnumInContainer<T>(range));
+    return *this;
+  }
+
+  TypedAttrChecker& LargerThan(const T& lower_bound) {
+    value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
+    return *this;
+  }
+
+  // we can add more common limits, like LessThan(), Between()...
+
+  TypedAttrChecker& SetDefault(const T& default_value) {
+    PADDLE_ENFORCE(default_value_setter_.empty(),
+                   "%s can't have more than one default value!", attr_name_);
+    default_value_setter_.push_back(DefaultValueSetter<T>(default_value));
+    return *this;
+  }
+
+  // allow users provide their own checker
+  TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) {
+    value_checkers_.push_back(checker);
+    return *this;
+  }
+
+  void operator()(AttributeMap& attr_map) const {
+    if (!attr_map.count(attr_name_)) {
+      // user do not set this attr
+      PADDLE_ENFORCE(!default_value_setter_.empty(),
+                     "Attribute '%s' is required!", attr_name_);
+      // default_value_setter_ has no more than one element
+      T val;
+      (default_value_setter_[0])(val);
+      attr_map[attr_name_] = val;
+    }
+    Attribute& attr = attr_map.at(attr_name_);
+    T& attr_value = boost::get<T>(attr);
+    for (const auto& checker : value_checkers_) {
+      checker(attr_value);
+    }
+  }
+
+ private:
+  std::string attr_name_;
+  std::vector<ValueChecker> value_checkers_;
+  std::vector<ValueChecker> default_value_setter_;
+};
+
+// check whether op's all attributes fit their own limits
+class OpAttrChecker {
+  typedef std::function<void(AttributeMap&)> AttrChecker;
+
+ public:
+  // Appends a TypedAttrChecker<T> for `attr_name` and returns a reference to
+  // the stored checker, so limits can be added to it by the caller.
+  // NOTE(review): the reference points into the std::function held by
+  // attr_checkers_; presumably it should not be kept across later
+  // AddAttrChecker calls, which may relocate the vector's elements -- confirm.
+  template <typename T>
+  TypedAttrChecker<T>& AddAttrChecker(const std::string& attr_name) {
+    attr_checkers_.push_back(TypedAttrChecker<T>(attr_name));
+    AttrChecker& checker = attr_checkers_.back();
+    return *(checker.target<TypedAttrChecker<T>>());
+  }
+
+  // Runs every registered attribute checker against `attr_map`, in the
+  // order the checkers were added.
+  void Check(AttributeMap& attr_map) const {
+    for (const auto& checker : attr_checkers_) {
+      checker(attr_map);
+    }
+  }
+
+ private:
+  std::vector<AttrChecker> attr_checkers_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/attr_type.proto
+++ b/paddle/framework/attr_type.proto
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+syntax="proto2";
+package paddle.framework;
+
+// Attribute Type for paddle's Op.
+// Op contains many attributes. Each type of attributes could be different.
+// The AttrType will be shared between AttrDesc and AttrProto.
+enum AttrType {
+    INT = 0;
+    FLOAT = 1;
+    STRING = 2;
+    INTS = 3;
+    FLOATS = 4;
+    STRINGS = 5;
+}
\ No newline at end of file
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include "paddle/framework/ddim.h"
+#include "paddle/platform/enforce.h"

 namespace paddle {
 namespace framework {

-///@cond HIDDEN
+/// @cond HIDDEN

 template <int i>
 Dim<i> make_dim(const int* d) {
@@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
  }
 }

-///@endcond
+/// @endcond

 DDim make_ddim(std::initializer_list<int> dims) {
  DDim result(make_dim(0));
@@ -64,11 +79,11 @@ DDim make_ddim(const std::vector<int>& dims) {
  return result;
 }

-///@cond HIDDEN
+/// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
 public:
-  DynamicMutableIndexer(int idx) : idx_(idx) {}
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}

  template <int D>
  int& operator()(Dim<D>& dim) const {
@@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {

 class DynamicConstIndexer : public boost::static_visitor<int> {
 public:
-  DynamicConstIndexer(int idx) : idx_(idx) {}
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}

  template <int D>
  int operator()(const Dim<D>& dim) const {
@@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
  int idx_;
 };

-///@endcond
+/// @endcond

 int& DDim::operator[](int idx) {
  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
@@ -102,6 +117,8 @@ int DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }

+// Member-style accessor that forwards to the free function arity().
+ssize_t DDim::size() const { return arity(*this); }
+
 bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
    return false;
@@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; }

 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }

-///@cond HIDDEN
+/// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
  std::vector<int>& vector;

-  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}

  template <typename T>
  void operator()(const T& t) {
@@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {

  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
 };
-///@endcond
+/// @endcond

 std::vector<int> vectorize(const DDim& ddim) {
  std::vector<int> result;
@@ -178,16 +195,59 @@ std::vector<int> vectorize(const DDim& ddim) {
  return result;
 }

+// Visitor used by product(const DDim&): for whichever Dim<D> the DDim
+// currently holds, it forwards to the product() overload for that Dim<D>.
+struct ProductVisitor : public boost::static_visitor<ssize_t> {
+  template <int D>
+  ssize_t operator()(const Dim<D>& dim) {
+    return product(dim);
+  }
+};
+
+// Dispatches on the Dim<D> stored in `ddim` through ProductVisitor and
+// returns what that visitor yields (presumably the product of all extents;
+// the per-Dim product() is defined outside this file).
 ssize_t product(const DDim& ddim) {
-  ssize_t result = 1;
-  std::vector<int> v = vectorize(ddim);
-  for (auto i : v) {
-    result *= i;
+  ProductVisitor visitor;
+  return boost::apply_visitor(visitor, ddim);
+}
+
+// Visitor that appends the extents at positions [begin, end) of the visited
+// Dim to `vector`, walking the Dim from head to tail.
+struct SliceVectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int>& vector;  // destination; extents are appended to it
+  int begin;  // positions still to skip before copying starts
+  int end;    // positions still to visit, counted from the current head
+
+  // Rejects empty or inverted ranges and negative begin indices up front.
+  SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
+      : vector(v), begin(b), end(e) {
+    PADDLE_ENFORCE(begin < end,
+                   "Begin index must be less than end index in ddim slice.");
+    PADDLE_ENFORCE(begin >= 0,
+                   "Begin index can't be less than zero in ddim slice.");
   }
-  return result;
+
+  // Handles the current head (copy it once `begin` has counted down to zero,
+  // otherwise skip it), then recurses into the tail while positions remain.
+  template <int S>
+  void operator()(const Dim<S>& dim) {
+    if (begin == 0) {
+      vector.push_back(dim.head);
+    } else {
+      --begin;
+    }
+    --end;
+    if (end > 0) {
+      this->operator()(dim.tail);
+    }
+  }
+
+  // Last dimension: it may only be reached when exactly one position is
+  // left (end == 1); anything else means `end` exceeded the arity.
+  void operator()(const Dim<1>& dim) {
+    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
+    vector.push_back(dim.head);
+  }
+};
+
+// Returns a DDim made of the extents of `dim` at positions [begin, end).
+// Invalid ranges are rejected by SliceVectorizeVisitor via PADDLE_ENFORCE.
+DDim slice_ddim(const DDim& dim, int begin, int end) {
+  std::vector<int> vec;
+  // Build the visitor before reserving: its constructor validates the range,
+  // so an inverted range is reported through PADDLE_ENFORCE instead of
+  // `end - begin` wrapping into a huge unsigned reserve() request.
+  SliceVectorizeVisitor visitor(vec, begin, end);
+  vec.reserve(end - begin);
+  boost::apply_visitor(visitor, dim);
+  return make_ddim(vec);
 }

-///\cond HIDDEN
+/// \cond HIDDEN

 struct ArityVisitor : boost::static_visitor<int> {
  template <int D>
@@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor<int> {
  }
 };

-///\endcond
+/// \endcond

 int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }

-///\cond HIDDEN
+/// \cond HIDDEN

 struct DDimPrinter : boost::static_visitor<void> {
  std::ostream& os;
-  DDimPrinter(std::ostream& os_) : os(os_) {}
+  explicit DDimPrinter(std::ostream& os_) : os(os_) {}

  template <typename T>
  void operator()(const T& t) {
@@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor<void> {
  }
 };

-///\endcond
+/// \endcond

 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  DDimPrinter printer(os);
@@ -220,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  return os;
 }

+// Builds the DDim from a brace list by delegating to make_ddim() and
+// assigning the result to *this.
+DDim::DDim(std::initializer_list<int> init_list) {
+  *this = make_ddim(init_list);
+}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #pragma once

 #include <boost/variant.hpp>
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
-
 #include "paddle/framework/dim.h"
+#include "paddle/platform/enforce.h"
+#include "unsupported/Eigen/CXX11/Tensor"

 namespace paddle {
 namespace framework {
@@ -27,7 +42,9 @@ struct DDim {
  DDim() : var(Dim<1>()) {}

  template <int D>
-  DDim(const Dim<D>& in) : var(in) {}
+  explicit DDim(const Dim<D>& in) : var(in) {}
+
+  /*implicit*/ DDim(std::initializer_list<int> init_list);

  template <int D>
  DDim& operator=(const Dim<D>& in) {
@@ -57,6 +74,8 @@ struct DDim {
  DDim operator+(DDim d) const;

  DDim operator*(DDim d) const;
+
+  ssize_t size() const;
 };

 /**
@@ -81,6 +100,15 @@ std::vector<int> vectorize(const DDim& ddim);

 ssize_t product(const DDim& ddim);

+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim& dim, int begin, int end);
+
 /**
 * \brief What is the length of this dimension?
 *

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,9 +49,30 @@ TEST(DDim, Equality) {

  // arity of a DDim
  EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);

  // product of a DDim
  EXPECT_EQ(paddle::framework::product(vddim), 45);
+  EXPECT_EQ(
+      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
+      90);
+
+  // slice a DDim
+  paddle::framework::DDim ddim2 =
+      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
+  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
+  EXPECT_EQ(arity(ss), 3);
+  EXPECT_EQ(ss[0], 3);
+  EXPECT_EQ(ss[1], 4);
+  EXPECT_EQ(ss[2], 5);
+  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
+  EXPECT_EQ(arity(ss2), 6);
+  EXPECT_EQ(ss2[0], 1);
+  EXPECT_EQ(ss2[1], 2);
+  EXPECT_EQ(ss2[2], 3);
+  EXPECT_EQ(ss2[3], 4);
+  EXPECT_EQ(ss2[4], 5);
+  EXPECT_EQ(ss2[5], 6);
 }

 TEST(DDim, Print) {

--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@@ -266,29 +266,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
  return ((0 <= idx.head) && (idx.head < size.head));
 }

-/**
- * \brief Check if a size and a stride create a Fortran order contiguous
- * block of memory.
- */
-template <int i>
-HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) {
-  if (product(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return (get<0>(stride) == contiguous_stride &&
-          contiguous(size.tail, stride.tail, mul * get<0>(size)));
-}
-
-///\cond HIDDEN
-// Base case of contiguous, check the nth stride is the size of
-// the prefix multiply of n-1 dims.
-template <>
-inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
-  if (get<0>(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return get<0>(stride) == contiguous_stride;
-}
-///\endcond
-
 /**
 * \brief Compute exclusive prefix-multiply of a Dim.
 */
@@ -306,31 +283,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
 }
 ///\endcond

-/**
- * \brief Calculate strides of a contiguous array of the given size
- *
- * Sets the stride for any dimension with an extent of 1 to 0.
- * \param size Dim object containing the size of the array.
- * \param base The base stride to use.
- * \return Dim object the same size as \p size with the strides.
- */
-template <int i>
-HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<i>(stride, contiguous_strides(size.tail, base * size.head));
-}
-
-///\cond HIDDEN
-
-// Base case of contiguous_strides
-template <>
-HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<1>(stride);
-}
-
-///\endcond
-
 /**
 * Add two dimensions together
 */

--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
 #include <thrust/device_vector.h>
 #include <sstream>

-#include "paddle/framework/dim.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/dim.h"

 __global__ void test(paddle::framework::Dim<2>* o) {
-    o[0] = paddle::framework::make_dim(5, 6);
+  o[0] = paddle::framework::make_dim(5, 6);
 }

 __global__ void dyn_idx_gpu(int* o) {
-    auto d = paddle::framework::make_dim(5, 6);
-    o[0] = d[1];
+  auto d = paddle::framework::make_dim(5, 6);
+  o[0] = d[1];
 }

 TEST(Dim, Equality) {
-    // construct a Dim on the CPU
-    auto a = paddle::framework::make_dim(3, 4);
-    EXPECT_EQ(paddle::framework::get<0>(a), 3);
-    EXPECT_EQ(paddle::framework::get<1>(a), 4);
-
-    // construct a Dim on the GPU
-    thrust::device_vector<paddle::framework::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
-    a = t[0];
-    EXPECT_EQ(paddle::framework::get<0>(a), 5);
-    EXPECT_EQ(paddle::framework::get<1>(a), 6);
-
-    // linearization
-    auto b = paddle::framework::make_dim(7, 8);
-    EXPECT_EQ(paddle::framework::linearize(a, b), 83);
-
-    // product
-    EXPECT_EQ(paddle::framework::product(a), 30);
-
-    // mutate a Dim
-    paddle::framework::get<1>(b) = 10;
-    EXPECT_EQ(paddle::framework::get<0>(b), 7);
-    EXPECT_EQ(paddle::framework::get<1>(b), 10);
-
-    // dynamic access
-    paddle::framework::get(b, 0) = 8;
-    b[1] = 11;
-    EXPECT_EQ(paddle::framework::get<0>(b), 8);
-    EXPECT_EQ(paddle::framework::get<1>(b), 11);
-    EXPECT_EQ(paddle::framework::get(b, 0), 8);
-    EXPECT_EQ(b[1], 11);
-
-    // dynamic access on GPU
-    thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
-    int res = r[0];
-    EXPECT_EQ(res, 6);
-
-    // ex_prefix_mul
-    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 3);
-    EXPECT_EQ(paddle::framework::get<2>(c), 12);
-
-    // contiguous_strides
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 0);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 10);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 0);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 2);
-    EXPECT_EQ(paddle::framework::get<2>(c), 6);
-
-    // generate from an index
-    auto size = paddle::framework::make_dim(4, 5, 2);
-    c = paddle::framework::Dim<3>(14, size);
-    EXPECT_EQ(paddle::framework::get<0>(c), 2);
-    EXPECT_EQ(paddle::framework::get<1>(c), 3);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    c = paddle::framework::Dim<3>(25, size);
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 1);
+  // construct a Dim on the CPU
+  auto a = paddle::framework::make_dim(3, 4);
+  EXPECT_EQ(paddle::framework::get<0>(a), 3);
+  EXPECT_EQ(paddle::framework::get<1>(a), 4);
+
+  // construct a Dim on the GPU
+  thrust::device_vector<paddle::framework::Dim<2>> t(2);
+  test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
+  a = t[0];
+  EXPECT_EQ(paddle::framework::get<0>(a), 5);
+  EXPECT_EQ(paddle::framework::get<1>(a), 6);
+
+  // linearization
+  auto b = paddle::framework::make_dim(7, 8);
+  EXPECT_EQ(paddle::framework::linearize(a, b), 83);
+
+  // product
+  EXPECT_EQ(paddle::framework::product(a), 30);
+
+  // mutate a Dim
+  paddle::framework::get<1>(b) = 10;
+  EXPECT_EQ(paddle::framework::get<0>(b), 7);
+  EXPECT_EQ(paddle::framework::get<1>(b), 10);
+
+  // dynamic access
+  paddle::framework::get(b, 0) = 8;
+  b[1] = 11;
+  EXPECT_EQ(paddle::framework::get<0>(b), 8);
+  EXPECT_EQ(paddle::framework::get<1>(b), 11);
+  EXPECT_EQ(paddle::framework::get(b, 0), 8);
+  EXPECT_EQ(b[1], 11);
+
+  // dynamic access on GPU
+  thrust::device_vector<int> r(1);
+  dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
+  int res = r[0];
+  EXPECT_EQ(res, 6);
+
+  // ex_prefix_mul
+  paddle::framework::Dim<3> c =
+      paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 12);
+
+  // generate from an index
+  auto size = paddle::framework::make_dim(4, 5, 2);
+  c = paddle::framework::Dim<3>(14, size);
+  EXPECT_EQ(paddle::framework::get<0>(c), 2);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 0);
+  c = paddle::framework::Dim<3>(25, size);
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 1);
+  EXPECT_EQ(paddle::framework::get<2>(c), 1);
 }

 TEST(Dim, Bool) {
-    auto a = paddle::framework::make_dim(3, 4);
-    auto b = paddle::framework::make_dim(5, 6);
-    auto c = paddle::framework::make_dim(3, 4);
-
-    // in_bounds check
-    EXPECT_TRUE(paddle::framework::contained(a, b));
-    EXPECT_FALSE(paddle::framework::contained(b, a));
-
-    // comparison
-    EXPECT_TRUE(a == a);
-    EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a == c);
-
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    paddle::framework::Dim<3> sizef(x, y, z);
-    paddle::framework::Dim<3> stridea(1, x, x*y);
-    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
-    paddle::framework::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
+  auto a = paddle::framework::make_dim(3, 4);
+  auto b = paddle::framework::make_dim(5, 6);
+  auto c = paddle::framework::make_dim(3, 4);
+
+  // in_bounds check
+  EXPECT_TRUE(paddle::framework::contained(a, b));
+  EXPECT_FALSE(paddle::framework::contained(b, a));
+
+  // comparison
+  EXPECT_TRUE(a == a);
+  EXPECT_FALSE(a == b);
+  EXPECT_TRUE(a == c);
 }

 TEST(Dim, Print) {
-    {
-        std::stringstream ss;
-        auto a = paddle::framework::make_dim(2, 3);
-        ss << a;
-        EXPECT_EQ(ss.str(), "2, 3");
-    }
-    {
-        std::stringstream ss;
-        ss << paddle::framework::make_dim(8);
-        EXPECT_EQ(ss.str(), "8");
-    }
+  {
+    std::stringstream ss;
+    auto a = paddle::framework::make_dim(2, 3);
+    ss << a;
+    EXPECT_EQ(ss.str(), "2, 3");
+  }
+  {
+    std::stringstream ss;
+    ss << paddle::framework::make_dim(8);
+    EXPECT_EQ(ss.str(), "8");
+  }
 }
--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/tensor.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace framework {
+
+// EigenDim maps a paddle::framework::DDim onto an Eigen::DSizes of rank D.
+template <int D>
+struct EigenDim {
+  using Type = Eigen::DSizes<Eigen::DenseIndex, D>;
+
+  // Copies every extent of `dims` into a DSizes. The rank of `dims` has to
+  // equal D; PADDLE_ENFORCE checks that before anything is copied.
+  static Type From(const DDim& dims) {
+    PADDLE_ENFORCE(arity(dims) == D, "D must match arity(DDim)");
+    Type sizes;
+    int axis = 0;
+    while (axis < arity(dims)) {
+      sizes[axis] = dims[axis];
+      ++axis;
+    }
+    return sizes;
+  }
+};
+
+// EigenTensor exposes a paddle::framework::Tensor as an Eigen::TensorMap,
+// either writable (Type) or read-only (ConstType).
+template <typename T, size_t D, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenTensor {
+  // TODO(qijun): the maps currently use the default (unaligned) option; the
+  // aligned and unaligned versions still need to be benchmarked.
+  using Type = Eigen::TensorMap<Eigen::Tensor<T, D, MajorType, IndexType>>;
+
+  using ConstType =
+      Eigen::TensorMap<Eigen::Tensor<const T, D, MajorType, IndexType>>;
+
+  // Writable view of `tensor` interpreted with the caller-supplied `dims`.
+  static Type From(Tensor& tensor, DDim dims) {
+    Type view(tensor.data<T>(), EigenDim<D>::From(dims));
+    return view;
+  }
+
+  // Writable view of `tensor` using the dimensions stored in the tensor.
+  static Type From(Tensor& tensor) {
+    DDim own_dims = tensor.dims_;
+    return From(tensor, own_dims);
+  }
+
+  // Read-only view of `tensor` interpreted with the caller-supplied `dims`.
+  static ConstType From(const Tensor& tensor, DDim dims) {
+    ConstType view(tensor.data<T>(), EigenDim<D>::From(dims));
+    return view;
+  }
+
+  // Read-only view of `tensor` using the dimensions stored in the tensor.
+  static ConstType From(const Tensor& tensor) {
+    DDim own_dims = tensor.dims_;
+    return From(tensor, own_dims);
+  }
+};
+
+// EigenVector is the rank-1 specialisation of EigenTensor; it inherits the
+// From() overloads and adds Flatten().
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
+  // Flatten views a Tensor of any rank as a one-dimensional EigenVector with
+  // product(dims) elements. The result uses this struct's own Type, so a
+  // non-default MajorType/IndexType is honoured instead of silently falling
+  // back to the EigenTensor<T, 1> defaults.
+  static typename EigenVector::Type Flatten(Tensor& tensor) {
+    return EigenVector::From(
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
+  }
+
+  // Read-only counterpart of Flatten() for const tensors.
+  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
+    return EigenVector::From(
+        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
+  }
+};
+
+// EigenMatrix is an alias for the rank-2 EigenTensor with the same
+// element type, storage order and index type.
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = EigenTensor<T, 2, MajorType, IndexType>;
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/eigen_test.cc
+++ b/paddle/framework/eigen_test.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/eigen.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+
+TEST(EigenDim, From) {
+  // A rank-3 DDim must convert extent by extent into an Eigen::DSizes.
+  const DDim source = make_ddim({1, 2, 3});
+  EigenDim<3>::Type sizes = EigenDim<3>::From(source);
+  ASSERT_EQ(1, sizes[0]);
+  ASSERT_EQ(2, sizes[1]);
+  ASSERT_EQ(3, sizes[2]);
+}
+
+TEST(Eigen, Tensor) {
+  // Fill a 1x2x3 tensor with 0..5 in storage order.
+  Tensor tensor;
+  float* data =
+      tensor.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+  for (int offset = 0; offset < 1 * 2 * 3; ++offset) {
+    data[offset] = static_cast<float>(offset);
+  }
+
+  EigenTensor<float, 3>::Type view = EigenTensor<float, 3>::From(tensor);
+
+  ASSERT_EQ(1, view.dimension(0));
+  ASSERT_EQ(2, view.dimension(1));
+  ASSERT_EQ(3, view.dimension(2));
+
+  // Element (i, j, k) must equal its row-major linear offset.
+  for (int i = 0; i < 1; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      for (int k = 0; k < 3; ++k) {
+        ASSERT_NEAR((i * 2 + j) * 3 + k, view(i, j, k), 1e-6f);
+      }
+    }
+  }
+}
+
+TEST(Eigen, VectorFrom) {
+  // Fill a length-6 tensor with 0..5.
+  Tensor tensor;
+  float* data =
+      tensor.mutable_data<float>(make_ddim({6}), platform::CPUPlace());
+  for (int offset = 0; offset < 6; ++offset) {
+    data[offset] = static_cast<float>(offset);
+  }
+
+  EigenVector<float>::Type view = EigenVector<float>::From(tensor);
+
+  ASSERT_EQ(6, view.dimension(0));
+
+  // Every element must read back as its own index.
+  for (int offset = 0; offset < 6; ++offset) {
+    ASSERT_NEAR(offset, view(offset), 1e-6f);
+  }
+}
+
+TEST(Eigen, VectorFlatten) {
+  // Fill a 1x2x3 tensor with 0..5 in storage order.
+  Tensor tensor;
+  float* data =
+      tensor.mutable_data<float>(make_ddim({1, 2, 3}), platform::CPUPlace());
+  for (int offset = 0; offset < 1 * 2 * 3; ++offset) {
+    data[offset] = static_cast<float>(offset);
+  }
+
+  EigenVector<float>::Type flat = EigenVector<float>::Flatten(tensor);
+
+  // Flattening keeps all six elements in a single dimension.
+  ASSERT_EQ(1 * 2 * 3, flat.dimension(0));
+
+  for (int offset = 0; offset < 1 * 2 * 3; ++offset) {
+    ASSERT_NEAR(offset, flat(offset), 1e-6f);
+  }
+}
+
+TEST(Eigen, Matrix) {
+  // Fill a 2x3 tensor with 0..5 in storage order.
+  Tensor tensor;
+  float* data =
+      tensor.mutable_data<float>(make_ddim({2, 3}), platform::CPUPlace());
+  for (int offset = 0; offset < 2 * 3; ++offset) {
+    data[offset] = static_cast<float>(offset);
+  }
+
+  EigenMatrix<float>::Type view = EigenMatrix<float>::From(tensor);
+
+  ASSERT_EQ(2, view.dimension(0));
+  ASSERT_EQ(3, view.dimension(1));
+
+  // Element (row, col) must equal its row-major linear offset.
+  for (int row = 0; row < 2; ++row) {
+    for (int col = 0; col < 3; ++col) {
+      ASSERT_NEAR(row * 3 + col, view(row, col), 1e-6f);
+    }
+  }
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/enforce.h
+++ b/paddle/framework/enforce.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <paddle/string/printf.h>
-#include <exception>
-#include <sstream>
-
-namespace paddle {
-namespace framework {
-
-/**
- * @brief Enforce exception. Inherits std::exception
- *
- * All enforce condition not met, will throw an EnforceNotMet exception.
- */
-class EnforceNotMet : public std::exception {
- public:
-  EnforceNotMet(const std::string& msg, const char* file, int fileline) {
-    std::ostringstream sout;
-    sout << msg << " at [" << file << ":" << fileline << "];";
-    all_msg_ = sout.str();
-  }
-
-  const char* what() const noexcept override { return all_msg_.c_str(); }
-
- private:
-  std::string all_msg_;
-};
-
-// From https://stackoverflow.com/questions/30130930/
-// __buildin_expect is in C++ 11 standard. Since the condition which enforced
-// should be true in most situation, it will make the compiler generate faster
-// code by adding `UNLIKELY` macro.
-#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-
-/**
- * @brief Throw a EnforceNotMet exception, automatically filled __FILE__ &
- * __LINE__
- *
- * This macro take __VA_ARGS__, user can pass any type if that type can
- * serialize to std::ostream
- */
-#define PADDLE_THROW(...)                                            \
-  do {                                                               \
-    throw ::paddle::framework::EnforceNotMet(                        \
-        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
-  } while (0)
-
-/**
- * @brief Enforce a condition, otherwise throw an EnforceNotMet
- */
-#define PADDLE_ENFORCE(condition, ...) \
-  do {                                 \
-    if (UNLIKELY(!(condition))) {      \
-      PADDLE_THROW(__VA_ARGS__);       \
-    }                                  \
-  } while (0)
-
-}  // namespace framework
-}  // namespace paddle
--- a/paddle/framework/grad_op_creator.cc
+++ b/paddle/framework/grad_op_creator.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/grad_op_creator.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+// Creates the gradient operator for op_: collects op_'s input/output
+// arguments, asks the registry's grad creator for op_'s type to make the
+// operator, then fills it in via CompleteGradOp(). Returns the raw pointer
+// produced by the registry creator.
+OperatorBase* GradOpCreator::Create() {
+  // Start from an empty list so that a second Create() call does not append
+  // a duplicate set of arguments to the ones gathered earlier.
+  arg_list_.clear();
+  BuildOpInOutArgList();
+  // Own the operator until it is fully set up: if CompleteGradOp() throws,
+  // the half-built operator is destroyed instead of leaked.
+  std::unique_ptr<OperatorBase> grad_op(
+      OpRegistry::grad_creators().at(op_->type_)());
+  CompleteGradOp(grad_op.get());
+  return grad_op.release();
+}
+
+// Allocates the OpInOutArg describing proto variable `var`. The variable's
+// position comes from `var_map`; when `format` is non-empty, entries idx and
+// idx + 1 of `format` give the [begin, end) range, otherwise the range is
+// the single slot [idx, idx + 1).
+OpInOutArg* GradOpCreator::BuildArg(const VarProto& var,
+                                    const VarIndexMap& var_map,
+                                    const std::vector<int>& format,
+                                    InOutType type) {
+  const int idx = var_map.at(var.name());
+  int begin_idx = idx;
+  int end_idx = idx + 1;
+  if (!format.empty()) {
+    begin_idx = format.at(idx);
+    end_idx = format.at(idx + 1);
+  }
+  const bool needed_in_grad = !var.ignore_gradient();
+  return new OpInOutArg(var.name(), type, needed_in_grad, begin_idx, end_idx);
+}
+
+// Appends one OpInOutArg to arg_list_ for every input and then every output
+// declared in the OpProto registered for op_'s type. The optional
+// "input_format" / "output_format" attributes of op_ are passed along so
+// BuildArg() can resolve each variable's index range.
+void GradOpCreator::BuildOpInOutArgList() {
+  const OpProto& op_proto = OpRegistry::protos().at(op_->type_);
+  const auto& var_map = *(OpRegistry::VarIndexMaps().at(op_->type_));
+
+  // An absent format attribute is represented by an empty vector.
+  std::vector<int> in_format;
+  if (op_->attrs_.count("input_format") != 0) {
+    in_format = op_->GetAttr<std::vector<int>>("input_format");
+  }
+  std::vector<int> out_format;
+  if (op_->attrs_.count("output_format") != 0) {
+    out_format = op_->GetAttr<std::vector<int>>("output_format");
+  }
+
+  for (const auto& in_var : op_proto.inputs()) {
+    std::shared_ptr<OpInOutArg> arg(BuildArg(in_var, var_map, in_format, IN));
+    arg_list_.push_back(arg);
+  }
+  for (const auto& out_var : op_proto.outputs()) {
+    std::shared_ptr<OpInOutArg> arg(
+        BuildArg(out_var, var_map, out_format, OUT));
+    arg_list_.push_back(arg);
+  }
+}
+
+// Registers one argument of the gradient operator.
+//   arg     - the forward-op argument being mirrored
+//   in_out  - the grad op's input or output name list to extend
+//   format  - receives the new size of `in_out` as the next range boundary
+//   varmap  - maps the argument's (possibly suffixed) proto name to `idx`
+//   idx     - running argument index; incremented by one
+//   is_grad - when true, GRAD_VAR_SUFFIX() is appended to the proto name and
+//             to every variable name copied into `in_out`
+void GradOpCreator::AddArgIntoGradOp(const OpInOutArg* arg,
+                                     std::vector<std::string>& in_out,
+                                     std::vector<int>& format,
+                                     VarIndexMap* varmap, int& idx,
+                                     bool is_grad) const {
+  std::string key = arg->proto_name_;
+  if (is_grad) {
+    key += OperatorBase::GRAD_VAR_SUFFIX();
+  }
+  (*varmap)[key] = idx;
+  ++idx;
+
+  // Copy op_'s variable names in [begin_idx_, end_idx_) from the list that
+  // matches the argument's direction.
+  const auto& source = (arg->type_ == IN) ? op_->inputs_ : op_->outputs_;
+  const auto first = source.begin() + arg->begin_idx_;
+  const auto last = source.begin() + arg->end_idx_;
+  for (auto it = first; it != last; ++it) {
+    std::string name = *it;
+    if (is_grad) {
+      name += OperatorBase::GRAD_VAR_SUFFIX();
+    }
+    in_out.push_back(name);
+  }
+  format.push_back(in_out.size());
+}
+
+// Fills in `grad_op` from op_: sets its type and attributes, then walks
+// arg_list_ to build the gradient operator's inputs, outputs, format
+// attributes and name-to-index map.
+void GradOpCreator::CompleteGradOp(OperatorBase* grad_op) const {
+  grad_op->type_ = op_->type_ + "@GRAD";  // not necessary
+  grad_op->attrs_ = op_->attrs_;
+  // The forward formats do not describe the grad op; they are rebuilt below.
+  grad_op->attrs_.erase("input_format");
+  grad_op->attrs_.erase("output_format");
+  // Owned by unique_ptr until handed over to grad_op, so the map is not
+  // leaked if any of the container operations below throws.
+  std::unique_ptr<VarIndexMap> grad_varmap(new VarIndexMap());
+  int in_idx = 0;
+  int out_idx = 0;
+  // Each format vector starts at offset 0; AddArgIntoGradOp appends the end
+  // offset of every argument it adds.
+  std::vector<int> in_format({0});
+  std::vector<int> out_format({0});
+  for (const auto& arg : arg_list_) {
+    // op_'s inputs_ and outputs_
+    if (arg->needed_in_grad_) {
+      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format,
+                       grad_varmap.get(), in_idx, false);
+    }
+    if (arg->type_ == IN) {
+      // gradients of op_'s inputs_
+      AddArgIntoGradOp(arg.get(), grad_op->outputs_, out_format,
+                       grad_varmap.get(), out_idx, true);
+    } else {
+      // gradients of op_'s outputs_
+      AddArgIntoGradOp(arg.get(), grad_op->inputs_, in_format,
+                       grad_varmap.get(), in_idx, true);
+    }
+  }
+  grad_op->attrs_["input_format"] = in_format;
+  grad_op->attrs_["output_format"] = out_format;
+  grad_op->in_out_idxs_.reset(grad_varmap.release());
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/grad_op_creator.h
+++ b/paddle/framework/grad_op_creator.h
+#pragma once
+
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+class OpRegistry;
+
+enum InOutType { IN, OUT };
+
+// Describes one input or output argument of a forward operator, as gathered
+// by GradOpCreator::BuildArg().
+struct OpInOutArg {
+  OpInOutArg(const std::string& proto_name, const InOutType& type,
+             bool needed_in_grad, size_t begin_idx, size_t end_idx)
+      : proto_name_(proto_name),
+        type_(type),
+        needed_in_grad_(needed_in_grad),
+        begin_idx_(begin_idx),
+        end_idx_(end_idx) {}
+
+  std::string proto_name_;  // variable name taken from the op's proto
+  InOutType type_;          // IN or OUT
+  bool needed_in_grad_;     // BuildArg sets this to !ignore_gradient()
+  // [begin_idx_, end_idx_) is used as an offset range into the forward
+  // operator's inputs_ or outputs_ list.
+  size_t begin_idx_;
+  size_t end_idx_;
+};
+
+// Builds the gradient operator that corresponds to a forward operator.
+class GradOpCreator {
+  using VarIndexMap = std::unordered_map<std::string, int>;
+
+ public:
+  // `op` is the forward operator. Only the raw pointer is stored and it is
+  // dereferenced by Create(), so `op` must still be alive at that point.
+  // Marked explicit, like the other single-argument constructors in the
+  // framework, to rule out accidental conversions from OperatorBase*.
+  explicit GradOpCreator(const OperatorBase* op) : op_(op) {}
+
+  // Creates and returns the gradient operator for `op`. The result is a raw
+  // pointer; presumably the caller takes ownership -- confirm at call sites.
+  OperatorBase* Create();
+
+ private:
+  // Allocates the description of one proto variable of the forward op.
+  OpInOutArg* BuildArg(const VarProto& var, const VarIndexMap& var_map,
+                       const std::vector<int>& format, InOutType type);
+  // Fills arg_list_ from the forward op's proto inputs and outputs.
+  void BuildOpInOutArgList();
+  // Adds one argument (optionally as a gradient variable) to the grad op's
+  // name list, format vector and name-to-index map.
+  void AddArgIntoGradOp(const OpInOutArg* arg, std::vector<std::string>& in_out,
+                        std::vector<int>& format, VarIndexMap* varmap, int& idx,
+                        bool is_grad) const;
+  // Populates type, attributes, inputs and outputs of `grad_op`.
+  void CompleteGradOp(OperatorBase* grad_op) const;
+
+  const OperatorBase* op_;  // forward operator; not owned
+  std::vector<std::shared_ptr<OpInOutArg>> arg_list_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/grad_op_creator_test.cc
+++ b/paddle/framework/grad_op_creator_test.cc
+#include "paddle/framework/grad_op_creator.h"
+#include <gtest/gtest.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+USE_OP(add_two);
+
+namespace paddle {
+namespace framework {
+
+TEST(GradOpCreator, AddTwo) {
+  // Forward op: add_two with inputs {x, y} and output {out}.
+  OperatorPtr forward(
+      OpRegistry::CreateOp("add_two", {"x", "y"}, {"out"}, {}));
+  OperatorPtr backward = OpRegistry::CreateGradOp(forward);
+
+  // The grad op takes X, Y, Out and Out@GRAD and produces X@GRAD and Y@GRAD.
+  const int input_count = static_cast<int>(backward->inputs_.size());
+  const int output_count = static_cast<int>(backward->outputs_.size());
+  EXPECT_EQ(input_count, 4);
+  EXPECT_EQ(output_count, 2);
+
+  EXPECT_EQ(backward->Input("X"), "x");
+  EXPECT_EQ(backward->Input("Y"), "y");
+  EXPECT_EQ(backward->Input("Out"), "out");
+  EXPECT_EQ(backward->Input("Out@GRAD"), "out@GRAD");
+  EXPECT_EQ(backward->Output("X@GRAD"), "x@GRAD");
+  EXPECT_EQ(backward->Output("Y@GRAD"), "y@GRAD");
+}
+
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+// Builds a new PlainNet holding one gradient operator (created through
+// OpRegistry::CreateGradOp) for every operator in `ForwardOps`, then calls
+// CompleteAddOp() on it and returns it.
+//
+// The gradient operators are added in the REVERSE of the forward order. A
+// gradient operator consumes the gradient of its forward output and produces
+// the gradients of its forward inputs, so the gradient of a later forward op
+// has to come before the gradient of an earlier one. CompleteAddOp() is also
+// order-sensitive: a name only counts as internal when an earlier op in the
+// net has already produced it.
+// NOTE(review): this assumes the net runs its ops in insertion order --
+// confirm against PlainNet's run logic.
+//
+// `ForwardOps` must not be null; it is dereferenced unconditionally.
+std::shared_ptr<PlainNet> AddBackwardOp(std::shared_ptr<PlainNet> ForwardOps) {
+  auto grad_ops = std::make_shared<PlainNet>();
+  auto& forward_ops = ForwardOps->ops_;
+  for (auto it = forward_ops.rbegin(); it != forward_ops.rend(); ++it) {
+    auto op_grad = OpRegistry::CreateGradOp(*it);
+    grad_ops->AddOp(op_grad);
+  }
+  grad_ops->CompleteAddOp();
+  return grad_ops;
+}
+
+// Sets add_op_done_ and, when `calc` is true, derives the net-level inputs_,
+// outputs_ and the "temporary_index" attribute from the contained ops.
+//
+// Ops are scanned in order. An op input that no earlier op has produced is
+// recorded as a net input; an op input that an earlier op did produce is
+// recorded as temporary. Every op output becomes a net output, and
+// "temporary_index" lists the positions (counted from 0 within this call's
+// traversal) of the outputs that are temporary. Names are collected in
+// unordered sets, so the resulting order follows the sets' iteration order.
+void PlainNet::CompleteAddOp(bool calc) {
+  add_op_done_ = true;
+  if (!calc) {
+    return;
+  }
+
+  std::unordered_set<std::string> external_inputs;
+  std::unordered_set<std::string> produced;
+  std::unordered_set<std::string> consumed_internally;
+  for (auto& op : ops_) {
+    // Inputs are classified before this op's own outputs are registered.
+    for (auto& name : op->inputs_) {
+      if (Contains(produced, name)) {
+        consumed_internally.insert(name);
+      } else {
+        external_inputs.insert(name);
+      }
+    }
+
+    for (auto& name : op->outputs_) {
+      produced.insert(name);
+    }
+  }
+
+  inputs_.reserve(external_inputs.size());
+  for (auto& name : external_inputs) {
+    inputs_.push_back(name);
+  }
+
+  outputs_.reserve(produced.size());
+  std::vector<int> temp_positions;
+  temp_positions.reserve(consumed_internally.size());
+  int position = 0;
+  for (auto& name : produced) {
+    outputs_.push_back(name);
+    if (Contains(consumed_internally, name)) {
+      temp_positions.push_back(position);
+    }
+    ++position;
+  }
+
+  attrs_["temporary_index"] = temp_positions;
+}
+
+// Renders the net as text: type_ followed by ":" on the first line, then one
+// tab-indented line per contained op holding that op's DebugString().
+std::string PlainNet::DebugString() const {
+  std::ostringstream text;
+  text << type_ << ":" << '\n';
+  for (auto& op : ops_) {
+    text << "\t";
+    text << op->DebugString();
+    text << '\n';
+  }
+  return text.str();
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
--- a/paddle/framework/net_design.md
+++ b/paddle/framework/net_design.md
--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
--- a/paddle/framework/net_proto.proto
+++ b/paddle/framework/net_proto.proto
+syntax="proto2";
+package paddle.framework;
+
+import "op_proto.proto";
+
+// Description of a network: a name, a list of operators, the kind of net
+// used to run them, and a worker count.
+message NetDesc {
+  // Name identifying the network.
+  optional string name = 1;
+  // Operators contained in the network.
+  repeated OpProto operators = 2;
+  // Type of network to run with, e.g. "plainNet", "DAG".
+  optional string net_type = 3;
+  // Number of workers.
+  // NOTE(review): the original comment read "num worker always"; the intended
+  // semantics are unclear -- confirm.
+  optional int32 num_workers = 4;
+}
--- a/paddle/framework/op_desc.proto
+++ b/paddle/framework/op_desc.proto
--- a/paddle/framework/op_desc_test.cc
+++ b/paddle/framework/op_desc_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/framework/op_desc.pb.h>
+
+// Builds an OpDesc with two inputs, one output and a float attribute, and
+// checks that IsInitialized() turns true only after the attribute is named.
+TEST(OpDesc, Create) {
+  namespace fw = paddle::framework;
+
+  fw::OpDesc desc;
+  desc.set_type("add");
+  desc.add_inputs("X");
+  desc.add_inputs("Y");
+  desc.add_outputs("Z");
+
+  // Append one attribute entry and fill in its type and value, but not yet
+  // its name.
+  auto* float_attr = desc.add_attrs();
+  float_attr->set_type(fw::AttrType::FLOAT);
+  float_attr->set_f(3.14);
+
+  // The attribute's required `name` is still missing, so the message must
+  // report itself as uninitialized.
+  ASSERT_FALSE(desc.IsInitialized());
+
+  // With the name filled in, every required field is present.
+  float_attr->set_name("add");
+  ASSERT_TRUE(desc.IsInitialized());
+}
\ No newline at end of file
--- a/paddle/framework/op_proto.proto
+++ b/paddle/framework/op_proto.proto
--- a/paddle/framework/op_proto_test.cc
+++ b/paddle/framework/op_proto_test.cc
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
--- a/paddle/function/CropOp.cpp
+++ b/paddle/function/CropOp.cpp
--- a/paddle/function/CropOp.h
+++ b/paddle/function/CropOp.h
--- a/paddle/function/CropOpGpu.cu
+++ b/paddle/function/CropOpGpu.cu
--- a/paddle/function/CropOpTest.cpp
+++ b/paddle/function/CropOpTest.cpp
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
--- a/paddle/gserver/layers/CropLayer.cpp
+++ b/paddle/gserver/layers/CropLayer.cpp
--- a/paddle/gserver/layers/CropLayer.h
+++ b/paddle/gserver/layers/CropLayer.h
--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
--- a/paddle/gserver/layers/DetectionOutputLayer.cpp
+++ b/paddle/gserver/layers/DetectionOutputLayer.cpp
--- a/paddle/gserver/layers/DetectionOutputLayer.h
+++ b/paddle/gserver/layers/DetectionOutputLayer.h
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
--- a/paddle/gserver/layers/MultiBoxLossLayer.cpp
+++ b/paddle/gserver/layers/MultiBoxLossLayer.cpp
--- a/paddle/gserver/layers/MultiBoxLossLayer.h
+++ b/paddle/gserver/layers/MultiBoxLossLayer.h
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
--- a/paddle/gserver/tests/test_DetectionOutput.cpp
+++ b/paddle/gserver/tests/test_DetectionOutput.cpp
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/memory/detail/meta_data.cc
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/memory/detail/meta_data.h
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
--- a/paddle/operators/sgd_op_test.cc
+++ b/paddle/operators/sgd_op_test.cc
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/optimizer/CMakeLists.txt
+++ b/paddle/optimizer/CMakeLists.txt
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
--- a/paddle/platform/cpu_info_test.cc
+++ b/paddle/platform/cpu_info_test.cc
--- a/paddle/platform/cuda_test.cu
+++ b/paddle/platform/cuda_test.cu
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
--- a/paddle/platform/dynload/cublas.cc
+++ b/paddle/platform/dynload/cublas.cc
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
--- a/paddle/platform/dynload/curand.cc
+++ b/paddle/platform/dynload/curand.cc
--- a/paddle/platform/dynload/curand.h
+++ b/paddle/platform/dynload/curand.h
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
--- a/paddle/platform/dynload/dynamic_loader.h
+++ b/paddle/platform/dynload/dynamic_loader.h
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
--- a/paddle/framework/enforce_test.cc
+++ b/paddle/framework/enforce_test.cc
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
--- a/paddle/scripts/travis/build_android.sh
+++ b/paddle/scripts/travis/build_android.sh
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
--- a/paddle/utils/BarrierStat.h
+++ b/paddle/utils/BarrierStat.h
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
--- a/paddle/utils/Stat.h
+++ b/paddle/utils/Stat.h
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_detection_output_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_multibox_loss_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_crop.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_detection_output_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_multibox_loss_layer.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
--- a/python/paddle/v2/dataset/tests/voc2012_test.py
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
--- a/python/paddle/v2/dataset/voc2012.py
+++ b/python/paddle/v2/dataset/voc2012.py
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
--- a/python/paddle/v2/framework/__init__.py
+++ b/python/paddle/v2/framework/__init__.py
--- a/python/paddle/v2/framework/create_op_creation_methods.py
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
--- a/python/paddle/v2/framework/default_scope_funcs.py
+++ b/python/paddle/v2/framework/default_scope_funcs.py
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
--- a/python/paddle/v2/framework/tests/test_add_two_op.py
+++ b/python/paddle/v2/framework/tests/test_add_two_op.py
--- a/python/paddle/v2/framework/tests/test_default_scope_funcs.py
+++ b/python/paddle/v2/framework/tests/test_default_scope_funcs.py
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
--- a/python/paddle/v2/framework/tests/test_sgd_op.py
+++ b/python/paddle/v2/framework/tests/test_sgd_op.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
--- a/python/setup.py.in
+++ b/python/setup.py.in