diff --git a/.travis.yml b/.travis.yml index f9b4a7e08315a42a61a58d6c61c45771df962c4d..a53bd1809416d6f14a1ec7f603622d3303d1ab28 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,23 +1,21 @@ language: cpp cache: directories: - - $HOME/third_party - $HOME/.ccache - $HOME/.cache/pip + - $TRAVIS_BUILD_DIR/build/third_party sudo: required dist: trusty os: - linux env: - - JOB=DOCS - - JOB=BUILD_AND_TEST - - JOB=PRE_COMMIT + - JOB=build_doc + - JOB=check_style addons: apt: packages: - gcc-4.8 - g++-4.8 - - gfortran-4.8 - git - build-essential - python @@ -34,18 +32,7 @@ addons: - libtool - ccache before_install: - - | - if [ ${JOB} == "BUILD_AND_TEST" ]; then - local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE` - if [ $? -eq 0 ]; then # if git diff return no zero, then rerun unit test. - if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)' - then - echo "Only markdown docs were updated, stopping build process." - exit - fi - fi - fi - - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi + - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker @@ -54,8 +41,8 @@ before_install: - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: - - | - timeout 2580 paddle/scripts/travis/main.sh # 43min timeout + - | + timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; notifications: email: diff --git a/CMakeLists.txt b/CMakeLists.txt index c5d7f2c7ec76dcc7befcd16798d26a7d54a19328..24a7066adc57c510030b0926c81849daa4caa6ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -27,6 +27,7 @@ if(NOT CMAKE_CROSSCOMPILING) endif(NOT CMAKE_CROSSCOMPILING) find_package(Git REQUIRED) find_package(Threads REQUIRED) +find_package(Boost QUIET) include(simd) @@ -71,7 +72,7 @@ if(ANDROID) "Disable RDMA when cross-compiling for Android" FORCE) endif(ANDROID) -set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING +set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") if (WITH_C_API AND WITH_PYTHON) @@ -92,6 +93,7 @@ include(external/openblas) # download, build, install openblas include(external/swig) # download, build, install swig include(external/warpctc) # download, build, install warpctc include(external/any) # download libn::any +include(external/eigen) # download eigen3 include(generic) # simplify cmake module include(package) # set paddle packages @@ -109,6 +111,7 @@ include_directories("${PROJ_ROOT}") include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/cclient") +include_directories(${Boost_INCLUDE_DIRS}) set(EXTERNAL_LIBS ${GFLAGS_LIBRARIES} diff --git a/Dockerfile b/Dockerfile index 39af60966b6cab7d8b9e644f4ea658613f8ba518..bf227737c5a67b006ccc221235daf6d8ad7b3bd8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y \ git python-pip python-dev openssh-server bison \ - wget unzip tar xz-utils bzip2 gzip coreutils \ + 
wget unzip tar xz-utils bzip2 gzip coreutils ntp \ curl sed grep graphviz libjpeg-dev zlib1g-dev \ python-numpy python-matplotlib gcc g++ \ automake locales clang-format-3.8 swig doxygen cmake \ diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake new file mode 100644 index 0000000000000000000000000000000000000000..45f44f617dcb46062355df4e35d537086215a46d --- /dev/null +++ b/cmake/external/eigen.cmake @@ -0,0 +1,29 @@ +INCLUDE(ExternalProject) + +SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3) + +INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/eigen3) + +ExternalProject_Add( + eigen3 + ${EXTERNAL_PROJECT_LOG_ARGS} + # for latest version, please get from official website + # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" + # URL_MD5 "1a47e78efe365a97de0c022d127607c3" + + # for no-ssl http support, please get from bazel's mirror + # URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz" + # URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7" + + # get from github mirror + GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" + GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048" + PREFIX ${EIGEN_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +LIST(APPEND external_project_dependencies eigen3) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 2341e3785bd8e951e10e3f6bbf8a32f63e4ae44d..5b9d9844ed21ceb507a8e01676c3533f4e3dd8fb 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -21,7 +21,8 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE) - SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}" + SET(CBLAS_LIBRARIES + "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_SHARED=1 NO_LAPACK=1 libs) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 7340394b1e1fad9e1893ac87d62febb8dd72751c..2f267adc203f3da80615318f168de9798c537080 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -13,12 +13,53 @@ # limitations under the License. INCLUDE(ExternalProject) +# Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp +FIND_PACKAGE(Protobuf QUIET) +SET(PROTOBUF_FOUND "OFF") + +# Print and set the protobuf library information, +# finish this cmake process and exit from this file. macro(PROMPT_PROTOBUF_LIB) + SET(protobuf_DEPS ${ARGN}) + MESSAGE(STATUS "Protobuf protoc executable: ${PROTOBUF_PROTOC_EXECUTABLE}") MESSAGE(STATUS "Protobuf library: ${PROTOBUF_LIBRARY}") MESSAGE(STATUS "Protobuf version: ${PROTOBUF_VERSION}") INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) + + # Assuming that all the protobuf libraries are of the same type. 
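+    # On Linux, for example, this maps libprotobuf.a to STATIC and
+    # libprotobuf.so to SHARED, based on the file extension of PROTOBUF_LIBRARY.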
+ IF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_STATIC_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE STATIC) + ELSEIF(${PROTOBUF_LIBRARY} MATCHES "${CMAKE_SHARED_LIBRARY_SUFFIX}$") + SET(protobuf_LIBTYPE SHARED) + ELSE() + MESSAGE(FATAL_ERROR "Unknown library type: ${PROTOBUF_LIBRARY}") + ENDIF() + + ADD_LIBRARY(protobuf ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf PROPERTY IMPORTED_LOCATION ${PROTOBUF_LIBRARY}) + + ADD_LIBRARY(protobuf_lite ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET protobuf_lite PROPERTY IMPORTED_LOCATION ${PROTOBUF_LITE_LIBRARY}) + + ADD_LIBRARY(libprotoc ${protobuf_LIBTYPE} IMPORTED GLOBAL) + SET_PROPERTY(TARGET libprotoc PROPERTY IMPORTED_LOCATION ${PROTOC_LIBRARY}) + + ADD_EXECUTABLE(protoc IMPORTED GLOBAL) + SET_PROPERTY(TARGET protoc PROPERTY IMPORTED_LOCATION ${PROTOBUF_PROTOC_EXECUTABLE}) + # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. + # make `protobuf_generate_cpp` happy. + SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) + + FOREACH(dep ${protobuf_DEPS}) + ADD_DEPENDENCIES(protobuf ${dep}) + ADD_DEPENDENCIES(protobuf_lite ${dep}) + ADD_DEPENDENCIES(libprotoc ${dep}) + ADD_DEPENDENCIES(protoc ${dep}) + ENDFOREACH() + + LIST(APPEND external_project_dependencies protobuf) RETURN() endmacro() macro(SET_PROTOBUF_VERSION) @@ -43,22 +84,23 @@ if (NOT "${PROTOBUF_ROOT}" STREQUAL "") endif() FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) - SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_NAME}) - SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_NAME}) + STRING(REPLACE "extern_" "" TARGET_DIR_NAME "${TARGET_NAME}") + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/${TARGET_DIR_NAME}) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/${TARGET_DIR_NAME}) SET(${TARGET_NAME}_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" PARENT_SCOPE) SET(${TARGET_NAME}_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${STATIC_LIBRARY_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc${CMAKE_STATIC_LIBRARY_SUFFIX}" PARENT_SCOPE) SET(${TARGET_NAME}_PROTOC_EXECUTABLE - "${PROTOBUF_INSTALL_DIR}/bin/protoc${EXECUTABLE_SUFFIX}" + "${PROTOBUF_INSTALL_DIR}/bin/protoc${CMAKE_EXECUTABLE_SUFFIX}" PARENT_SCOPE) SET(OPTIONAL_CACHE_ARGS "") @@ -109,6 +151,8 @@ IF(NOT CMAKE_CROSSCOMPILING) SET_PROTOBUF_VERSION() IF("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0") SET(PROTOBUF_FOUND OFF) + ELSE() + PROMPT_PROTOBUF_LIB() ENDIF() ENDIF(PROTOBUF_FOUND) ELSE() @@ -120,18 +164,22 @@ ELSE() ENDIF() IF(NOT PROTOBUF_FOUND) - build_protobuf(protobuf FALSE) - LIST(APPEND external_project_dependencies protobuf) + build_protobuf(extern_protobuf FALSE) - SET(PROTOBUF_INCLUDE_DIR ${protobuf_INCLUDE_DIR} + SET(PROTOBUF_INCLUDE_DIR ${extern_protobuf_INCLUDE_DIR} CACHE PATH "protobuf include directory." FORCE) - IF(NOT CMAKE_CROSSCOMPILING) - SET(PROTOBUF_PROTOC_EXECUTABLE ${protobuf_PROTOC_EXECUTABLE} + SET(PROTOBUF_LITE_LIBRARY ${extern_protobuf_LITE_LIBRARY} + CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY ${extern_protobuf_LIBRARY} + CACHE FILEPATH "protobuf library." 
FORCE) + SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY} + CACHE FILEPATH "protoc library." FORCE) + + IF(CMAKE_CROSSCOMPILING) + PROMPT_PROTOBUF_LIB(protobuf_host extern_protobuf) + ELSE() + SET(PROTOBUF_PROTOC_EXECUTABLE ${extern_protobuf_PROTOC_EXECUTABLE} CACHE FILEPATH "protobuf executable." FORCE) + PROMPT_PROTOBUF_LIB(extern_protobuf) ENDIF() - SET(PROTOBUF_LITE_LIBRARY ${protobuf_LITE_LIBRARY} CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY ${protobuf_LIBRARY} CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY ${protobuf_PROTOC_LIBRARY} CACHE FILEPATH "protoc library." FORCE) ENDIF(NOT PROTOBUF_FOUND) - -PROMPT_PROTOBUF_LIB() \ No newline at end of file diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 69e8164a00d1fb57b79c63ba88c2846d30d80cd2..61353a4a2622257eddb05578c5085c44c1719b98 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -77,6 +77,18 @@ # /cmake/external/*.cmake: # # cc_test(example_test SRCS example_test.cc DEPS example glog gflags) +# +# To build a go static library using Golang, use the go_ prefixed version: +# +# go_library(example STATIC) +# +# To build a go shared library using Golang, use the go_ prefixed version: +# +# go_library(example SHARED) +# + +# including binary directory for generated headers. +include_directories(${CMAKE_BINARY_DIR}) if(NOT APPLE) find_package(Threads REQUIRED) @@ -246,42 +258,53 @@ endfunction(nv_test) set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") file(MAKE_DIRECTORY ${GOPATH}) +set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle") -# Because api.go defines a GO wrapper to ops and tensor, it depends on -# both. This implies that if any of tensor.{h,cc}, ops.{h,cu}, or -# api.go is changed, api need to be re-built. -# go_library(api -# SRCS -# api.go -# DEPS -# tensor # Because ops depend on tensor, this line is optional. -# ops) function(go_library TARGET_NAME) - set(options OPTIONAL) + set(options STATIC static SHARED shared) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs DEPS) cmake_parse_arguments(go_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - if (${go_library_OPTIONAL} STREQUAL "SHARED") + + if (go_library_SHARED OR go_library_shared) set(BUILD_MODE "-buildmode=c-shared") - if(APPLE) - set(LIB_NAME "lib${TARGET_NAME}.dylib") - else() - set(LIB_NAME "lib${TARGET_NAME}.so") - endif() + set(LIB_NAME "${CMAKE_SHARED_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}") else() set(BUILD_MODE "-buildmode=c-archive") - set(LIB_NAME "lib${TARGET_NAME}.a") + set(LIB_NAME "${CMAKE_STATIC_LIBRARY_PREFIX}${TARGET_NAME}${CMAKE_STATIC_LIBRARY_SUFFIX}") endif() - add_custom_command(OUTPUT ${TARGET_NAME}_timestamp + + # Add dummy code to support `make target_name` under Terminal Command + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c) + file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";") + if (go_library_SHARED OR go_library_shared) + add_library(${TARGET_NAME} SHARED ${dummyfile}) + else() + add_library(${TARGET_NAME} STATIC ${dummyfile}) + endif() + if(go_library_DEPS) + add_dependencies(${TARGET_NAME} ${go_library_DEPS}) + endif(go_library_DEPS) + + # we need to symlink Paddle directory into GOPATH. If we + # don't do it and we have code that depends on Paddle, go + # get ./... will download a new Paddle repo from Github, + # without the changes in our current Paddle repo that we + # want to build. 
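+  # NOTE: file(GLOB) below collects the *.go files at configure time, so CMake
+  # must be re-run after Go source files are added or removed.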
+ file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") + add_custom_command(TARGET ${TARGET_NAME} POST_BUILD + COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" + # Symlink Paddle directory into GOPATH + COMMAND mkdir -p ${PADDLE_IN_GOPATH} + COMMAND rm -rf ${PADDLE_IN_GOPATH} + COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH} + # Automatically get all dependencies specified in the source code + COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ./... + # Golang build source code COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" - ${go_library_SRCS} + ${GO_SOURCE} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - add_custom_target(${TARGET_NAME}_lib ALL DEPENDS ${TARGET_NAME}_timestamp ${go_library_DEPS}) - add_library(${TARGET_NAME} STATIC IMPORTED) - set_property(TARGET ${TARGET_NAME} PROPERTY - IMPORTED_LOCATION "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}") - add_dependencies(${TARGET_NAME} ${TARGET_NAME}_lib) endfunction(go_library) function(go_binary TARGET_NAME) @@ -312,9 +335,12 @@ function(go_test TARGET_NAME) add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}) endfunction(go_test) -# go_extern will download extern go project. -# go_extern(target_name extern_source) -# go_extern(go_redis github.com/hoisie/redis) -function(go_extern TARGET_NAME) - add_custom_target(${TARGET_NAME} env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get ${ARGN}) -endfunction(go_extern) +function(proto_library TARGET_NAME) + set(oneValueArgs "") + set(multiValueArgs SRCS) + cmake_parse_arguments(proto_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(proto_srcs) + set(proto_hdrs) + protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS}) + cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS protobuf) +endfunction() diff --git a/cmake/system.cmake b/cmake/system.cmake index 904652413e026e3a7f3f2a19f48f4e906ce6babb..adf5e2c539740076ad1808353522c7467d765e64 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -33,6 +33,7 @@ ELSE(WIN32) SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. 
Set to empty string for default value.")
     ENDIF()
+    set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
 ELSE(APPLE)
 
     IF(EXISTS "/etc/issue")
@@ -84,24 +85,6 @@ IF(DEFINED CMAKE_SYSTEM_NAME)
     ENDIF()
 ENDIF()
 
-# prefix and suffix on different os
-IF(WIN32)
-    SET(LIBRARY_PREFIX "")
-    SET(SHARED_LIBRARY_SUFFIX ".dll")
-    SET(STATIC_LIBRARY_SUFFIX ".lib")
-    SET(EXECUTABLE_SUFFIX ".exe")
-ELSE(WIN32)
-    SET(LIBRARY_PREFIX "lib")
-    IF(APPLE)
-        SET(SHARED_LIBRARY_SUFFIX ".dylib")
-    ELSE(APPLE)
-        SET(SHARED_LIBRARY_SUFFIX ".so")
-    ENDIF(APPLE)
-
-    SET(STATIC_LIBRARY_SUFFIX ".a")
-    SET(EXECUTABLE_SUFFIX "")
-ENDIF(WIN32)
-
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index 6fa42fd0c71e78cc2fa6b0fe2cb970baf4ac89ed..94dd3457fb5b513441c4c8e339e1862de9092517 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -27,10 +27,6 @@ sphinx_add_target(paddle_docs
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_EN})
 
-add_dependencies(paddle_docs
-  gen_proto_py)
-
-
 # configured documentation tools and intermediate build results
 set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
 
@@ -51,6 +47,3 @@ sphinx_add_target(paddle_docs_cn
   ${SPHINX_CACHE_DIR_CN}
   ${CMAKE_CURRENT_SOURCE_DIR}
   ${SPHINX_HTML_DIR_CN})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst
index 39db51fa4abc370855ca3f2778b47464f33b6fce..9ac972fb193a2fb525edc507f7ba1303d2c8eabe 100644
--- a/doc/api/v2/config/evaluators.rst
+++ b/doc/api/v2/config/evaluators.rst
@@ -99,3 +99,12 @@ value_printer
 .. automodule:: paddle.v2.evaluator
     :members: value_printer
     :noindex:
+
+Detection
+=========
+
+detection_map
+-------------
+.. automodule:: paddle.v2.evaluator
+    :members: detection_map
+    :noindex:
diff --git a/doc/design/scope.md b/doc/design/scope.md
new file mode 100644
index 0000000000000000000000000000000000000000..afe6bc028cafc5ee24b0041905857af58d3f5790
--- /dev/null
+++ b/doc/design/scope.md
@@ -0,0 +1,124 @@
+# Design of Scope in Paddle
+
+## Overview
+
+Scope is an important concept in programming languages: it defines a program region in which a set of bindings between names and entities applies. Within a specific scope, a valid name is uniquely associated with an entity, such as a variable; in another scope, the same name may refer to a different entity or to nothing at all. Scope thus restricts the visibility and validity of names in a program. Hence **Scope** is introduced to PaddlePaddle to manage variables in context. But unlike the original abstract concept, Scope is an object with two important attributes:
+
+- Scope is an association of a name to a variable.
+- Variables in a parent scope can be retrieved from its local scopes.
+
+A detailed explanation of these two attributes follows.
+
+
+## Scope is an association of a name to a variable
+
+Scope is an association of a name to a variable. All variables belong to a `Scope`. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. One net can run in different scopes and update different variables in each scope.
+
+1. Scope only contains a map from names to variables.
+
+   All parameters, data, and states in a Net should be variables, stored inside a scope. Each op should get its inputs and outputs from a scope, such as data buffers, state (momentum), etc.
+
+1. Variable can only be created by Scope, and a variable can only be obtained from Scope. Users cannot create or get a variable outside a scope. This is a constraint of our framework that keeps the framework simple and clear.
+
+1. Scope only contains methods that are used to Create and Get Variables. Scope does not contain Operators and has no information to run them.
+
+   `Net` is designed to drive the computation, and Scope only contains a map of variables. There is no computation logic inside a `Scope`. Scope just handles the lifetime management of variables.
+   - `Create` is used to create a Variable by its name and add the mapping relation.
+   - `Get` is used to find a Variable by its name.
+
+1. Every variable belongs to exactly one Scope.
+
+   A variable cannot belong to many scopes. If you want to use variables from a parent scope, use the `parent scope`.
+
+1. Scope should destruct all Variables inside it when it is itself destructed. Users should never store a `Variable` pointer anywhere else.
+
+   Because a Variable can only be obtained from a Scope, destroying a Scope must also destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or in some global variable, that pointer becomes invalid when the associated `Scope` is destroyed.
+
+```cpp
+class Scope {
+ public:
+  Variable* CreateVariable(const std::string& name);
+  const Variable* GetVariable(const std::string& name) const;
+
+ private:
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+
+## Parent scope and local scope
+
+Just like [scope](https://en.wikipedia.org/wiki/Scope_(computer_science)) in programming languages, a `Scope` in the neural network can also be a local scope. There are two attributes of a local scope:
+
+1. We can create local variables in a local scope. When that local scope is destroyed, all local variables are destroyed as well.
+2. Variables in a parent scope can be retrieved from the local scopes of that parent scope, i.e., when a user gets a variable from a scope, the search starts in the current scope. If there is no such variable in the local scope, the `scope` keeps searching its parent, until the variable is found or there is no parent.
+
+```cpp
+class Scope {
+ public:
+  Scope(const std::shared_ptr<Scope>& scope): parent_(scope) {}
+
+  Variable* GetVariable(const std::string& name) const {
+    auto it = vars_.find(name);
+    if (it != vars_.end()) {
+      return it->second.get();
+    } else if (parent_ != nullptr) {
+      return parent_->GetVariable(name);
+    } else {
+      return nullptr;
+    }
+  }
+
+ private:
+  std::shared_ptr<Scope> parent_ {nullptr};
+};
+```
+
+In the `Scope` class, there is a private data member called `parent_`. `parent_` is a smart pointer to its parent scope. When a user calls `Get` with a variable's `name`, the `name` is first searched inside the current scope. If the variable cannot be found locally and the parent scope is not a `nullptr`, the variable is searched inside the parent scope. The default value of `parent_` is `nullptr`; a scope whose `parent_` is `nullptr` is the global scope.
+
+A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`. The `Net` of each timestep (`StepNet` for short) should use an independent local scope, just as variables in a while loop live in a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
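+
+A minimal sketch of this idea (the `StepNet` type and its `Run` call are hypothetical; only the `Scope` interface above is assumed):
+
+```cpp
+// One shared StepNet drives every timestep; each timestep gets a fresh local scope.
+auto rnn_scope = std::make_shared<Scope>(nullptr);  // scope holding the RNN's state
+for (int t = 0; t < num_steps; ++t) {
+  Scope step_scope(rnn_scope);   // local scope chained to its parent
+  step_net.Run(&step_scope);     // outer variables are resolved through parent_
+}                                // step_scope's local variables are destroyed here
+```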
+
+# Interface Design
+
+```cpp
+class Variable {
+ private:
+  Variable() = default;
+  friend class Scope;
+};
+
+class Scope {
+ private:
+  Scope(const std::shared_ptr<Scope>& parent = nullptr);
+
+ public:
+  static std::shared_ptr<Scope> Create(const std::shared_ptr<Scope>& parent = nullptr);
+
+  // returns nullptr if not found.
+  Variable* GetVariable(const std::string& name) const;
+
+  // fails if a variable with the same name already exists.
+  Variable* CreateVariable(const std::string& name);
+
+ private:
+  std::shared_ptr<Scope> parent_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+};
+```
+
+## Only scope can create a variable
+
+To ensure that `only scope can create a variable`, we make `Variable`'s constructor a private member function and make `Scope` a friend class of `Variable`. Then only `CreateVariable` can construct a `Variable`.
+
+## When a scope is destroyed, all variables inside it should be destroyed together
+
+The scope holds unique pointers to all of its variables. A user can `GetVariable` from a scope but should not hold that pointer as a member variable, because when the scope is destroyed, all variables inside it are destroyed together.
+
+## Sharing a parent scope
+
+A local scope contains a `parent_` pointer, so scopes form a linked list. We use a `shared_ptr` because while a local scope is in use, its parents must not be destroyed.
+
+Also, since the parent scope is held as a `shared_ptr`, we can only `Create()` a scope as a shared pointer; we cannot construct a scope as a plain variable, because such a scope could not be passed to another scope as its `parent` pointer.
+
+## Orthogonal interface
+
+`GetVariable` returns `nullptr` when `name` is not found, so it can double as a `Contains` method. `CreateVariable` returns an `Error` when there is a local name conflict. By combining `GetVariable` and `CreateVariable`, we can implement `CreateOrGetVariable` easily.
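+
+A short usage sketch of this interface (the variable names are illustrative only):
+
+```cpp
+auto global = Scope::Create();               // root scope, parent_ == nullptr
+auto local  = Scope::Create(global);         // local scope sharing `global` as parent
+
+Variable* w = global->CreateVariable("w");   // created in the root scope
+assert(local->GetVariable("w") == w);        // found through the parent_ chain
+assert(local->GetVariable("b") == nullptr);  // doubles as a Contains check
+```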
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst index e63ca11102c8ce457afcc3c262fa5f159361c01d..f15b11bd780402a3ec1755900e8c648f5d2a7bc5 100644 --- a/doc/getstarted/concepts/use_concepts_cn.rst +++ b/doc/getstarted/concepts/use_concepts_cn.rst @@ -111,7 +111,7 @@ PaddlePaddle支持不同类型的输入数据,主要包括四种类型,和 # define training dataset reader def train_reader(): train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]]) - train_y = np.array([-2, -3, -7, -7]) + train_y = np.array([[-2], [-3], [-7], [-7]]) def reader(): for i in xrange(train_y.shape[0]): yield train_x[i], train_y[i] diff --git a/go/cmake/CMakeDetermineGoCompiler.cmake b/go/cmake/CMakeDetermineGoCompiler.cmake deleted file mode 100644 index a9bb6906c7440782bd648bb7505a548248a11bb0..0000000000000000000000000000000000000000 --- a/go/cmake/CMakeDetermineGoCompiler.cmake +++ /dev/null @@ -1,44 +0,0 @@ -if(NOT CMAKE_Go_COMPILER) - if(NOT $ENV{GO_COMPILER} STREQUAL "") - get_filename_component(CMAKE_Go_COMPILER_INIT $ENV{GO_COMPILER} PROGRAM PROGRAM_ARGS CMAKE_Go_FLAGS_ENV_INIT) - - if(CMAKE_Go_FLAGS_ENV_INIT) - set(CMAKE_Go_COMPILER_ARG1 "${CMAKE_Go_FLAGS_ENV_INIT}" CACHE STRING "First argument to Go compiler") - endif() - - if(NOT EXISTS ${CMAKE_Go_COMPILER_INIT}) - message(SEND_ERROR "Could not find compiler set in environment variable GO_COMPILER:\n$ENV{GO_COMPILER}.") - endif() - - endif() - - set(Go_BIN_PATH - $ENV{GOPATH} - $ENV{GOROOT} - $ENV{GOROOT}/../bin - $ENV{GO_COMPILER} - /usr/bin - /usr/local/bin - ) - - if(CMAKE_Go_COMPILER_INIT) - set(CMAKE_Go_COMPILER ${CMAKE_Go_COMPILER_INIT} CACHE PATH "Go Compiler") - else() - find_program(CMAKE_Go_COMPILER - NAMES go - PATHS ${Go_BIN_PATH} - ) - EXEC_PROGRAM(${CMAKE_Go_COMPILER} ARGS version OUTPUT_VARIABLE GOLANG_VERSION) - STRING(REGEX MATCH "go[0-9]+.[0-9]+.[0-9]+[ /A-Za-z0-9]*" VERSION "${GOLANG_VERSION}") - message("-- The Golang compiler identification is ${VERSION}") - message("-- Check for working Golang compiler: ${CMAKE_Go_COMPILER}") - endif() - -endif() - -mark_as_advanced(CMAKE_Go_COMPILER) - -configure_file(${CMAKE_MODULE_PATH}/CMakeGoCompiler.cmake.in - ${CMAKE_PLATFORM_INFO_DIR}/CMakeGoCompiler.cmake @ONLY) - -set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER") diff --git a/go/cmake/CMakeGoCompiler.cmake.in b/go/cmake/CMakeGoCompiler.cmake.in deleted file mode 100644 index a71f08e064656fbaad8cfa77aea6f216515712ef..0000000000000000000000000000000000000000 --- a/go/cmake/CMakeGoCompiler.cmake.in +++ /dev/null @@ -1,8 +0,0 @@ -set(CMAKE_Go_COMPILER "@CMAKE_Go_COMPILER@") -set(CMAKE_Go_COMPILER_LOADED 1) - -set(CMAKE_Go_SOURCE_FILE_EXTENSIONS go) -set(CMAKE_Go_LINKER_PREFERENCE 40) -set(CMAKE_Go_OUTPUT_EXTENSION .o) -set(CMAKE_Go_OUTPUT_EXTENSION_REPLACE 1) -set(CMAKE_Go_COMPILER_ENV_VAR "GO_COMPILER") diff --git a/go/cmake/CMakeGoInformation.cmake b/go/cmake/CMakeGoInformation.cmake deleted file mode 100644 index ba51ac93fcd429478f324b66bd5129d94ea2a8f4..0000000000000000000000000000000000000000 --- a/go/cmake/CMakeGoInformation.cmake +++ /dev/null @@ -1,7 +0,0 @@ -if(NOT CMAKE_Go_COMPILE_OBJECT) - set(CMAKE_Go_COMPILE_OBJECT "go tool compile -l -N -o ") -endif() - -if(NOT CMAKE_Go_LINK_EXECUTABLE) - set(CMAKE_Go_LINK_EXECUTABLE "go tool link -o ") -endif() diff --git a/go/cmake/CMakeTestGoCompiler.cmake b/go/cmake/CMakeTestGoCompiler.cmake deleted file mode 100644 index b9891b015baced05b51e34dba562fd98a84fe14c..0000000000000000000000000000000000000000 --- a/go/cmake/CMakeTestGoCompiler.cmake +++ /dev/null @@ -1 +0,0 @@ 
-set(CMAKE_Go_COMPILER_WORKS 1 CACHE INTERNAL "") diff --git a/go/cmake/flags.cmake b/go/cmake/flags.cmake deleted file mode 100644 index a167c432a920e9ee93878603f3b946e8593412f6..0000000000000000000000000000000000000000 --- a/go/cmake/flags.cmake +++ /dev/null @@ -1,45 +0,0 @@ -# Setting Paddle Compile Flags -include(CheckCXXCompilerFlag) -include(CheckCCompilerFlag) -include(CheckCXXSymbolExists) -include(CheckTypeSize) - -function(CheckCompilerCXX11Flag) - if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU") - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8) - message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.") - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang" - # Apple Clang is a different compiler than upstream Clang which havs different version numbers. - # https://gist.github.com/yamaya/2924292 - if(APPLE) # cmake < 3.0 compiler id "Clang" on Mac OS X - if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 5.1) - message(FATAL_ERROR "Unsupported AppleClang version. AppleClang >= 5.1 required.") - endif() - else() - if (${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 3.3) - message(FATAL_ERROR "Unsupported Clang version. Clang >= 3.3 required.") - endif() - endif() - endif() -endfunction() - -CheckCompilerCXX11Flag() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") - -# Common gpu architectures: Kepler, Maxwell -foreach(capability 30 35 50) - list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}") -endforeach() - -if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0") - list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52") -endif() - -# Modern gpu architectures: Pascal -if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0") - list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60") -endif() - -set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS}) diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake deleted file mode 100644 index a5a43886f887e495500fa26b3c26fa69c63eded0..0000000000000000000000000000000000000000 --- a/go/cmake/golang.cmake +++ /dev/null @@ -1,48 +0,0 @@ -set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go") -file(MAKE_DIRECTORY ${GOPATH}) -set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle") -file(MAKE_DIRECTORY ${PADDLE_IN_GOPATH}) - -function(GO_LIBRARY NAME BUILD_TYPE) - if(BUILD_TYPE STREQUAL "STATIC") - set(BUILD_MODE -buildmode=c-archive) - set(LIB_NAME "lib${NAME}.a") - else() - set(BUILD_MODE -buildmode=c-shared) - if(APPLE) - set(LIB_NAME "lib${NAME}.dylib") - else() - set(LIB_NAME "lib${NAME}.so") - endif() - endif() - - file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") - file(RELATIVE_PATH rel ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) - - # find Paddle directory. - get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) - get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY) - get_filename_component(PADDLE_DIR ${PARENT_DIR} DIRECTORY) - - # automatically get all dependencies specified in the source code - # for given target. - add_custom_target(${NAME}_goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...) - - # make a symlink that references Paddle inside $GOPATH, so go get - # will use the local changes in Paddle rather than checkout Paddle - # in github. 
-  add_custom_target(${NAME}_copyPaddle
-    COMMAND rm -rf ${PADDLE_IN_GOPATH}/Paddle
-    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH}/Paddle)
-  add_dependencies(${NAME}_goGet ${NAME}_copyPaddle)
-
-  add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
-    ${CMAKE_GO_FLAGS} ${GO_SOURCE}
-    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-
-  add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
-  add_dependencies(${NAME} ${NAME}_goGet)
-
-endfunction(GO_LIBRARY)
diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go
index 25cd1cafcdf328094a019638f37f908591f5f374..54fa254863156455f66fa87de9077042a45f9735 100644
--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -1,45 +1,69 @@
 package main
 
 import (
+	"fmt"
 	"net"
 	"net/http"
 	"net/rpc"
 	"strconv"
+	"strings"
 	"time"
 
 	"github.com/namsral/flag"
+	log "github.com/sirupsen/logrus"
 
 	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
 )
 
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-
-	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
+	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
+	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
 	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
 	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
 	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
 	flag.Parse()
 
-	if *faultTolerance {
-		panic("fault tolernance not implemented.")
+	if *endpoints == "" {
+		log.Warningln("-endpoints not set, fault tolerance will not be enabled.")
+	}
+
+	var store master.Store
+	if *endpoints != "" {
+		eps := strings.Split(*endpoints, ",")
+		ip, err := networkhelper.GetExternalIP()
+		if err != nil {
+			log.Fatal(err)
+		}
+		addr := fmt.Sprintf("%s:%d", ip, *port)
+		store, err = master.NewEtcdClient(eps, addr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, *ttlSec)
+		if err != nil {
+			log.Fatal(err)
+		}
+	} else {
+		store = &master.InMemStore{}
+	}
+
+	s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
+	if err != nil {
+		log.Fatal(err)
 	}
 
-	s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
-	err := rpc.Register(s)
+	err = rpc.Register(s)
 	if err != nil {
-		panic(err)
+		log.Fatal(err)
 	}
 
 	rpc.HandleHTTP()
 	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
 	if err != nil {
-		panic(err)
+		log.Fatal(err)
 	}
 
 	err = http.Serve(l, nil)
 	if err != nil {
-		panic(err)
+		log.Fatal(err)
 	}
 }
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index f0be251c2471cc9ddc069f040417b5181a78c058..8a42d4f8af1713e246f9efaf5dc7ba878c3b271e 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -5,18 +5,42 @@ import (
 	"net/http"
 	"net/rpc"
 	"strconv"
+	"time"
 
 	"github.com/namsral/flag"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
+	log "github.com/sirupsen/logrus"
 )
 
 func main() {
 	port := flag.Int("port", 0, "port of the pserver")
+	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
+		"comma separated endpoint string for pserver to connect to etcd")
+	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
+	numPservers :=
flag.Int("num-pservers", 1, "total pserver count in a training job") + logLevel := flag.String("log-level", "info", + "log level, possible values: debug, info, warning, error, fatal, panic") flag.Parse() - s := pserver.NewService() - err := rpc.Register(s) + level, err := log.ParseLevel(*logLevel) + if err != nil { + panic(err) + } + log.SetLevel(level) + + timeout := time.Second * time.Duration((*etcdTimeout)) + e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout) + idx, err := e.Register() + if err != nil { + panic(err) + } + + s, err := pserver.NewService(idx) + if err != nil { + panic(err) + } + err = rpc.Register(s) if err != nil { panic(err) } @@ -27,7 +51,9 @@ func main() { panic(err) } + log.Infof("start pserver at port %d", *port) err = http.Serve(l, nil) + if err != nil { panic(err) } diff --git a/go/master/c/client.go b/go/master/c/client.go index b186474dc33138aeb02a2ffe34418b379b7a2db0..9e35e986002c0ae3b7593150ece96dba29a1521b 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -13,10 +13,13 @@ typedef int paddle_master_client; import "C" import ( + "strings" "sync" + "time" "unsafe" "github.com/PaddlePaddle/Paddle/go/master" + "github.com/coreos/etcd/clientv3" log "github.com/sirupsen/logrus" ) @@ -48,16 +51,33 @@ func remove(client C.paddle_master_client) *master.Client { return h } -type addresser string - -func (a addresser) Address() string { - return string(a) +//export paddle_new_etcd_master_client +func paddle_new_etcd_master_client(etcdEndpoints *C.char, timeout int, bufSize int) C.paddle_master_client { + p := C.GoString(etcdEndpoints) + cli, err := clientv3.New(clientv3.Config{ + Endpoints: strings.Split(p, ","), + DialTimeout: time.Second * time.Duration(timeout), + }) + if err != nil { + panic(err) + } + ch := make(chan string, 1) + a, err := master.GetKey(cli, master.DefaultAddrPath, timeout) + if err != nil { + panic(err) + } + ch <- a + go master.WatchKey(cli, master.DefaultAddrPath, ch) + c := master.NewClient(ch, bufSize) + return add(c) } //export paddle_new_master_client func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client { a := C.GoString(addr) - c := master.NewClient(addresser(a), bufSize) + ch := make(chan string, 1) + ch <- a + c := master.NewClient(ch, bufSize) return add(c) } diff --git a/go/master/client.go b/go/master/client.go index 8451820c1963dd5a4eff0c3ab7763eb6a8e05ba4..d3bea49d0a8166420e83478076cc7bc81e48598d 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -2,18 +2,12 @@ package master import ( "os" - "time" "github.com/PaddlePaddle/Paddle/go/connection" "github.com/PaddlePaddle/recordio" log "github.com/sirupsen/logrus" ) -// Addresser provide the address of the master server. -type Addresser interface { - Address() string -} - // Client is the client of the master server. type Client struct { conn *connection.Conn @@ -24,11 +18,11 @@ type Client struct { // // bufSize is the record buffer size. NextRecord will read from this // buffer. 
-func NewClient(addr Addresser, bufSize int) *Client { +func NewClient(addrCh <-chan string, bufSize int) *Client { c := &Client{} c.conn = connection.New() c.ch = make(chan []byte, bufSize) - go c.monitorMaster(addr) + go c.monitorMaster(addrCh) go c.getRecords() return c } @@ -72,12 +66,10 @@ func (c *Client) getRecords() { } } -func (c *Client) monitorMaster(addr Addresser) { +func (c *Client) monitorMaster(addrCh <-chan string) { lastMaster := "" - monitor := func() { - // get the lastest address of the master server, + for curMaster := range addrCh { // connect to the new address once address changed. - curMaster := addr.Address() if curMaster != lastMaster { if curMaster == "" { err := c.conn.Close() @@ -94,18 +86,10 @@ func (c *Client) monitorMaster(addr Addresser) { // to retry next time. curMaster = lastMaster } - } } - lastMaster = curMaster } - - monitor() - ticker := time.NewTicker(10 * time.Second) - for _ = range ticker.C { - monitor() - } } // SetDataset set dataset for the master server to dispatch. diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index 00fcca0e2cf44d0f4855fd366a8f80895abf8865..364dce7b58cf6366af711bde9107559a762563a4 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -26,12 +26,6 @@ func init() { log.SetLevel(log.ErrorLevel) } -type TestAddresser string - -func (a TestAddresser) Address() string { - return string(a) -} - func TestGetFinishTask(t *testing.T) { const path = "/tmp/master_client_test_0" @@ -45,11 +39,14 @@ func TestGetFinishTask(t *testing.T) { if err != nil { panic(err) } - go func(l net.Listener) { - s := NewService(chunkPerTask, time.Second, 1) + s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) + if err != nil { + panic(err) + } + server := rpc.NewServer() - err := server.Register(s) + err = server.Register(s) if err != nil { panic(err) } @@ -78,9 +75,11 @@ func TestGetFinishTask(t *testing.T) { // Manually intialize client to avoid calling c.getRecords() c := &Client{} c.conn = connection.New() - go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p))) + addr := fmt.Sprintf(":%d", p) + ch := make(chan string, 1) + ch <- addr + go c.monitorMaster(ch) c.SetDataset([]string{path}) - checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { diff --git a/go/master/client_test.go b/go/master/client_test.go index 2b3f873ecf3a650cd91d1d9c20b414b05bbb0cd6..c00aeebfd5d1fef6de4a8c67bf7f998a42ee863b 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -20,7 +20,6 @@ func TestNextRecord(t *testing.T) { path = "/tmp/master_client_TestFull" total = 50 ) - l, err := net.Listen("tcp", ":0") if err != nil { panic(err) @@ -31,11 +30,14 @@ func TestNextRecord(t *testing.T) { if err != nil { panic(err) } - go func(l net.Listener) { - s := master.NewService(10, time.Second, 1) + s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1) + if err != nil { + panic(err) + } + server := rpc.NewServer() - err := server.Register(s) + err = server.Register(s) if err != nil { panic(err) } @@ -59,10 +61,10 @@ func TestNextRecord(t *testing.T) { } w.Close() f.Close() - - c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10) + curAddr := make(chan string, 1) + curAddr <- fmt.Sprintf(":%d", p) + c := master.NewClient(curAddr, 10) c.SetDataset([]string{path}) - for pass := 0; pass < 50; pass++ { received := make(map[byte]bool) for i := 0; i < total; i++ { diff --git a/go/master/etcd_client.go 
b/go/master/etcd_client.go
new file mode 100644
index 0000000000000000000000000000000000000000..e27c014792f31ca27fe1a1636d69acccc4206ea3
--- /dev/null
+++ b/go/master/etcd_client.go
@@ -0,0 +1,172 @@
+package master
+
+import (
+	"context"
+	"time"
+
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
+	log "github.com/sirupsen/logrus"
+)
+
+const (
+	// DefaultLockPath is the default etcd master lock path.
+	DefaultLockPath = "/master/lock"
+	// DefaultStatePath is the default etcd key for master state.
+	DefaultStatePath = "/master/state"
+	// DefaultAddrPath is the default etcd key for master address.
+	DefaultAddrPath = "/master/addr"
+)
+
+// EtcdClient is the etcd client that the master uses for fault
+// tolerance and service registry.
+type EtcdClient struct {
+	lockPath  string
+	statePath string
+	client    *clientv3.Client
+	lock      *concurrency.Mutex
+}
+
+// NewEtcdClient creates a new EtcdClient.
+func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
+	log.Debugf("Connecting to etcd at %v", endpoints)
+	// TODO(helin): gracefully shutdown the etcd store. Because the etcd
+	// store holds an etcd lock, even though the lock will expire
+	// when the lease times out, we need to implement graceful
+	// shutdown to release the lock.
+	cli, err := clientv3.New(clientv3.Config{
+		Endpoints:   endpoints,
+		DialTimeout: dialTimeout,
+	})
+	if err != nil {
+		return nil, err
+	}
+
+	sess, err := concurrency.NewSession(cli, concurrency.WithTTL(ttlSec))
+	if err != nil {
+		return nil, err
+	}
+
+	lock := concurrency.NewMutex(sess, lockPath)
+	// It's fine for the lock to get stuck: in that case we have
+	// multiple master servers running (the job is configured to have
+	// only one master running, but the split-brain problem may cause
+	// multiple master servers to run), and the cluster management
+	// software will kill one of them.
+	log.Debugf("Trying to acquire lock at %s.", lockPath)
+	err = lock.Lock(context.TODO())
+	if err != nil {
+		return nil, err
+	}
+	log.Debugf("Successfully acquired lock at %s.", lockPath)
+
+	put := clientv3.OpPut(addrPath, string(addr))
+	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return nil, err
+	}
+
+	if !resp.Succeeded {
+		log.Fatal("No longer owns the master lock. Exiting.")
+	}
+
+	e := &EtcdClient{
+		lockPath:  lockPath,
+		statePath: statePath,
+		client:    cli,
+		lock:      lock,
+	}
+
+	return e, nil
+}
+
+// Save saves the state into etcd.
+func (e *EtcdClient) Save(state []byte) error {
+	ctx := context.TODO()
+	put := clientv3.OpPut(e.statePath, string(state))
+	resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
+	if err != nil {
+		return err
+	}
+
+	if !resp.Succeeded {
+		log.Errorln("No longer owns the lock, trying to lock again")
+		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+		err := e.lock.Lock(ctx)
+		cancel()
+		if err != nil {
+			// We lost the master lock and cannot acquire
+			// it back, which means some other master is
+			// already started. We don't want the cluster
+			// management system to kill the master server
+			// that is holding the lock and running
+			// correctly. So the most feasible solution is
+			// to kill the current master server. The current
+			// state is not saved, but the trainer's RPC
+			// call will fail, so the trainer will retry.
+			log.Fatalf("Could not acquire the lock at %s: %v. Exiting.", e.lockPath, err)
Exiting.", e.lockPath, err) + } + log.Infof("Successfully acquired lock at %s.", e.lockPath) + return e.Save(state) + } + + return nil +} + +// Load loads the state from etcd. +func (e *EtcdClient) Load() ([]byte, error) { + ctx := context.TODO() + get := clientv3.OpGet(e.statePath) + + resp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(get).Commit() + if err != nil { + return nil, err + } + + if !resp.Succeeded { + log.Errorln("No longer owns the lock, trying to lock and load again.") + err = e.lock.Lock(context.Background()) + if err != nil { + return nil, err + } + + return e.Load() + } + + kvs := resp.Responses[0].GetResponseRange().Kvs + if len(kvs) == 0 { + // No state exists + return nil, nil + } + + state := kvs[0].Value + return state, nil +} + +// GetKey gets the value by the specify key. +func GetKey(c *clientv3.Client, key string, timeout int) (string, error) { + ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout)) + resp, err := c.Get(ctx, key) + cancel() + if err != nil { + return "", err + } + kvs := resp.Kvs + if len(kvs) == 0 { + return "", nil + } + v := kvs[0].Value + return string(v), nil +} + +// WatchKey watches the specify key and send to valChan if there is some event. +func WatchKey(c *clientv3.Client, key string, valChan chan<- string) { + rch := c.Watch(context.Background(), key) + for wresp := range rch { + for _, ev := range wresp.Events { + // if received event is DELETE, the value will be an empty string + log.Infof("received event %s, %q : %q\n", ev.Type, ev.Kv.Key, ev.Kv.Value) + valChan <- string(ev.Kv.Value) + } + } +} diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go new file mode 100644 index 0000000000000000000000000000000000000000..bcd549b20e46381783bad11caa08cb7f4ba40add --- /dev/null +++ b/go/master/inmem_store.go @@ -0,0 +1,28 @@ +package master + +import "sync" + +// InMemStore is an in memory implementation of Store interface. +// +// It does not tolerate the fault that casues the program to crash. +type InMemStore struct { + mu sync.Mutex + buf []byte +} + +// Save saves the state into the in-memory store. +func (m *InMemStore) Save(state []byte) error { + m.mu.Lock() + defer m.mu.Unlock() + + m.buf = state + return nil +} + +// Load loads the state from the in-memory store. +func (m *InMemStore) Load() ([]byte, error) { + m.mu.Lock() + defer m.mu.Unlock() + + return m.buf, nil +} diff --git a/go/master/service.go b/go/master/service.go index 55e1e2d1a4a5cd6f5d5797b247e2ebe433607576..58e68e744859933aa74cac231356d4ff9dfb8d7b 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -1,6 +1,9 @@ package master import ( + "bytes" + "compress/gzip" + "encoding/gob" "errors" "os" "path/filepath" @@ -12,24 +15,54 @@ import ( "github.com/PaddlePaddle/recordio" ) +const ( + dialTimeout = 5 * time.Second +) + +// Store is the interface for save and load the master state. +type Store interface { + Save([]byte) error + Load() ([]byte, error) +} + +// Chunk is a chunk of data consisted of several data instances. +type Chunk struct { + Path string + Index recordio.Index // chunk index +} + +// Task is the basic unit of data instances assigned to trainers. +type Task struct { + ID int + Chunks []Chunk +} + +type taskEntry struct { + Epoch int + NumTimeout int + Task Task +} + +type taskQueues struct { + Todo []taskEntry + Pending map[int]taskEntry // map from task ID to task entry + Done []taskEntry + Failed []Task +} + // Service is the master server service. 
 type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
 	timeoutMax    int
 	ready         chan struct{}
+	store         Store
 
 	mu         sync.Mutex
 	initDone   bool
 	taskQueues taskQueues
 }
 
-// Recover recovers service state from etcd.
-func Recover() (*Service, error) {
-	// TODO(helin): recover from snapshot state from etcd.
-	return nil, nil
-}
-
 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	id := 0
 	if chunksPerTask <= 0 {
@@ -58,7 +91,7 @@
 }
 
 // NewService creates a new service.
-func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
 	s := &Service{}
 	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
@@ -66,38 +99,82 @@ func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Se
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
 	s.ready = make(chan struct{})
-	return s
-}
+	s.store = store
+	recovered, err := s.recover()
+	if err != nil {
+		return nil, err
+	}
 
-// Chunk is a chunk of data consisted of several data instances.
-type Chunk struct {
-	Path  string
-	Index recordio.Index // chunk index
-}
+	if recovered {
+		// Recovered. Now the state is already initialized,
+		// and the master is ready.
+		s.initDone = true
+		close(s.ready)
+		log.Info("Master recovered from saved state.")
+	}
 
-// Task is the basic unit of data instances assigned to trainers.
-type Task struct {
-	ID     int
-	Chunks []Chunk
+	return s, nil
 }
 
-type taskEntry struct {
-	Epoch      int
-	NumTimeout int
-	Task       Task
-}
+// recover recovers service state from etcd.
+func (s *Service) recover() (bool, error) {
+	state, err := s.store.Load()
+	if err != nil {
+		return false, err
+	}
 
-type taskQueues struct {
-	Todo    []taskEntry
-	Pending map[int]taskEntry // map from task ID to task entry
-	Done    []taskEntry
-	Failed  []Task
+	if state == nil {
+		log.Infoln("No state exists, not recovered.")
+		return false, nil
+	}
+
+	log.Infof("Loaded snapshot of size: %d bytes.", len(state))
+	gr, err := gzip.NewReader(bytes.NewReader(state))
+	if err != nil {
+		return false, err
+	}
+
+	dec := gob.NewDecoder(gr)
+	var tqs taskQueues
+	err = dec.Decode(&tqs)
+	if err != nil {
+		return false, err
+	}
+
+	err = gr.Close()
+	if err != nil {
+		// Only the close failed; the recovery actually succeeded, so
+		// just log the error.
+		log.Errorln(err)
+	}
+
+	s.taskQueues = tqs
+	return true, nil
 }
 
-// *must* be called with s.mu being held.
+// snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TODO(helin): snapshot state on etcd.
-	return nil
+	// TODO(helin): etcd request has a size limit, so the snapshot
+	// size is limited by the max request size. We should either
+	// divide the snapshot into smaller chunks and save under
+	// different keys, or configure the request size to be big
+	// enough:
+	// https://github.com/coreos/etcd/blob/2f84f3d8d8ed8f9537ab6ffa44a3a1c7eddfa9b1/embed/config.go#L44
+	var buf bytes.Buffer
+	gw := gzip.NewWriter(&buf)
+	enc := gob.NewEncoder(gw)
+	err := enc.Encode(s.taskQueues)
+	if err != nil {
+		return err
+	}
+	err = gw.Close()
+	if err != nil {
+		return err
+	}
+
+	state := buf.Bytes()
+	log.Infof("Saving snapshot of size: %d bytes.", len(state))
+	return s.store.Save(state)
 }
 
 func readChunks(globPaths []string) ([]Chunk, error) {
@@ -207,12 +284,12 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 
 		t.NumTimeout++
 		if t.NumTimeout > s.timeoutMax {
-			log.Warningf("Task %v timed out %d times, discard.\n", t.Task, t.NumTimeout)
+			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
 			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
 			return
 		}
 
-		log.Warningf("Task %v timed out %d times, retry.\n", t.Task, t.NumTimeout)
+		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
 		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
 	}
 }
diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/cclient/CMakeLists.txt
index b3e79ca661d0832821628b7cc6b540e17db45118..e12cf880683958bff54e541f7c391b8c1e1281de 100644
--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
@@ -1,16 +1,4 @@
-cmake_minimum_required(VERSION 3.0)
-
-get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
-get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
-
-project(cxx_go C Go)
-
-include(golang)
-include(flags)
-
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags)
-
 go_library(paddle_pserver_cclient STATIC)
 
 if(WITH_TESTING)
   add_subdirectory(test)
diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go
index 92a41b7f5434842c6318704dd85adf9e51c19944..bbaf43d9f1434a278568bc110a709718b9b8c222 100644
--- a/go/pserver/cclient/cclient.go
+++ b/go/pserver/cclient/cclient.go
@@ -133,7 +133,7 @@ func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter,
 
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -200,7 +200,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		for i, p := range ps {
 			pn[i] = p.Name
 		}
-		log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
+		log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", "))
 		return C.PSERVER_ERROR
 	}
 
@@ -210,7 +210,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		for i, p := range ps {
 			pn[i] = p.Name
 		}
-		log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
+		log.Errorf("pserver returned wrong parameters, or not in requested order.
Requested: %s, returned: %s.", strings.Join(pn, ", "), strings.Join(ns, ", ")) return C.PSERVER_ERROR } } diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt index 722bd45d2f0838ea2d907004808052c21ebe8bd6..eddce640b5888060b2bb4cfed8a446185c745fc2 100644 --- a/go/pserver/cclient/test/CMakeLists.txt +++ b/go/pserver/cclient/test/CMakeLists.txt @@ -1,18 +1 @@ -cmake_minimum_required(VERSION 3.0) - -add_executable(test_cclient test_cclient.c) -add_dependencies(test_cclient paddle_pserver_cclient) - -if(APPLE) - set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") -else() - set(CMAKE_EXE_LINKER_FLAGS "-pthread") -endif() - -if(PROJ_ROOT) - include_directories(${CMAKE_CURRENT_BINARY_DIR}/..) - target_link_libraries(test_cclient ${CMAKE_CURRENT_BINARY_DIR}/../libpaddle_pserver_cclient.a pthread) -else(PROJ_ROOT) - include_directories(${CMAKE_BINARY_DIR}) - target_link_libraries(test_cclient ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread) -endif(PROJ_ROOT) +cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient) \ No newline at end of file diff --git a/go/pserver/client.go b/go/pserver/client.go index dda915977282d4880ddcc8c18ef6fd80ede9e01b..6938b9d5ce6f6d73c05bd6e3154777023965c319 100644 --- a/go/pserver/client.go +++ b/go/pserver/client.go @@ -1,6 +1,7 @@ package pserver import ( + "errors" "hash/fnv" "sort" "time" @@ -123,6 +124,9 @@ func (c *Client) FinishInitParams() error { // SendGrads sends gradients to parameter servers for updating // parameters. func (c *Client) SendGrads(grads []Gradient) error { + if len(grads) == 0 { + return errors.New("no gradient received") + } errCh := make(chan error, len(grads)) for _, g := range grads { go func(g Gradient) { diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go index d746bf3f26949551778194c8659575a3198e112d..a248a3fb696a1e03b799f89afceb255de68662b1 100644 --- a/go/pserver/client_test.go +++ b/go/pserver/client_test.go @@ -31,9 +31,12 @@ func init() { port[i] = p go func(l net.Listener) { - s := pserver.NewService() + s, err := pserver.NewService(0) + if err != nil { + panic(err) + } server := rpc.NewServer() - err := server.Register(s) + err = server.Register(s) if err != nil { panic(err) } diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go new file mode 100644 index 0000000000000000000000000000000000000000..4d88243edd4aa817ddc263ba316a3f6be9e1e67f --- /dev/null +++ b/go/pserver/etcd_client.go @@ -0,0 +1,181 @@ +package pserver + +import ( + "context" + "errors" + "strconv" + "strings" + "time" + + "github.com/PaddlePaddle/Paddle/go/utils/networkhelper" + "github.com/coreos/etcd/clientv3" + "github.com/coreos/etcd/clientv3/concurrency" + log "github.com/sirupsen/logrus" +) + +// EtcdClient is the etcd client that the pserver uses for fault +// tolerance, service registry and coordination. +type EtcdClient struct { + numPservers int + etcdEndpoints string + etcdClient *clientv3.Client + // etcdTimeout is also used as retry intervals. + etcdTimeout time.Duration + // FIXME: ensure GetExternalIP gets the correct ip for trainers to connect. + externalIP string + // desired number of pservers in the job. + // assume desired will not change during one training job. 
+ desired int +} + +// NewEtcdClient creates an EtcdClient +func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient { + return &EtcdClient{ + etcdTimeout: timeout, + numPservers: numPservers, + etcdEndpoints: endpoints, + } +} + +// Register registers the pserver on etcd +// +// Register returns the index of the current pserver. +func (e *EtcdClient) Register() (int, error) { + + var err error + e.externalIP, err = networkhelper.GetExternalIP() + if err != nil { + return 0, err + } + + // initialize connection to etcd. + ep := strings.Split(e.etcdEndpoints, ",") + for { + cli, err := clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: e.etcdTimeout, + }) + if err != nil { + log.Errorf("connect to etcd error: %v", err) + time.Sleep(e.etcdTimeout) + continue + } + e.etcdClient = cli + log.Debugf("initialized etcd client to %s", e.etcdEndpoints) + break + } + // Initialize /ps_desired using a transaction, since multiple pservers may + // try to write it at the same time. + for { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + _, err := e.initDesiredPservers(ctx, e.numPservers) + cancel() + if err != nil { + log.Warn(err) + time.Sleep(e.etcdTimeout) + continue + } + break + } + // TODO: when implementing extending or reducing pservers, /ps_desired is + // changed, then we need to watch /ps_desired node for events. For now, just + // write once when init and read from it. + // Wait for /ps_desired and use it to set the initial value of e.desired. + for { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + resp, err := e.etcdClient.Get(ctx, PsDesired) + cancel() + if err != nil { + log.Errorf("getting %s error: %v", PsDesired, err) + time.Sleep(e.etcdTimeout) + continue + } + if len(resp.Kvs) != 0 { + e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) + if err != nil { + log.Errorf("value of %s invalid: %v", PsDesired, err) + time.Sleep(e.etcdTimeout) + // NOTE: wait until the ps_desired value changes + continue + } + break + } + } + + var pserverIdx int + // Try to register the pserver node on etcd. + for { + ctx, cancel := context.WithTimeout(context.Background(), time.Second) + var err error + pserverIdx, err = e.registerPserverEtcd(ctx) + cancel() + if err != nil { + log.Warn(err) + time.Sleep(e.etcdTimeout) + continue + } + break + } + + return pserverIdx, nil +} + +func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (*clientv3.TxnResponse, error) { + return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + dsStr := c.Get(PsDesired) + if dsStr == "" { + c.Put(PsDesired, strconv.Itoa(numPservers)) + } + return nil + }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) +} + +// registerPserverEtcd registers the pserver node on etcd using a transaction. 
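+// It scans the keys /ps/0 through /ps/(desired-1) inside one STM transaction, +// claims the first empty slot by writing this pserver's external IP under it, +// and attaches a 5-second lease that a background goroutine keeps alive, so a +// crashed pserver frees its slot automatically. The returned index is the +// number of the claimed slot.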
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) { + var idx int + _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + registered := false + for i := 0; i < e.desired; i++ { + psKey := "/ps/" + strconv.Itoa(i) + log.Debugf("checking %s", psKey) + ps := c.Get(psKey) + log.Debugf("got value (%s) for key: %s", ps, psKey) + + if ps == "" { + resp, err := e.etcdClient.Grant(context.TODO(), 5) + if err != nil { + log.Fatal(err) + } + // Found the first empty slot; write this pserver's info under it. + c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID)) + log.Debugf("set pserver node %s with value %s", psKey, e.externalIP) + ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) + if kaerr != nil { + log.Errorf("keepalive etcd node error: %v", kaerr) + return kaerr + } + + // Consume the keep-alive messages so etcd + // will not expire the lease. + go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { + for ka := range ch { + log.Debugf("keepalive: %d", ka.TTL) + } + }(ch) + log.Debug("register finished") + idx = i + registered = true + break + } + } + if registered { + return nil + } + return errors.New("not registered, possibly because all pserver slots are already taken") + }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) + + if err != nil { + return 0, err + } + + return idx, nil +} diff --git a/go/pserver/service.go b/go/pserver/service.go index 555d379bcbe639a9af59f55a5275b4e4dcf1887c..e15a4e5a58a3bb1a154157b1212d141478e96231 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -24,6 +24,9 @@ const ( Float64 ) +// PsDesired is the etcd path that stores the desired count of pservers +const PsDesired = "/ps_desired" + // Parameter is a piece of data to sync with the parameter server. type Parameter struct { Name string @@ -43,17 +46,21 @@ type Gradient Parameter // Service is the RPC service for pserver. type Service struct { initialized chan struct{} + idx int mu sync.Mutex optMap map[string]*optimizer } -// NewService creates a new service. -func NewService() *Service { - s := &Service{} - s.optMap = make(map[string]*optimizer) +// NewService creates a new service; etcd registration is bypassed when no +// endpoints are specified. +func NewService(idx int) (*Service, error) { + s := &Service{ + idx: idx, + } + s.optMap = make(map[string]*optimizer) s.initialized = make(chan struct{}) - return s + return s, nil } // InitParam initializes a parameter. 
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 57397fe586e11f74014ee2d1a0e7ab7f4b6c3cd3..c62f92e09bb3d65aa9552e3624e5d8e1a1945e56 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -10,8 +10,12 @@ import ( "github.com/PaddlePaddle/Paddle/go/pserver" ) -func TestServiceFull(t *testing.T) { - s := pserver.NewService() + +func TestFull(t *testing.T) { + s, err := pserver.NewService(0) + if err != nil { + t.Error(err) + } var p pserver.Parameter p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} @@ -79,8 +83,11 @@ func TestServiceFull(t *testing.T) { } func TestMultipleInit(t *testing.T) { - s := pserver.NewService() - err := s.FinishInitParams(0, nil) + s, err := pserver.NewService(0) + if err != nil { + t.Error(err) + } + err = s.FinishInitParams(0, nil) if err != nil { t.FailNow() } @@ -92,15 +99,18 @@ func TestMultipleInit(t *testing.T) { } func TestUninitialized(t *testing.T) { - s := pserver.NewService() - err := s.SendGrad(pserver.Gradient{}, nil) + s, err := pserver.NewService(0) + err = s.SendGrad(pserver.Gradient{}, nil) if err.Error() != pserver.Uninitialized { t.FailNow() } } func TestBlockUntilInitialized(t *testing.T) { - s := pserver.NewService() + s, err := pserver.NewService(0) + if err != nil { + t.Error(err) + } ch := make(chan struct{}, 2) errCh := make(chan error, 2) var wg sync.WaitGroup @@ -145,6 +155,7 @@ func TestBlockUntilInitialized(t *testing.T) { t.Fatalf("read optimizer proto failed") } err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil) + if err != nil { t.FailNow() } diff --git a/go/utils/networkhelper/helper.go b/go/utils/networkhelper/helper.go new file mode 100644 index 0000000000000000000000000000000000000000..fbeaea8f5e7d93309befbd23063e474a4c6df46e --- /dev/null +++ b/go/utils/networkhelper/helper.go @@ -0,0 +1,45 @@ +package networkhelper + +import ( + "errors" + "net" +) + +// GetExternalIP returns the ip address of local network interface, not the +// loopback device. 
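+// It walks the machine's network interfaces, skips those that are down or +// are loopback devices, and returns the first IPv4 address it finds.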
+func GetExternalIP() (string, error) { + ifaces, err := net.Interfaces() + if err != nil { + return "", err + } + for _, iface := range ifaces { + if iface.Flags&net.FlagUp == 0 { + continue // interface down + } + if iface.Flags&net.FlagLoopback != 0 { + continue // loopback interface + } + addrs, err := iface.Addrs() + if err != nil { + return "", err + } + for _, addr := range addrs { + var ip net.IP + switch v := addr.(type) { + case *net.IPNet: + ip = v.IP + case *net.IPAddr: + ip = v.IP + } + if ip == nil || ip.IsLoopback() { + continue + } + ip = ip.To4() + if ip == nil { + continue // not an ipv4 address + } + return ip.String(), nil + } + } + return "", errors.New("are you connected to the network?") +} diff --git a/go/utils/networkhelper/helper_test.go b/go/utils/networkhelper/helper_test.go new file mode 100644 index 0000000000000000000000000000000000000000..4208f9e358fc4345b73a2b8a9211b8889c1190d8 --- /dev/null +++ b/go/utils/networkhelper/helper_test.go @@ -0,0 +1,10 @@ +package networkhelper + +import "testing" + +func TestGetIP(t *testing.T) { + _, err := GetExternalIP() + if err != nil { + t.Errorf("GetExternalIP returns error : %v\n", err) + } +} diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt index 573bd937a351a6f308974e14f3bc92cbe1b541bc..307e99bbe3a833f1fe26057ec38d0b96e04bc0fe 100644 --- a/paddle/CMakeLists.txt +++ b/paddle/CMakeLists.txt @@ -9,17 +9,10 @@ add_subdirectory(pserver) add_subdirectory(trainer) add_subdirectory(scripts) add_subdirectory(optimizer) -add_subdirectory(strings) - -# Do not build go directory until go cmake is working smoothly. -# if(CMAKE_Go_COMPILER) -# add_subdirectory(go) -# endif() - -find_package(Boost QUIET) +add_subdirectory(string) if(Boost_FOUND) - include_directories(${Boost_INCLUDE_DIRS}) + add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) endif() diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index f2315e31cc06d8b5fea7a9fd203a697bac603a90..39d8aa075bc072d37dc8df67746f0d2b503418a6 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -16,7 +16,7 @@ set(API_HEADER Internal.h) add_library(paddle_api STATIC ${API_SOURCES}) -add_dependencies(paddle_api gen_proto_cpp paddle_trainer_lib) +add_dependencies(paddle_api paddle_proto paddle_trainer_lib) INCLUDE(${SWIG_USE_FILE}) INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle) diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt index 206f512563466d40e9ad1db0ddb4753ffb6bf55a..11022d17541476c97a2b29be8eb8fecce7e39435 100644 --- a/paddle/capi/CMakeLists.txt +++ b/paddle/capi/CMakeLists.txt @@ -26,7 +26,7 @@ target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER} ${CAPI_PRIVATE_HEADER}) -add_dependencies(paddle_capi gen_proto_cpp) +add_dependencies(paddle_capi paddle_proto) # combine all paddle static libraries together, into libpaddle_capi_whole.a diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index f9061e96deb659dcf7bfb88b46e6509af0425199..73ffa690d9d91b673079fc0ecf91f17cbabfdb1e 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -83,7 +83,7 @@ else() ${CUDA_CXX_SOURCES}) endif() -add_dependencies(paddle_cuda ${external_project_dependencies}) +add_dependencies(paddle_cuda paddle_proto ${external_project_dependencies}) add_style_check_target(paddle_cuda ${CUDA_SOURCES} diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 
673cfa19ac35116288a2481b85858b6f88f3378e..6aa6b9bc2db6a223dd8562b76ba9d777206bfd40 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -1,4 +1,7 @@ +# ddim lib cc_library(ddim SRCS ddim.cc) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) - nv_test(dim_test SRCS dim_test.cu DEPS ddim) +cc_test(variable_test SRCS variable_test.cc) +cc_test(scope_test SRCS scope_test.cc) +cc_test(enforce_test SRCS enforce_test.cc) diff --git a/paddle/framework/ddim_test.cc b/paddle/framework/ddim_test.cc index e5c84d7abe9d476f285c8c5cd904d2e570eb0e4f..36eef02370e0196c2af2c05f49176b70ce69235a 100644 --- a/paddle/framework/ddim_test.cc +++ b/paddle/framework/ddim_test.cc @@ -1,5 +1,3 @@ -//#include -//#include #include <sstream> #include <vector> diff --git a/paddle/framework/enforce.h b/paddle/framework/enforce.h new file mode 100644 index 0000000000000000000000000000000000000000..56cb7f95647e81efef58b156002d0d378ee22820 --- /dev/null +++ b/paddle/framework/enforce.h @@ -0,0 +1,69 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include <paddle/string/printf.h> +#include <exception> +#include <sstream> + +namespace paddle { +namespace framework { + +/** + * @brief Enforce exception. Inherits std::exception. + * + * Whenever an enforced condition is not met, an EnforceNotMet exception is + * thrown. + */ +class EnforceNotMet : public std::exception { + public: + EnforceNotMet(const std::string& msg, const char* file, int fileline) { + std::ostringstream sout; + sout << msg << " at [" << file << ":" << fileline << "];"; + all_msg_ = sout.str(); + } + + const char* what() const noexcept override { return all_msg_.c_str(); } + + private: + std::string all_msg_; +}; + +// From https://stackoverflow.com/questions/30130930/ +// __builtin_expect is a compiler builtin, not part of the C++11 standard. Since +// the enforced condition should be true in most situations, the `UNLIKELY` +// macro helps the compiler generate faster code. +#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0) + +/** + * @brief Throw an EnforceNotMet exception, with __FILE__ and __LINE__ filled + * in automatically. + * + * This macro takes __VA_ARGS__; users can pass any type that can be + * serialized to std::ostream. + */ +#define PADDLE_THROW(...) \ do { \ throw ::paddle::framework::EnforceNotMet( \ ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ } while (0) + +/** + * @brief Enforce a condition, otherwise throw an EnforceNotMet + */ +#define PADDLE_ENFORCE(condition, ...) \ do { \ if (UNLIKELY(!(condition))) { \ PADDLE_THROW(__VA_ARGS__); \ } \ } while (0) + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/enforce_test.cc b/paddle/framework/enforce_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..f8da1a192f63a54324d80725c9d2f156fb11a481 --- /dev/null +++ b/paddle/framework/enforce_test.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <paddle/framework/enforce.h> +#include <gtest/gtest.h> + +TEST(ENFORCE, OK) { + PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345); + size_t val = 1; + const size_t limit = 10; + PADDLE_ENFORCE(val < limit, "Enforce is OK too"); +} + +TEST(ENFORCE, FAILED) { + bool in_catch = false; + try { + PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); + } catch (const paddle::framework::EnforceNotMet& err) { + in_catch = true; + std::string msg = "Enforce is not ok 123 at all"; + const char* what = err.what(); + for (size_t i = 0; i < msg.length(); ++i) { + ASSERT_EQ(what[i], msg[i]); + } + } + ASSERT_TRUE(in_catch); +} \ No newline at end of file diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h new file mode 100644 index 0000000000000000000000000000000000000000..a4470f726fb0d59a82db29b3239c111ea1569c55 --- /dev/null +++ b/paddle/framework/scope.h @@ -0,0 +1,95 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <memory> +#include <string> +#include <unordered_map> + +#include "paddle/framework/variable.h" + +namespace paddle { +namespace framework { + +/** + * @brief Scope that manages all variables. + * + * Scope is an association of a name to Variable. All variables belong to + * Scope. You need to specify a scope to run a Net, i.e., `net.Run(&scope)`. + * One net can run in different scopes and update different variables in the + * scope. + */ +class Scope { + public: + /** + * @brief Initialize a Scope without parent. + */ + Scope() {} + + /** + * @brief Initialize a Scope with parent. + */ + explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {} + + /** + * @brief Create Variable + * + * Create a Variable in this Scope. Return the existing one if the Variable + * has already been created. + */ + Variable* CreateVariable(const std::string& name) { + auto var = GetVariable(name); + if (var) { + return var; + } else { + vars_[name] = std::unique_ptr<Variable>(new Variable()); + return GetVariable(name); + } + } + + /** + * @brief Get Variable. + * + * Get a Variable from this Scope; this function will recursively find the + * Variable in its parent scope. Return nullptr if not found. + */ + Variable* GetVariable(const std::string& name) const { + auto it = vars_.find(name); + if (it != vars_.end()) { + return it->second.get(); + } else if (parent_ != nullptr) { + return parent_->GetVariable(name); + } else { + return nullptr; + } + } + + /** + * @brief Whether this scope has a Variable named `name`. 
+ * + * Find if there is a Variable in this scope or its parent scope. + */ + bool HasVariable(const std::string& name) const { + return (vars_.find(name) != vars_.end() || + (parent_ && parent_->HasVariable(name))); + } + + private: + std::unordered_map<std::string, std::unique_ptr<Variable>> vars_; + std::shared_ptr<Scope> parent_{nullptr}; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..df1afb200ce9e75c5b1e40f2da56667487ae3576 --- /dev/null +++ b/paddle/framework/scope_test.cc @@ -0,0 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/framework/scope.h" +#include "gtest/gtest.h" + +TEST(Scope, Create) { + using paddle::framework::Scope; + using paddle::framework::Variable; + + auto scope = std::make_shared<Scope>(); + + Variable* var0 = scope->CreateVariable(""); + EXPECT_NE(var0, nullptr); + + /// GetVariable will return nullptr if the variable does not exist. + Variable* var1 = scope->GetVariable("a"); + EXPECT_EQ(var1, nullptr); + + /// CreateVariable will create and return one. + Variable* var2 = scope->CreateVariable("a"); + EXPECT_NE(var2, nullptr); + + /// Get the created variable. + Variable* var3 = scope->GetVariable("a"); + EXPECT_EQ(var2, var3); + + /// CreateVariable will just return the variable if it + /// already exists. + Variable* var4 = scope->CreateVariable("a"); + EXPECT_EQ(var4, var2); +} + +TEST(Scope, Parent) { + using paddle::framework::Scope; + using paddle::framework::Variable; + + auto parent_scope = std::make_shared<Scope>(); + auto scope = std::make_shared<Scope>(parent_scope); + + Variable* var0 = parent_scope->CreateVariable("a"); + EXPECT_NE(var0, nullptr); + + /// GetVariable will get the Variable from the parent scope if it exists. + Variable* var1 = scope->GetVariable("a"); + EXPECT_EQ(var0, var1); +} diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h new file mode 100644 index 0000000000000000000000000000000000000000..72c4a7a2a1d1cf93a784f24e687727ee8481484c --- /dev/null +++ b/paddle/framework/variable.h @@ -0,0 +1,71 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ +#pragma once + +#include <memory> +#include <typeindex> +#include <typeinfo> + +#include "paddle/platform/assert.h" + +namespace paddle { +namespace framework { + +class Variable { + public: + template <typename T> + const T& Get() const { + PADDLE_ASSERT(IsType<T>()); + return *static_cast<const T*>(holder_->Ptr()); + } + + template <typename T> + T* GetMutable() { + if (!IsType<T>()) { + holder_.reset(new PlaceholderImpl<T>(new T())); + } + return static_cast<T*>(holder_->Ptr()); + } + + template <typename T> + bool IsType() const { + return holder_ != nullptr && + std::type_index(typeid(T)) == std::type_index(holder_->Type()); + } + + private: + struct Placeholder { + virtual ~Placeholder() {} + virtual const std::type_info& Type() const = 0; + virtual void* Ptr() const = 0; + }; + + // Placeholder hides type T, so it doesn't appear as a template + // parameter of Variable. + template <typename T> + struct PlaceholderImpl : public Placeholder { + PlaceholderImpl(T* ptr) : ptr_(ptr), type_(typeid(T)) {} + + virtual const std::type_info& Type() const { return type_; } + virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); } + + std::unique_ptr<T> ptr_; + const std::type_info& type_; + }; + + std::unique_ptr<Placeholder> + holder_; // points to a PlaceholderImpl object. +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/variable.md b/paddle/framework/variable.md new file mode 100644 index 0000000000000000000000000000000000000000..f44d5ea46e7ce98dd443d684ad42308496bc4179 --- /dev/null +++ b/paddle/framework/variable.md @@ -0,0 +1,52 @@ +# Design Doc: Variable + + +Variable is also known as *blob* in MXNet and Caffe2. It is the input and output type of operators, where a neural network is a graph of operators. + +## Requirements: Lazy Memory Allocation + +For the flexibility of a DL system, a variable should be able to contain any typed value -- a tensor in most cases, but could also be some integer IDs or a scope of other variables in the case of RNN. + +To use the minimum amount of memory, we'd like a variable to allocate memory only when it has to, i.e., lazy memory allocation. Let's take the following example: + +```cpp +Variable vr, v1, v2; + +Tensor* t1 = new Tensor(); +Tensor* t2 = new Tensor(); + +Randomize( + /* malloc */ v1.GetMutable<Tensor>().mutable_data<float>(DDim(100,200)), + /* size */ t1.Size()); + +Randomize( + /* malloc */ v2.GetMutable<Tensor>().mutable_data<float>(DDim(200,300)), + /* size */ t2.Size()); + +Mult( + /*result*/ vr.GetMutable<Tensor>().mutable_data<float>(SizeOfMult(v1, v2)), + /*input1*/ v1.Get<Tensor>().data<float>(), + /*input2*/ v2.Get<Tensor>().data<float>()); +``` + +We see that a variable holds nothing until `Variable::GetMutable<Tensor>()` allocates a tensor and puts it in the variable. Similarly, a tensor doesn't get its memory until `Tensor::mutable_data<float>()` is called. + +This lazy memory allocation happens inside `Randomize` and `Mult`, the functions that mutate the variable, so it saves us some lines of C++ code. + + +## Implementation: Type Hiding + +To make memory allocation lazy, we cannot assume that we know the type held by a variable at definition time. In other words, `class Variable` cannot be a template `template <typename T> class Variable`. + +Because we don't know the type `T`, we cannot save a `T*` as `Variable`'s data member. Instead, we save an interface object `Placeholder`, which can return the pointer to the saved object via `Placeholder::Ptr()` as `void*`. + +Still, `Variable` needs to know `T` so that it can `delete(ptr)`, and so that `Variable::Get` can check the expected type against the saved object's type. + +We save `T` in `PlaceholderImpl`, the implementation of `Placeholder`. 
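+Before moving on, a small usage sketch may help; it is hypothetical, but it assumes nothing beyond the `Variable` members shown in `variable.h` above:
+
+```cpp
+#include <cassert>
+#include "paddle/framework/variable.h"
+
+int main() {
+  paddle::framework::Variable v;
+
+  // The first GetMutable<T>() decides the held type and allocates lazily.
+  *v.GetMutable<int>() = 42;
+
+  assert(v.IsType<int>());     // typeid(int) matches the saved type_info
+  assert(!v.IsType<float>());  // so Get<float>() would trip PADDLE_ASSERT
+  assert(v.Get<int>() == 42);
+
+  return 0;
+}
+```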
Please be aware that `PlaceholderImpl` is a class template and `T` is passed in as a template parameter. + +Because `PlaceholderImpl` knows `T`, it can save and return `typeid(T)` for the type comparison in `Variable::Get` and `Variable::GetMutable`. + + +## Conclusion + +The type-hiding technique utilizes C++ class templates, interface and derivation, and C++ RTTI (typeid). This combination saves us from defining something like `caffe2::TypeMeta`, which takes hundreds of lines of C++ code. diff --git a/paddle/framework/variable_test.cc b/paddle/framework/variable_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..aea03bcf5719dacc01d2d78b52b33e8a0b29b5e5 --- /dev/null +++ b/paddle/framework/variable_test.cc @@ -0,0 +1,40 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include <memory> +#include <string> + +#include "gtest/gtest.h" +#include "paddle/framework/variable.h" + +TEST(Variable, GetMutable) { + using paddle::framework::Variable; + + struct Tensor { + int content_; + }; + + std::unique_ptr<Variable> v(new Variable()); + + Tensor* t = v->GetMutable<Tensor>(); + t->content_ = 1234; + + const Tensor& tt = v->Get<Tensor>(); + EXPECT_EQ(1234, tt.content_); + + std::string* s = v->GetMutable<std::string>(); + *s = "hello"; + + const std::string& ss = v->Get<std::string>(); + EXPECT_EQ("hello", ss); +} diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 5e170714cf5b183fcf6e76d34746333397e6b060..1c39ced3c9e3da4079a66e29c00be9cc18411b68 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -12,7 +12,7 @@ endif() add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_dependencies(paddle_function ${external_project_dependencies}) -add_dependencies(paddle_function gen_proto_cpp) +add_dependencies(paddle_function paddle_proto) if(WITH_TESTING) if(WITH_GPU) diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 93a6a99848aa13bb36c9c5c7091fbaa891fc9823..0012636b8f618a1b45cfc801c04781e67694956f 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -58,7 +58,7 @@ endif() add_style_check_target(paddle_gserver ${GSERVER_SOURCES}) add_style_check_target(paddle_gserver ${GSERVER_HEADER}) -add_dependencies(paddle_gserver gen_proto_cpp) +add_dependencies(paddle_gserver paddle_proto ${external_project_dependencies}) if(WITH_TESTING) add_subdirectory(tests) endif() diff --git a/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9b825db574cf8bac2cf7b7538d0583a8adc2c158 --- /dev/null +++ b/paddle/gserver/evaluators/DetectionMAPEvaluator.cpp @@ -0,0 +1,308 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Evaluator.h" +#include "paddle/gserver/layers/DetectionUtil.h" + +using std::map; +using std::vector; +using std::pair; +using std::make_pair; + +namespace paddle { + +/** + * @brief Detection mAP Evaluator + * + * The config file api is detection_map_evaluator. + */ +class DetectionMAPEvaluator : public Evaluator { +public: + DetectionMAPEvaluator() + : evaluateDifficult_(false), cpuOutput_(nullptr), cpuLabel_(nullptr) {} + + virtual void start() { + Evaluator::start(); + allTruePos_.clear(); + allFalsePos_.clear(); + numPos_.clear(); + } + + virtual real evalImp(std::vector<Argument>& arguments) { + overlapThreshold_ = config_.overlap_threshold(); + backgroundId_ = config_.background_id(); + evaluateDifficult_ = config_.evaluate_difficult(); + apType_ = config_.ap_type(); + + MatrixPtr detectTmpValue = arguments[0].value; + Matrix::resizeOrCreate(cpuOutput_, + detectTmpValue->getHeight(), + detectTmpValue->getWidth(), + false, + false); + + MatrixPtr labelTmpValue = arguments[1].value; + Matrix::resizeOrCreate(cpuLabel_, + labelTmpValue->getHeight(), + labelTmpValue->getWidth(), + false, + false); + + cpuOutput_->copyFrom(*detectTmpValue); + cpuLabel_->copyFrom(*labelTmpValue); + + Argument label = arguments[1]; + const int* labelIndex = label.sequenceStartPositions->getData(false); + size_t batchSize = label.getNumSequences(); + + vector<map<size_t, vector<NormalizedBBox>>> allGTBBoxes; + vector<map<size_t, vector<pair<real, NormalizedBBox>>>> allDetectBBoxes; + + for (size_t n = 0; n < batchSize; ++n) { + map<size_t, vector<NormalizedBBox>> bboxes; + for (int i = labelIndex[n]; i < labelIndex[n + 1]; ++i) { + vector<NormalizedBBox> bbox; + getBBoxFromLabelData(cpuLabel_->getData() + i * 6, 1, bbox); + int c = cpuLabel_->getData()[i * 6]; + bboxes[c].push_back(bbox[0]); + } + allGTBBoxes.push_back(bboxes); + } + + size_t n = 0; + const real* cpuOutputData = cpuOutput_->getData(); + for (size_t imgId = 0; imgId < batchSize; ++imgId) { + map<size_t, vector<pair<real, NormalizedBBox>>> bboxes; + size_t curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]); + while (curImgId == imgId && n < cpuOutput_->getHeight()) { + vector<real> label; + vector<real> score; + vector<NormalizedBBox> bbox; + getBBoxFromDetectData(cpuOutputData + n * 7, 1, label, score, bbox); + bboxes[label[0]].push_back(make_pair(score[0], bbox[0])); + ++n; + curImgId = static_cast<size_t>((cpuOutputData + n * 7)[0]); + } + allDetectBBoxes.push_back(bboxes); + } + + for (size_t n = 0; n < batchSize; ++n) { + for (map<size_t, vector<NormalizedBBox>>::iterator it = + allGTBBoxes[n].begin(); + it != allGTBBoxes[n].end(); + ++it) { + size_t count = 0; + if (evaluateDifficult_) { + count = it->second.size(); + } else { + for (size_t i = 0; i < it->second.size(); ++i) + if (!(it->second[i].isDifficult)) ++count; + } + if (numPos_.find(it->first) == numPos_.end() && count != 0) { + numPos_[it->first] = count; + } else { + numPos_[it->first] += count; + } + } + } + + // calcTFPos + calcTFPos(batchSize, allGTBBoxes, allDetectBBoxes); + + return 0; + } + + virtual void printStats(std::ostream& os) const { + real mAP = calcMAP(); + os << "Detection mAP=" << mAP; + } + + virtual void distributeEval(ParameterClient2* client) { + LOG(FATAL) << "Distribute detection evaluation not implemented."; + } + +protected: + void calcTFPos(const size_t batchSize, + const vector<map<size_t, vector<NormalizedBBox>>>& allGTBBoxes, + const vector<map<size_t, vector<pair<real, NormalizedBBox>>>>& + allDetectBBoxes) { + for (size_t n = 0; n < allDetectBBoxes.size(); ++n) { + if (allGTBBoxes[n].size() == 0) { + for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + for (size_t i = 0; i < it->second.size(); ++i) { + allTruePos_[label].push_back(make_pair(it->second[i].first, 0)); + allFalsePos_[label].push_back(make_pair(it->second[i].first, 1)); + } + } + } else { + for (map<size_t, vector<pair<real, NormalizedBBox>>>::const_iterator + it = allDetectBBoxes[n].begin(); + it != allDetectBBoxes[n].end(); + ++it) { + size_t label = it->first; + vector<pair<real, NormalizedBBox>> predBBoxes = it->second; + if (allGTBBoxes[n].find(label) == allGTBBoxes[n].end()) { + for (size_t i = 0; i < predBBoxes.size(); ++i) { + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back(make_pair(predBBoxes[i].first, 1)); + } + } else { + vector<NormalizedBBox> gtBBoxes = + allGTBBoxes[n].find(label)->second; + vector<bool> visited(gtBBoxes.size(), false); + // Sort detections in descending order based on scores + std::sort(predBBoxes.begin(), + predBBoxes.end(), + sortScorePairDescend<NormalizedBBox>); + for (size_t i = 0; i < predBBoxes.size(); ++i) { + real maxOverlap = -1.0; + size_t maxIdx = 0; + for (size_t j = 0; j < gtBBoxes.size(); ++j) { + real overlap = + jaccardOverlap(predBBoxes[i].second, gtBBoxes[j]); + if (overlap > maxOverlap) { + maxOverlap = overlap; + maxIdx = j; + } + } + if (maxOverlap > overlapThreshold_) { + if (evaluateDifficult_ || + (!evaluateDifficult_ && !gtBBoxes[maxIdx].isDifficult)) { + if (!visited[maxIdx]) { + allTruePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 0)); + visited[maxIdx] = true; + } else { + allTruePos_[label].push_back( + make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } else { + allTruePos_[label].push_back(make_pair(predBBoxes[i].first, 0)); + allFalsePos_[label].push_back( + make_pair(predBBoxes[i].first, 1)); + } + } + } + } + } + } + } + + real calcMAP() const { + real mAP = 0.0; + size_t count = 0; + for (map<size_t, size_t>::const_iterator it = numPos_.begin(); + it != numPos_.end(); + ++it) { + size_t label = it->first; + size_t labelNumPos = it->second; + if (labelNumPos == 0 || allTruePos_.find(label) == allTruePos_.end()) + continue; + vector<pair<real, size_t>> labelTruePos = allTruePos_.find(label)->second; + vector<pair<real, size_t>> labelFalsePos = + allFalsePos_.find(label)->second; + // Compute average precision. + vector<size_t> tpCumSum; + getAccumulation(labelTruePos, &tpCumSum); + vector<size_t> fpCumSum; + getAccumulation(labelFalsePos, &fpCumSum); + std::vector<real> precision, recall; + size_t num = tpCumSum.size(); + // Compute Precision. + for (size_t i = 0; i < num; ++i) { + CHECK_LE(tpCumSum[i], labelNumPos); + precision.push_back(static_cast<real>(tpCumSum[i]) / + static_cast<real>(tpCumSum[i] + fpCumSum[i])); + recall.push_back(static_cast<real>(tpCumSum[i]) / labelNumPos); + } + // VOC2007 style + if (apType_ == "11point") { + vector<real> maxPrecisions(11, 0.0); + int startIdx = num - 1; + for (int j = 10; j >= 0; --j) + for (int i = startIdx; i >= 0; --i) { + if (recall[i] < j / 10.) { + startIdx = i; + if (j > 0) maxPrecisions[j - 1] = maxPrecisions[j]; + break; + } else { + if (maxPrecisions[j] < precision[i]) + maxPrecisions[j] = precision[i]; + } + } + for (int j = 10; j >= 0; --j) mAP += maxPrecisions[j] / 11; + ++count; + } else if (apType_ == "Integral") { + // Natural integral + real averagePrecisions = 0.; + real prevRecall = 0.; + for (size_t i = 0; i < num; ++i) { + if (fabs(recall[i] - prevRecall) > 1e-6) + averagePrecisions += precision[i] * fabs(recall[i] - prevRecall); + prevRecall = recall[i]; + } + mAP += averagePrecisions; + ++count; + } else { + LOG(FATAL) << "Unknown ap version: " << apType_; + } + } + if (count != 0) mAP /= count; + return mAP * 100; + } + + void getAccumulation(vector<pair<real, size_t>> inPairs, + vector<size_t>* accuVec) const { + std::stable_sort( + inPairs.begin(), inPairs.end(), sortScorePairDescend<size_t>); + accuVec->clear(); + size_t sum = 0; + for (size_t i = 0; i < inPairs.size(); ++i) { + sum += inPairs[i].second; + accuVec->push_back(sum); + } + } + + std::string getTypeImpl() const { return "detection_map"; } + + real getValueImpl() const { return calcMAP(); } + +private: + real overlapThreshold_; // overlap threshold for determining a match + bool evaluateDifficult_; // whether to evaluate difficult ground truth + size_t backgroundId_; // class index of background + std::string apType_; // how to calculate mAP (Integral or 11point) + + MatrixPtr cpuOutput_; + MatrixPtr cpuLabel_; + + map<size_t, size_t> numPos_; // counts of true objects in each class + map<size_t, vector<pair<real, size_t>>> + allTruePos_; // true positive predictions + map<size_t, vector<pair<real, size_t>>> + allFalsePos_; // false positive predictions +}; + +REGISTER_EVALUATOR(detection_map, DetectionMAPEvaluator); + +} // namespace paddle diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 3159026e6b92355ba7480b09535388c969a504e2..018da6c76dc27a74b074ec52c18347beba8164fc 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, outArgStream_ = HPPL_STREAM_1; + start(); +} + +void MultiGradientMachine::start() { for (auto& thread : threads_) { thread->start(); } } + +void MultiGradientMachine::finish() { + for (auto& thread : threads_) { + thread->stop(); + } +} + std::vector<const std::vector<ParameterPtr>*> MultiGradientMachine::getSlaveParameters() { std::vector<const std::vector<ParameterPtr>*> vec; @@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() { } } -void MultiGradientMachine::finish() { - for (auto& thread : threads_) { - thread->stop(); - } -} - Evaluator* MultiGradientMachine::makeEvaluator() const { return threads_[0]->getGradientMachine()->makeEvaluator(); } @@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config, gradStream_ = HPPL_STREAM_2; valueStream_ = HPPL_STREAM_3; - stopping_ = false; + stopping_ = true; updateCounter_ = 0; parameterUpdated_ = false; } @@ -453,6 +457,10 @@ TrainerThread::~TrainerThread() { stop(); } void TrainerThread::start() { + if (!stopping_) return; + + stopping_ = false; + gradientMachine_->start(); computeThread_.reset(new std::thread([this]() { computeThread(); })); @@ -593,7 +601,7 @@ void TrainerThread::backward() { void TrainerThread::backwardCallback(Parameter* para) { // CPU parameters are merged in the end - if (!para->useGpu()) return; + if (!para->useGpu() || para->isStatic()) return; int paramId = para->getID(); if 
(multiMachine_->getNumThreads() == 1) { diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h index 70203bbb97fe79d72fbc6bd2b5d427cb1de7b61f..5e7622f929fd57de6e38855528a752b5586c4cd1 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.h +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h @@ -176,6 +176,10 @@ public: explicit MultiGradientMachine(const ModelConfig& config, bool useGpu); + virtual void start(); + + virtual void finish(); + virtual void prefetch(const std::vector& inArgs); virtual void forward(const std::vector& inArgs, @@ -193,8 +197,6 @@ public: virtual void onPassEnd(); - virtual void finish(); - virtual Evaluator* makeEvaluator() const; virtual void eval(Evaluator* evaluator) const; diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 4512aacc81f86bf87fc9ea30adcf081327663f16..2e839f640503b8f4e390fc87d9d59960dbc37f6e 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -241,11 +241,14 @@ void NeuralNetwork::forward(const std::vector& inArgs, dataLayers_[i]->setData(inArgs[i]); } + gLayerStackTrace.set_stage(true); + { for (auto& layer : layers_) { REGISTER_TIMER_INFO("ForwardTimer", layer->getName().c_str()); gLayerStackTrace.push(layer->getName()); layer->forward(passType); + gLayerStackTrace.pop(layer->getName()); } } @@ -254,9 +257,6 @@ void NeuralNetwork::forward(const std::vector& inArgs, for (auto& layer : outputLayers_) { outArgs->push_back(layer->getOutput()); } - if (passType == PASS_TEST) { - gLayerStackTrace.clear(); - } } void NeuralNetwork::resetState() { @@ -283,9 +283,10 @@ void NeuralNetwork::getState(MachineState& machineState) { } void NeuralNetwork::backward(const UpdateCallback& callback) { - gLayerStackTrace.pop(""); // tell layer trace is during backward. + gLayerStackTrace.set_stage(false); FOR_EACH_R(layer, layers_) { REGISTER_TIMER_INFO("BackwardTimer", (*layer)->getName().c_str()); + gLayerStackTrace.push((*layer)->getName()); if ((*layer)->needGradient()) { (*layer)->backward(callback); } @@ -308,35 +309,35 @@ public: void addEvaluator(std::unique_ptr&& evaluator) { evaluators_.emplace_back(std::move(evaluator)); } - virtual void start() { + void start() override { for (auto& evaluator : evaluators_) { evaluator->start(); } } - virtual void finish() { + void finish() override { for (auto& evaluator : evaluators_) { evaluator->finish(); } } - virtual void eval(const NeuralNetwork& nn) { + void eval(const NeuralNetwork& nn) override { for (auto& evaluator : evaluators_) { evaluator->eval(nn); } } - virtual real evalImp(std::vector& arguments) { + real evalImp(std::vector& arguments) override { (void)arguments; return -1; } - virtual void printStats(std::ostream& os) const { + void printStats(std::ostream& os) const override { for (auto& evaluator : evaluators_) { evaluator->printStats(os); os << ' '; } } - virtual void distributeEval(ParameterClient2* client) { + void distributeEval(ParameterClient2* client) override { for (auto& evaluator : evaluators_) { evaluator->distributeEval(client); } @@ -351,7 +352,7 @@ public: * @brief getNames will return all inside evaluators' names. * @param names [out]: return names. 
*/ - void getNames(std::vector* names) { + void getNames(std::vector* names) override { for (auto& eval : evaluators_) { eval->getNames(names); } @@ -360,7 +361,7 @@ public: /** * @brief getValue could get all inside evaluators' value. */ - real getValue(const std::string& name, Error* err) const { + real getValue(const std::string& name, Error* err) const override { return this->getMethodHelper( name, err, [&name, err](const std::unique_ptr& eval) { return eval->getValue(name, err); @@ -370,7 +371,7 @@ public: /** * @brief getType could get all inside evaluators' type. */ - std::string getType(const std::string& name, Error* err) const { + std::string getType(const std::string& name, Error* err) const override { return this->getMethodHelper( name, err, [&name, err](const std::unique_ptr& eval) { return eval->getType(name, err); @@ -395,6 +396,30 @@ private: } }; +class SubnetEvaluator : public CombinedEvaluator { +public: + SubnetEvaluator(const std::string& layerName, + std::unique_ptr&& evaluator) + : layerName_(layerName) { + addEvaluator(std::move(evaluator)); + } + virtual void eval(const NeuralNetwork& nn) override { + const LayerPtr& layer = nn.getLayer(layerName_); + CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel " + << nn.getName(); + bool accessed = false; + layer->accessSubNetwork([this, &accessed](NeuralNetwork& subnet) { + subnet.eval(evaluators_[0].get()); + accessed = true; + }); + CHECK(accessed) << "There is no subnetwork for layer " << layerName_ + << " in submodel " << nn.getName(); + } + +protected: + std::string layerName_; +}; + Evaluator* NeuralNetwork::makeEvaluator() const { CombinedEvaluator* combinedEvaluator = new CombinedEvaluator(); auto subModelConfig = std::find_if(config_.sub_models().begin(), @@ -421,6 +446,15 @@ Evaluator* NeuralNetwork::makeEvaluator() const { combinedEvaluator->addEvaluator(std::move(evaluator)); } } + for (auto& layer : layers_) { + layer->accessSubNetwork( + [layer, combinedEvaluator](NeuralNetwork& subnet) { + std::unique_ptr subEvaluator(new SubnetEvaluator( + layer->getName(), + std::unique_ptr(subnet.makeEvaluator()))); + combinedEvaluator->addEvaluator(std::move(subEvaluator)); + }); + } } else { for (const EvaluatorConfig& evalConfig : config_.evaluators()) { std::unique_ptr evaluator(Evaluator::create(evalConfig)); diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index e7b6c438407e7eab6eab1f6ed496f35caa9f2177..12810f642519b7965fc1b7d751290445e3350dd5 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -129,6 +129,8 @@ public: static NeuralNetwork* newNeuralNetwork(const std::string& name = "", NeuralNetwork* rootNetwork = nullptr); + const std::string& getName() const { return subModelName_; } + protected: /** * The constructor of NeuralNetwork. 
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 3e930380226bce58cc90704b4c4cfa36e9f70968..9a972466d66ba1417b2c31e66dc375b3da229aa8 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -208,6 +208,7 @@ void RecurrentGradientMachine::init( }); CHECK(subModelConfig != config.sub_models().end()); reversed_ = subModelConfig->reversed(); + generating_ = subModelConfig->has_generator(); inFrameLines_.resize(subModelConfig->in_links_size()); for (size_t i = 0; i < inFrameLines_.size(); ++i) { @@ -287,10 +288,6 @@ void RecurrentGradientMachine::init( parameterIds_.push_back(para->getID()); } } - - if (subModelConfig->evaluator_names_size() > 0) { - evaluator_.reset(frames_[0]->makeEvaluator()); - } } void RecurrentGradientMachine::resizeOrCreateFrames(int numFrames) { @@ -538,7 +535,7 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, The outputs are outFramesLines_[i].agentLayer */ - if (inFrameLines_.empty() && passType == PASS_TEST) { + if (generating_) { generateSequence(); return; } // else forward.. @@ -561,14 +558,14 @@ void RecurrentGradientMachine::forward(const std::vector& inArgs, std::vector outArgs; frames_[i]->forward(inArgs, &outArgs, passType); } - if (evaluator_ && passType == PASS_TEST) { - this->eval(evaluator_.get()); - } reorganizeOutput(passType); } void RecurrentGradientMachine::backward(const UpdateCallback& callback) { + if (generating_) { + return; + } REGISTER_TIMER_INFO("RecurrentBwTime", "RecurrentBwTime"); AsyncGpuBlock asyncGpuBlock; for (int i = maxSequenceLength_ - 1; i >= 0; --i) { @@ -577,11 +574,6 @@ void RecurrentGradientMachine::backward(const UpdateCallback& callback) { for (auto& memoryFrameLine : memoryFrameLines_) { memoryFrameLine.bootLayer->backward(nullptr); } - - // call printers here so the gradient can be printed - if (evaluator_) { - this->eval(evaluator_.get()); - } } void RecurrentGradientMachine::forwardBackward( @@ -595,9 +587,9 @@ void RecurrentGradientMachine::forwardBackward( void RecurrentGradientMachine::eval(Evaluator* evaluator) const { // call printers frame by frame for (int i = 0; i < maxSequenceLength_; ++i) { - LOG(INFO) << "Recurrent Layer Group eval frame " << i << " begin"; + VLOG(2) << "Recurrent Layer Group eval frame " << i << " begin"; evaluator->eval(*(frames_[i].get())); - LOG(INFO) << "Recurrent Layer Group eval frame " << i << " end"; + VLOG(2) << "Recurrent Layer Group eval frame " << i << " end"; } } @@ -1093,10 +1085,6 @@ void RecurrentGradientMachine::oneWaySearch(size_t batchSize) { copyDataOutlinkFrame(machineCur); - // call value printer - if (evaluator_) { - evaluator_->eval(*(frames_[machineCur].get())); - } // check eos const IVectorPtr& eosVec = eosFrameLine_->layers[machineCur]->getOutput().ids; @@ -1321,11 +1309,10 @@ void RecurrentGradientMachine::fillGenOutputs() { batchMachineIdVec_.clear(); generator_.ids.clear(); + int* starts = generator_.outArg.sequenceStartPositions->getMutableData(false); + starts[0] = 0; if (numResults > 1) { real* probs = generator_.outArg.in->getData(); - int* starts = - generator_.outArg.sequenceStartPositions->getMutableData(false); - starts[0] = 0; for (size_t i = 0; i < finalPaths_.size(); ++i) { for (size_t j = 0; j < finalPaths_[i].size(); ++j) { Path& path = finalPaths_[i][j]; @@ -1348,7 +1335,10 @@ void RecurrentGradientMachine::fillGenOutputs() { } else { for (size_t i = 0; 
i < finalPaths_.size(); ++i) { CHECK(!finalPaths_[i].empty()); - generator_.ids = finalPaths_[i][0].ids; + generator_.ids.insert(generator_.ids.begin(), + finalPaths_[i][0].ids.begin(), + finalPaths_[i][0].ids.end()); + starts[i + 1] = starts[i] + finalPaths_[i][0].ids.size(); } } } diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h index 8d94d7e2df216c4657d759c16dd6b1f2848996e0..f245620cf668bb341df99cf498105cbd996a6b24 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h @@ -414,6 +414,7 @@ protected: std::vector ids; // store generated sequences Argument outArg; // final output argument }; + bool generating_; Generator generator_; std::vector> frames_; @@ -428,8 +429,6 @@ protected: std::vector parameterIds_; // parameters actually used by this Layer Group - std::unique_ptr evaluator_; // frame printers in this layer group - // store final argument of outFrameLines_ std::vector dataArgs_; // store each frame's output argument of outFrameLines_ diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp index 31463823b3fc04cc24068d95887a9d3ed25a6168..15e7411b5fde0fa3a532394cf7d0e8477ef052d0 100644 --- a/paddle/gserver/layers/AgentLayer.cpp +++ b/paddle/gserver/layers/AgentLayer.cpp @@ -109,6 +109,40 @@ void GatherAgentLayer::forwardValue(PassType passType) { } } +namespace { + +// dest[index[i]] <- src[i] for each i +void copyElements(const IVector& srcVec, + const IVector& indexVec, + IVector& destVec) { + const int* src = srcVec.getData(); + const int* index = indexVec.getData(); + int* dest = destVec.getData(); + int len = indexVec.getSize(); + CHECK_EQ(srcVec.getSize(), indexVec.getSize()); + for (int i = 0; i < len; ++i) { + dest[index[i]] = src[i]; + } +} +} + +void GatherAgentLayer::forwardIds(PassType passType) { + IVectorPtr realId = realLayers_[0]->getOutputLabel(); + if (!realId) return; + + IVector::resizeOrCreate(output_.ids, allIds_->getSize(), useGpu_); + IVectorPtr outId = output_.ids; + idsVec_.resize(idIndex_.size()); + + for (size_t i = 0; i < realLayers_.size(); ++i) { + const IVectorPtr& realId = realLayers_[i]->getOutputLabel(); + idsVec_[i] = IVector::create(allIds_->getData() + idIndex_[i], + /* size */ realId->getSize(), + useGpu_); + execViaCpu(©Elements, *realId, *idsVec_[i], *outId); + } +} + void GatherAgentLayer::backward(const UpdateCallback& callback) { (void)callback; const MatrixPtr& outputGrad = getOutputGrad(); @@ -136,23 +170,22 @@ void ScatterAgentLayer::forward(PassType passType) { CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); int width = this->getSize(); - if (realOutArg_.hasSeq()) { - forwardSequence(passType); - } else if (realOutArg_.value || realOutArg_.ids) { - output_.subArgFrom( - realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); - } else { // used in generation - if (realLayer_->getOutput().ids) { - IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); - output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); - } - if (realLayer_->getOutput().value) { - int height = ids_->getSize(); - resetOutput(height, width); - - const MatrixPtr& outV = getOutputValue(); - const MatrixPtr& realV = realLayer_->getOutputValue(); - outV->selectRows(*realV, *ids_); + if (selectionMode_) { + forwardWithSelection(passType); + } else { + if (realOutArg_.hasSeq()) { + output_.subArgFrom(realOutArg_, + /* offset */ idIndex_, + 
idSize_, + width, + useGpu_, + /* trans */ false, + /* seqFlag */ true, + /* seqStart */ seqStartPosIndex_, + /* seqSize */ numSequences_); + } else { + output_.subArgFrom( + realOutArg_, /* offset */ idIndex_, idSize_, width, useGpu_); } } } @@ -160,6 +193,8 @@ void ScatterAgentLayer::forward(PassType passType) { void ScatterAgentLayer::backward(const UpdateCallback& callback) { (void)callback; + CHECK(!selectionMode_); + const MatrixPtr& outputGrad = realOutArg_.grad; const MatrixPtr& realGrad = realLayer_->getOutputGrad(); if (realGrad) { @@ -174,42 +209,7 @@ void ScatterAgentLayer::backward(const UpdateCallback& callback) { REGISTER_LAYER(gather_agent, GatherAgentLayer); REGISTER_LAYER(scatter_agent, ScatterAgentLayer); -void GatherAgentLayer::forwardIds(PassType passType) { - int height = 0; - IVectorPtr idReal = realLayers_[0]->getOutputLabel(); - - if (!idReal) return; - - if (output_.subSequenceStartPositions) { - int* starts = output_.subSequenceStartPositions->getMutableData(false); - // Gather generator.idsVec - // if is beam search generation result. Get first result. - if (idReal->getData()[idReal->getSize() - 1] == -1) { - for (size_t i = 0; i < realLayers_.size(); ++i) { - // The first element stores first result size - idReal = realLayers_[i]->getOutputLabel(); - idReal->subVecFrom(*idReal, 1, idReal->getData()[0]); - } - } - for (size_t i = 0; i < realLayers_.size(); ++i) { - CHECK(realLayers_[i]->getOutputLabel()); - starts[i] = height; - height += realLayers_[i]->getOutputLabel()->getSize(); - } - starts[realLayers_.size()] = height; - output_.sequenceStartPositions->getMutableData(false)[1] = height; - - IVector::resizeOrCreate(output_.ids, height, false); - for (size_t i = 0; i < realLayers_.size(); ++i) { - output_.ids->subVec(starts[i], starts[i + 1] - starts[i]) - ->copyFrom(*realLayers_[i]->getOutputLabel()); - } - } else { - LOG(FATAL) << "Not implemented"; - } -} - -void ScatterAgentLayer::forwardSequence(PassType passType) { +void ScatterAgentLayer::forwardWithSelection(PassType passType) { Layer::forward(passType); CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId()); @@ -220,17 +220,19 @@ void ScatterAgentLayer::forwardSequence(PassType passType) { AsyncGpuBlock asyncGpuBlock; REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str()); - if (realOutArg_.value || realOutArg_.ids) { - CHECK(realOutArg_.sequenceStartPositions); - output_.subArgFrom(realOutArg_, - /* offset */ idIndex_, - idSize_, - width, - useGpu_, - /* trans */ false, - /* seqFlag */ true, - /* seqStart */ seqStartPosIndex_, - /* seqSize */ numSequences_); + if (!input.hasSeq()) { + if (realLayer_->getOutput().ids) { + IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_); + output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_); + } + if (realLayer_->getOutput().value) { + int height = ids_->getSize(); + resetOutput(height, width); + + const MatrixPtr& outV = getOutputValue(); + const MatrixPtr& realV = realLayer_->getOutputValue(); + outV->selectRows(*realV, *ids_); + } } else { // Putting the generation logic here is really an ugly hack! // used in generation diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h index 461b84b17e556b53e0734bff8e37a0d529a3290e..29681b29c6a9a10715548839f2d365eb4a0c7381 100644 --- a/paddle/gserver/layers/AgentLayer.h +++ b/paddle/gserver/layers/AgentLayer.h @@ -110,6 +110,9 @@ protected: // of real layer. 
ICpuGpuVectorPtr inputStartPos_; + // true for setRealLayer, false for setRealLayerAndOutput + bool selectionMode_; + public: explicit ScatterAgentLayer(const LayerConfig& config) : Layer(config) {} @@ -137,6 +140,7 @@ public: } else { cpuIds_ = ids_; } + selectionMode_ = true; } // set real layer and output, [idIndex, idIndex + idSize) of *ids* @@ -153,6 +157,7 @@ public: idIndex_ = idIndex; idSize_ = idSize; handleBackward_ = handleBackward; + selectionMode_ = false; } void setSequenceStartPositions(const ICpuGpuVectorPtr& sequenceStartPositions, @@ -166,7 +171,7 @@ public: void forward(PassType passType) override; void backward(const UpdateCallback& callback) override; - void forwardSequence(PassType passType); + void forwardWithSelection(PassType passType); }; } // namespace paddle diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index 125aaf947f3c9d976b117667d1d1b7700a029cc6..4b92b5d163ad107c0783beae45f8c936112fcccf 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -191,6 +191,11 @@ void Layer::addOutputArgument(int deviceId) { void Layer::copyOutputToOtherDevice() { for (size_t i = 0; i != outputOtherDevice_.size(); i++) { SetDevice device(outputOtherDevice_[i].deviceId); + // If outputOtherDevice_[i].value is a CpuMatrix, + // the copyFrom is a synchronous interface. + // If outputOtherDevice_[i].value is a GpuMatrix, since subsequent + // calculations are all on HPPL_STREAM_DEFAULT, + // copyFrom can be an asynchronous interface. outputOtherDevice_[i].value->copyFrom(*getOutputValue(), HPPL_STREAM_DEFAULT); outputOtherDevice_[i].sequenceStartPositions = diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 4f5fdbb37ce024e18b8d39c5dda74c69bf82166a..93996392d221d531f65caf465decbffdbc2d0384 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -138,6 +138,23 @@ void testEvaluatorAll(TestConfig testConf, testEvaluator(testConf, testEvaluatorName, batchSize, false); } +TEST(Evaluator, detection_map) { + TestConfig config; + config.evaluatorConfig.set_type("detection_map"); + config.evaluatorConfig.set_overlap_threshold(0.5); + config.evaluatorConfig.set_background_id(0); + config.evaluatorConfig.set_ap_type("Integral"); + config.evaluatorConfig.set_evaluate_difficult(0); + + config.inputDefs.push_back({INPUT_DATA, "output", 7}); + config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "label", 6}); + config.evaluatorConfig.set_evaluate_difficult(false); + testEvaluatorAll(config, "detection_map", 100); + + config.evaluatorConfig.set_evaluate_difficult(true); + testEvaluatorAll(config, "detection_map", 100); +} + TEST(Evaluator, classification_error) { TestConfig config; config.evaluatorConfig.set_type("classification_error"); diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt index f5657c4690ca71200346efd4e2c5244c02c92eb1..9981de61606bda6baac103592125b929d4c12a3d 100644 --- a/paddle/math/CMakeLists.txt +++ b/paddle/math/CMakeLists.txt @@ -33,7 +33,7 @@ endif() add_style_check_target(paddle_math ${MATH_SOURCES}) add_style_check_target(paddle_math ${MATH_HEADERS}) -add_dependencies(paddle_math gen_proto_cpp) # depends +add_dependencies(paddle_math paddle_proto ${external_project_dependencies}) # depends if(WITH_TESTING) add_subdirectory(tests) endif() diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index c910146164ebfb0737583c72c48ce6dbc5b49939..4431d613f655c1d0c8da13bb5ac9225980c650ad 100644 --- 
a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1565,6 +1565,8 @@ void CpuMatrix::copyFrom(const Matrix& src, hl_stream_t stream) { const_cast<real*>(src.getData()), sizeof(real) * elementCnt_, stream); + // There is a need to add synchronization to ensure that the data is copied. + hl_stream_synchronize(stream); } else if (typeid(src) == typeid(CpuMatrix)) { memcpy(data_, src.getData(), sizeof(real) * elementCnt_); } else { diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 748be850b4c902d1b48c1dafbb0d5ea2bf197e6e..7dfd593225065e18830b2b0c0ce854fe7a2d5178 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -239,7 +239,8 @@ public: LOG(FATAL) << "Not implemented"; } - // asynchronous copy + // For GpuMatrix this is an asynchronous copy interface + // For CpuMatrix this is a synchronous copy interface virtual void copyFrom(const Matrix& src, hl_stream_t stream) { LOG(FATAL) << "Not implemented"; } diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp index c519ca500afb1dbfdff6e8d211786f4e18ccf1fd..eb87ee9bb7936d27c0c32a1a4b35ff49871c0a10 100644 --- a/paddle/math/Vector.cpp +++ b/paddle/math/Vector.cpp @@ -657,6 +657,8 @@ void CpuVectorT<T>::copyFrom(const VectorT<T>& src, hl_stream_t stream) { (void*)src.getData(), sizeof(T) * this->getSize(), stream); + // There is a need to add synchronization to ensure that the data is copied. + hl_stream_synchronize(stream); } else { src.copyTo(this); } diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h index 9af6e30c9e13895ad95653a787ec1c1ad77a248f..80b9775fccf10c57bb48145ef56165ec7c86d8b8 100644 --- a/paddle/math/Vector.h +++ b/paddle/math/Vector.h @@ -168,11 +168,11 @@ public: virtual void copyFrom(const VectorT& src) = 0; /** - * If use_gpu, this function will push the copy-task to the specifed-stream - * and return immediately. + * If GpuVector, this function is an asynchronous interface, + * will push the copy-task to the specified-stream and return immediately. * - * If not use GPU, this function is same as - * the copyFrom(const VectorT& src), which use stream HPPL_STREAM_DEFAULT. + * If CpuVector, this function is a synchronous interface, + * same as the copyFrom(const VectorT& src). */ virtual void copyFrom(const VectorT& src, hl_stream_t stream) = 0;
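The comments above describe a stream contract that is easy to get wrong, so here is a minimal sketch of the pattern this diff introduces (the buffer variables are assumed to exist; `hl_memcpy_async` and `hl_stream_synchronize` are the HPPL calls used in the diff):

```cpp
// GPU -> CPU copy: the enqueue is asynchronous, so the host must synchronize
// on the stream before it is allowed to read the destination buffer.
hl_stream_t stream = HPPL_STREAM_DEFAULT;
hl_memcpy_async(dst_cpu_data, src_gpu_data, num_bytes, stream);  // returns immediately
hl_stream_synchronize(stream);  // block until the copy has landed in CPU memory
// dst_cpu_data is now safe to read from host code.
```

This is exactly what `CpuMatrix::copyFrom` and `CpuVectorT<T>::copyFrom` now do, and what the `TEST(CpuMatrix, copyFrom)` added below verifies.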
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 5a0dffe086c4e265d17c79dba435b66c0873e3c7..354f58df39365410ff9aec2576c768e58db9e0d2 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -1127,4 +1127,18 @@ TEST(Matrix, MaxOutFwdBwd) { } } +TEST(CpuMatrix, copyFrom) { + const size_t height = 1000; + const size_t width = 1000; + CpuMatrix cpu(height, width); + GpuMatrix gpu(height, width); + CpuMatrix copy(height, width); + + cpu.randomizeUniform(); + gpu.copyFrom(cpu); + copy.copyFrom(gpu, HPPL_STREAM_DEFAULT); + + TensorCheckEqual(cpu, copy); +} + #endif diff --git a/paddle/memory/.clang-format b/paddle/memory/.clang-format new file mode 100644 index 0000000000000000000000000000000000000000..29282dc87e2c499988c17d90d47d44cd5cf7f115 --- /dev/null +++ b/paddle/memory/.clang-format @@ -0,0 +1,5 @@ +--- +Language: Cpp +BasedOnStyle: Google +Standard: Cpp11 +... diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..3943c3cfad31d13a00645aba6fc153d3d13da987 --- /dev/null +++ b/paddle/memory/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(detail) diff --git a/paddle/memory/README.md b/paddle/memory/README.md new file mode 100644 index 0000000000000000000000000000000000000000..96a331a486f57d3e030408fee182199bad5b38c2 --- /dev/null +++ b/paddle/memory/README.md @@ -0,0 +1,140 @@ +## Design + +### Usage + +To allocate 4KB CPU memory: + +```cpp +p = memory::Alloc(platform::CPUPlace(), 4*1024); +``` + +To allocate 4KB memory on the 3rd GPU: + +```cpp +p = memory::Alloc(platform::GPUPlace(2), 4*1024); +``` + +To free the memory and check the amount of memory used so far on a place: + +```cpp +auto pl = platform::GPUPlace(0); +p = memory::Alloc(pl, 4*1024); +cout << memory::Used(pl); +memory::Free(pl, p); +``` + +### API + +In `paddle/memory/memory.h` we have: + +```cpp +namespace memory { +template <typename Place> void* Alloc(Place, size_t); +template <typename Place> void Free(Place, void*); +template <typename Place> size_t Used(Place); +} // namespace memory +``` + +These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`: + +```cpp +template<> +void* Alloc<CPUPlace>(CPUPlace p, size_t size) { + return GetCPUBuddyAllocator()->Alloc(size); +} +``` + +and + +```cpp +template<> +void* Alloc<GPUPlace>(GPUPlace p, size_t size) { + return GetGPUBuddyAllocator(p.id)->Alloc(size); +} +``` + +Similar specializations exist for `Free` and `Used`.
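A sketch of what those remaining specializations could look like, mirroring the `Alloc` pattern above (illustrative only; `Free` and `Used` are the `BuddyAllocator` methods declared in `buddy_allocator.h` below):

```cpp
template<>
void Free<CPUPlace>(CPUPlace p, void* ptr) {
  GetCPUBuddyAllocator()->Free(ptr);
}

template<>
size_t Used<CPUPlace>(CPUPlace p) {
  return GetCPUBuddyAllocator()->Used();
}
```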
+ +### Implementation + +`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons. + +```cpp +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = NULL; + if (a == NULL) { + a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...); + } + return a; +} + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + as = new BuddyAllocator*[platform::NumGPUs()]; + for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) { + as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...); + } + } + return as[gpu_id]; +} +``` + +#### `BuddyAllocator` + +`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related to the algorithm: + +```cpp +BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) { + ... +} +``` + +Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on a 32-byte boundary, which can hold a `BuddyAllocator::Block` object: + +```cpp +class BuddyAllocator { + private: + struct Block { + size_t size; + Block *left, *right; + size_t index; // allocator id + }; + ... +}; +``` + +Because BuddyAllocator has the meta-data of each block, it can track the amount of memory in use -- the total returned by `Alloc` and not yet released by `Free`. In contrast, `CPUAllocator` and `GPUAllocator` don't know the size of a freed memory block and cannot do this bookkeeping. + +#### System Allocators + +The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`. + +## Justification + +I got inspiration from Majel and Caffe2, though the above design looks different from both. + +### Caffe2 + +In Caffe2, `Tensor::mutable_data()` allocates the memory. +In particular, [`Tensor::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479). + +There are two implementations of `Context`: + +1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory. + +1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory. + +### Majel + +In Majel, there are basically two allocator types: + +1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`. +1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`. + +However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces. + +In Majel there are hidden global variables like: + +1. `cpu::SystemAllocator g_cpu_allocator`, and +1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`. + +Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
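A hypothetical sketch of that fallback path in terms of this PR's classes (`FindFreeBlock` and `AddPool` are invented helper names, not part of the PR; `SystemAllocator::Alloc` plays the role of Majel's `New`):

```cpp
// Illustrative only: how a buddy allocator can grow through its fallback.
void* BuddyAllocator::Alloc(size_t size) {
  if (void* block = FindFreeBlock(size)) return block;  // serve from the pool
  // Pool exhausted: ask the fallback (system) allocator for a fresh pool,
  void* pool = system_allocator_->Alloc(pool_size_);
  AddPool(pool, pool_size_);   // index the new pool,
  return FindFreeBlock(size);  // and retry the request.
}
```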
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..72d3749ad789eca9a4b10944131171c0cf8dfe5a --- /dev/null +++ b/paddle/memory/detail/CMakeLists.txt @@ -0,0 +1,7 @@ +if(${WITH_GPU}) + nv_library(system_allocator SRCS system_allocator.cc DEPS gflags) + nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) +else(${WITH_GPU}) + cc_library(system_allocator SRCS system_allocator.cc DEPS gflags) + cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags) +endif(${WITH_GPU}) diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..ebe680f5eea4948339fb8c5584a5b9f5d71c752e --- /dev/null +++ b/paddle/memory/detail/buddy_allocator.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/memory/detail/buddy_allocator.h" + +namespace paddle { +namespace memory { +namespace detail { + +BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools, + SystemAllocator* system_allocator) + : pool_size_(pool_size), + max_num_pools_(max_pools), + system_allocator_(system_allocator) { + PADDLE_ASSERT(pool_size > 0); + PADDLE_ASSERT(max_pools > 0); + PADDLE_ASSERT(system_allocator != nullptr); +} + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/buddy_allocator.h b/paddle/memory/detail/buddy_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..82e6aaedc719966b4074449ce1ef7193c73dc265 --- /dev/null +++ b/paddle/memory/detail/buddy_allocator.h @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/memory/detail/system_allocator.h" + +#include <mutex> +#include <vector> + +namespace paddle { +namespace memory { +namespace detail { + +class BuddyAllocator { + public: + BuddyAllocator(size_t pool_size, size_t max_pools, + SystemAllocator* system_allocator); + ~BuddyAllocator(); + + void* Alloc(size_t size); + void Free(void*); + size_t Used(); + + private: + struct Block { + size_t size_; + Block* left_; // left buddy + Block* right_; // right buddy + }; + + // Initially, there is only one pool. If an Alloc cannot find enough + // memory in the existing pools, and there are fewer than max_num_pools_ + // pools, create a new pool by calling system_allocator_->Alloc(pool_size_). + std::vector<void*> pools_; + + size_t pool_size_; // the size of each pool; + size_t max_num_pools_; // the maximum number of pools; + + SystemAllocator* system_allocator_; + + std::mutex mutex_; + + // Disable copy and assignment. + BuddyAllocator(const BuddyAllocator&) = delete; + BuddyAllocator& operator=(const BuddyAllocator&) = delete; +}; + +BuddyAllocator* GetCPUBuddyAllocator() { + static BuddyAllocator* a = nullptr; + if (a == nullptr) { + a = new BuddyAllocator(); + } + return a; +} + +#ifndef PADDLE_ONLY_CPU // The following code is for CUDA. + +BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { + static BuddyAllocator** as = NULL; + if (as == NULL) { + int gpu_num = platform::GetDeviceCount(); + as = new BuddyAllocator*[gpu_num]; + for (int gpu = 0; gpu < gpu_num; gpu++) { + as[gpu] = new BuddyAllocator(); + } + } + return as[gpu_id]; +} + +#endif // PADDLE_ONLY_CPU + +} // namespace detail +} // namespace memory +} // namespace paddle
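A minimal usage sketch of the interfaces declared in `buddy_allocator.h` above (pool sizes are illustrative, and this snippet is not part of the PR; it assumes the declared interface is fully implemented):

```cpp
#include "paddle/memory/detail/buddy_allocator.h"

int main() {
  // Wire a BuddyAllocator to a CPUAllocator fallback, as the design doc describes.
  paddle::memory::detail::CPUAllocator system;
  paddle::memory::detail::BuddyAllocator buddy(
      /* pool_size */ 1 << 20, /* max_pools */ 8, &system);
  void* p = buddy.Alloc(4096);  // served from the buddy-managed pool
  buddy.Free(p);
  return 0;
}
```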
diff --git a/paddle/memory/detail/system_allocator.cc b/paddle/memory/detail/system_allocator.cc new file mode 100644 index 0000000000000000000000000000000000000000..50bec926f83dee8a4343d0b16aeb088f9d2a4871 --- /dev/null +++ b/paddle/memory/detail/system_allocator.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/detail/system_allocator.h" + +#include <stdlib.h> // for malloc and free +#include <sys/mman.h> // for mlock and munlock + +#include "gflags/gflags.h" +#include "paddle/platform/assert.h" +#include "paddle/platform/cuda.h" + +// If use_pinned_memory is true, CPUAllocator calls mlock, which +// returns pinned and locked memory as staging areas for data exchange +// between host and device. Allocating too much would reduce the amount +// of memory available to the system for paging. So, by default, we +// set use_pinned_memory to false. +DEFINE_bool(use_pinned_memory, false, + "If set, allocate cpu/gpu pinned memory."); + +namespace paddle { +namespace memory { +namespace detail { + +void* CPUAllocator::Alloc(size_t size) { + // According to http://www.cplusplus.com/reference/cstdlib/malloc/, + // malloc might not return nullptr if size is zero, but the returned + // pointer shall not be dereferenced -- so we make it nullptr. + if (size <= 0) return nullptr; + + void* p = malloc(size); + if (p != nullptr && FLAGS_use_pinned_memory) { + mlock(p, size); + } + return p; +} + +void CPUAllocator::Free(void* p, size_t size) { + if (p != nullptr && FLAGS_use_pinned_memory) { + munlock(p, size); + } + free(p); +} + +#ifndef PADDLE_ONLY_CPU + +void* GPUAllocator::Alloc(size_t size) { + // CUDA documentation doesn't explain if cudaMalloc returns nullptr + // if size is 0. We just make sure it does. + if (size <= 0) { + return nullptr; + } + + void* p = 0; + cudaError_t result = + FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size); + if (result != cudaSuccess) { + cudaGetLastError(); // clear error if there is any. + } + return result == cudaSuccess ? p : nullptr; +} + +void GPUAllocator::Free(void* p, size_t size) { + // Purposefully allow cudaErrorCudartUnloading, because + // that is returned if you ever call cudaFree after the + // driver has already shut down. This happens only if the + // process is terminating, in which case we don't care if + // cudaFree succeeds. + cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p); + if (err != cudaErrorCudartUnloading) { + platform::throw_on_error(err, "cudaFree{Host} failed"); + } +} + +#endif // PADDLE_ONLY_CPU + +} // namespace detail +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h new file mode 100644 index 0000000000000000000000000000000000000000..184b383f7f78244fa6632a3bffb1a0a78b3aa664 --- /dev/null +++ b/paddle/memory/detail/system_allocator.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License.
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <stddef.h> // for size_t + +namespace paddle { +namespace memory { +namespace detail { + +// SystemAllocator is the parent class of CPUAllocator and +// GPUAllocator. A BuddyAllocator object uses a SystemAllocator* +// pointing to the underlying system allocator. An alternative to +// this class hierarchy is to pass a system allocator class to +// BuddyAllocator as a template parameter. This approach makes +// BuddyAllocator a class template, and its very complicated +// algorithm would make buddy_allocator.h messy. +class SystemAllocator { + public: + virtual ~SystemAllocator() {} + virtual void* Alloc(size_t size) = 0; + virtual void Free(void* p, size_t size) = 0; +}; + +class CPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t size); + virtual void Free(void* p, size_t size); +}; + +#ifndef PADDLE_ONLY_CPU +class GPUAllocator : public SystemAllocator { + public: + virtual void* Alloc(size_t size); + virtual void Free(void* p, size_t size); +}; +#endif // PADDLE_ONLY_CPU + +} // namespace detail +} // namespace memory +} // namespace paddle
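The comment block above motivates the virtual-base design: any allocator honoring the `Alloc`/`Free` contract can sit behind `BuddyAllocator`. As a hypothetical illustration (`TrackingAllocator` and its members are invented for this sketch, not part of the PR):

```cpp
#include <cstddef>

#include "paddle/memory/detail/system_allocator.h"

// A decorator that counts outstanding bytes while delegating the real work
// to another system allocator. Purely illustrative.
class TrackingAllocator : public paddle::memory::detail::SystemAllocator {
 public:
  explicit TrackingAllocator(SystemAllocator* inner) : inner_(inner) {}
  void* Alloc(size_t size) override {
    in_use_ += size;
    return inner_->Alloc(size);
  }
  void Free(void* p, size_t size) override {
    in_use_ -= size;
    inner_->Free(p, size);
  }
  size_t InUse() const { return in_use_; }

 private:
  SystemAllocator* inner_;
  size_t in_use_ = 0;
};
```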
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9bd5706a4e4d1546a8c879ebbac0f3349c9d59f6 --- /dev/null +++ b/paddle/memory/detail/system_allocator_test.cc @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/detail/system_allocator.h" + +#include <memory> +#include <vector> + +#include "gflags/gflags.h" +#include "gtest/gtest.h" + +DECLARE_bool(use_pinned_memory); + +void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) { + bool freed = false; + { + void* p = a.Alloc(size); + if (size > 0) { + EXPECT_NE(p, nullptr); + } else { + EXPECT_EQ(p, nullptr); + } + + int* i = static_cast<int*>(p); + std::shared_ptr<int> ptr(i, [&](void* p) { + freed = true; + a.Free(p, size); + }); + } + EXPECT_TRUE(freed); +} + +TEST(CPUAllocator, NoLockMem) { + FLAGS_use_pinned_memory = false; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +TEST(CPUAllocator, LockMem) { + FLAGS_use_pinned_memory = true; + paddle::memory::detail::CPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} + +#ifndef PADDLE_ONLY_CPU +TEST(GPUAllocator, NoStaging) { + FLAGS_use_pinned_memory = false; + paddle::memory::detail::GPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} +TEST(GPUAllocator, Staging) { + FLAGS_use_pinned_memory = true; + paddle::memory::detail::GPUAllocator a; + TestAllocator(a, 2048); + TestAllocator(a, 0); +} +#endif // PADDLE_ONLY_CPU
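A side note on `TestAllocator` above: it uses `std::shared_ptr`'s custom-deleter hook to prove that `Free` runs exactly once when the pointer leaves scope. The idiom in isolation (plain `new`/`delete` here, purely illustrative):

```cpp
#include <cassert>
#include <memory>

int main() {
  bool freed = false;
  {
    std::shared_ptr<int> guard(new int(0), [&](int* p) {
      freed = true;  // runs when the last reference goes away
      delete p;
    });
  }  // guard leaves scope here, invoking the deleter
  assert(freed);
  return 0;
}
```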
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d123d99e234a378ee64850eebacece223e2b121 --- /dev/null +++ b/paddle/memory/memory.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/memory/memory.h" +#include "paddle/memory/detail/buddy_allocator.h" +#include "paddle/memory/detail/system_allocator.h" +#include "paddle/platform/assert.h" + +#include <boost/variant.hpp> + +namespace paddle { +namespace memory { + +void* Alloc(platform::Place pl, size_t size) { +#ifndef PADDLE_ONLY_CPU + if (paddle::platform::is_gpu_place(pl)) { + size_t gpu_id = boost::get<platform::GPUPlace>(pl).device; + return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size); + } +#endif // PADDLE_ONLY_CPU + PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); + return detail::GetCPUBuddyAllocator()->Alloc(size); +} + +void Free(paddle::platform::Place pl, void* p) { +#ifndef PADDLE_ONLY_CPU + if (paddle::platform::is_gpu_place(pl)) { + size_t gpu_id = boost::get<platform::GPUPlace>(pl).device; + detail::GetGPUBuddyAllocator(gpu_id)->Free(p); + return; + } +#endif // PADDLE_ONLY_CPU + PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); + detail::GetCPUBuddyAllocator()->Free(p); +} + +size_t Used(paddle::platform::Place pl) { +#ifndef PADDLE_ONLY_CPU + if (paddle::platform::is_gpu_place(pl)) { + size_t gpu_id = boost::get<platform::GPUPlace>(pl).device; + return detail::GetGPUBuddyAllocator(gpu_id)->Used(); + } +#endif // PADDLE_ONLY_CPU + PADDLE_ASSERT(paddle::platform::is_cpu_place(pl)); + return detail::GetCPUBuddyAllocator()->Used(); +} + +} // namespace memory +} // namespace paddle diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h new file mode 100644 index 0000000000000000000000000000000000000000..a33092bade65e6df0faee226a8967c9fc9caa032 --- /dev/null +++ b/paddle/memory/memory.h @@ -0,0 +1,27 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include "paddle/platform/place.h" + +namespace paddle { +namespace memory { + +void* Alloc(paddle::platform::Place, size_t); +void Free(paddle::platform::Place, void*); +size_t Used(paddle::platform::Place); + +} // namespace memory +} // namespace paddle diff --git a/paddle/optimizer/CMakeLists.txt b/paddle/optimizer/CMakeLists.txt index 35f04789cfe6dc445973f0f922269f6f78b713a3..926fee47e1f86efa60dc40a2727edb06499bec4f 100644 --- a/paddle/optimizer/CMakeLists.txt +++ b/paddle/optimizer/CMakeLists.txt @@ -10,7 +10,7 @@ set(OPITMIZER_SRCS ) add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS}) -add_dependencies(paddle_optimizer gen_proto_cpp) +add_dependencies(paddle_optimizer paddle_proto ${external_project_dependencies}) if(WITH_TESTING) diff --git a/paddle/parameter/CMakeLists.txt b/paddle/parameter/CMakeLists.txt index a35e46997fb04e9378e106bf428a629b286c2e8c..d2ae1c16c6b7316f1a6facdef4b933693d6ba818 100644 --- a/paddle/parameter/CMakeLists.txt +++ b/paddle/parameter/CMakeLists.txt @@ -7,7 +7,7 @@ add_library(paddle_parameter STATIC ${PARAMETERS_SOURCES}) add_style_check_target(paddle_parameter ${PARAMETERS_SOURCES}) add_style_check_target(paddle_parameter ${PARAMETERS_HEADERS}) -add_dependencies(paddle_parameter gen_proto_cpp) +add_dependencies(paddle_parameter paddle_proto ${external_project_dependencies}) if(WITH_TESTING) add_subdirectory(tests) endif() diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp index f826e8448c666bb3305c150f2bd95aade23223fb..c8b47687f5d3c00f6609b858103a5fec526b970a 100644 --- a/paddle/parameter/ParameterUpdaterHook.cpp +++ b/paddle/parameter/ParameterUpdaterHook.cpp @@ -14,11 +14,13 @@ limitations under the License. */ #include "ParameterUpdaterHook.h" +#include <algorithm> #include <atomic> #include <fstream> #include <mutex> #include <thread> #include <unordered_map> +#include <vector> #include "paddle/math/Vector.h" #include "paddle/parameter/Parameter.h" @@ -29,106 +31,76 @@ namespace paddle { /** * The static pruning hook - * - * Static means user load a mask map before training started. This map will - * define which link/weight between neural is disabled. + * Static means the user specifies a sparsity_ratio before training starts, and the + * network will prune the parameters based on the sparsity_ratio. More details + * can be found at https://arxiv.org/pdf/1506.02626.pdf. */ + class StaticPruningHook : public IParameterUpdaterHook { public: - /** - * The Mask Map Header. - * The map file started with this header. - * - * In Version 0, reset file will be: - * contains header.size bit, each bit means such weight is enabled or not. - * if bit is 1, then such weight is enabled. - * at end, the file will round to byte, and the low bits of end byte will be - * filled by zero.
- * - */ - struct StaticMaskHeader { - uint32_t version; - size_t size; - } __attribute__((__packed__)); - - explicit StaticPruningHook(const std::string& mask_filename) : initCount_(0) { - bool ok = this->loadMaskFile(mask_filename); - if (!ok) { - LOG(WARNING) << "Fail to load mask file " << mask_filename - << " in current directory, searching in init_model_path"; - std::string combineMaskFilename = - path::join(FLAGS_init_model_path, mask_filename); - CHECK(this->loadMaskFile(combineMaskFilename)) - << "Cannot load " << mask_filename << " in ./" << mask_filename - << " and " << combineMaskFilename; - } - VLOG(3) << mask_filename << " mask size = " << this->mask_.size(); + explicit StaticPruningHook(const ParameterUpdaterHookConfig &hookConfig) + : initCount_(0) { + sparsityRatio_ = hookConfig.sparsity_ratio(); } + static bool sortPairAscend(const std::pair<real, size_t> &pair1, + const std::pair<real, size_t> &pair2) { + return pair1.first > pair2.first; + } + + void update(Parameter *para) { updateThreadChecker_.check(); - auto& vec = para->getBuf(PARAMETER_GRADIENT); + auto &vec = para->getBuf(PARAMETER_GRADIENT); if (vec) { vec->dotMul(*maskVec_); } } - void init(Parameter* para) { - size_t initCount = this->initCount_.fetch_add(1); - CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must invoke " - "in same ParamterUpdater"; - VLOG(3) << "Initialize Parameter " << para; - SetDevice device(para->getDeviceId()); + void generateMask(Parameter *para) { + VectorPtr maskTemp = Vector::create(para->getSize(), false); + maskTemp->zeroMem(); + real *maskTempData = maskTemp->getData(); + size_t nonZeroNum = para->getSize() * (1 - sparsityRatio_); - auto maskVec = Vector::create(this->mask_.size(), false); - { // Initialize maskVec with float mask vector - real* dataPtr = maskVec->getData(); - size_t i = 0; - for (bool m : mask_) { - dataPtr[i++] = m ? 1.0 : 0.0; - } - } + VectorPtr paraVec = para->getBuf(PARAMETER_VALUE); + VectorPtr paraCpuCopy = Vector::create(para->getSize(), false); + + paraCpuCopy->copyFrom(*paraVec); + std::vector<std::pair<real, size_t>> param; + + for (size_t i = 0; i < para->getSize(); i++) + param.push_back(std::make_pair(fabs(paraCpuCopy->getData()[i]), i)); + + std::partial_sort( + param.begin(), param.begin() + nonZeroNum, param.end(), sortPairAscend); + for (size_t i = 0; i < nonZeroNum; i++) maskTempData[param[i].second] = 1.0; // Currently just use a mask vector for hack. - // @TODO(yuyang18): Implemented the mask operation in vector.
if (para->useGpu()) { - maskVec_ = Vector::create(this->mask_.size(), para->useGpu()); - maskVec_->copyFrom(*maskVec); + maskVec_ = Vector::create(para->getSize(), para->useGpu()); + maskVec_->copyFrom(*maskTemp); } else { - maskVec_ = maskVec; + maskVec_ = maskTemp; } - - auto& vec = para->getBuf(PARAMETER_VALUE); - vec->dotMul(*maskVec_); } -private: - bool loadMaskFile(const std::string& mask_filename) { - std::ifstream fin; - fin.open(mask_filename); - if (fin.is_open()) { - StaticMaskHeader header; - fin.read(reinterpret_cast<char*>(&header), sizeof(StaticMaskHeader)); - CHECK_EQ(header.version, 0UL); - mask_.resize(header.size); - uint8_t buf; - for (size_t i = 0; i < header.size; ++i, buf <<= 1) { - if (i % 8 == 0) { - fin.read(reinterpret_cast<char*>(&buf), sizeof(uint8_t)); - } - mask_[i] = buf & 0x80; - } - fin.close(); - return true; - } else { - return false; - } + void init(Parameter *para) { + generateMask(para); + size_t initCount = this->initCount_.fetch_add(1); + CHECK_EQ(initCount, 0UL) << "Currently the StaticPruningHook must be invoked " + "in the same ParameterUpdater"; + VLOG(3) << "Initialize Parameter " << para; + SetDevice device(para->getDeviceId()); + + auto &paraVec = para->getBuf(PARAMETER_VALUE); + paraVec->dotMul(*maskVec_); } +private: SameThreadChecker updateThreadChecker_; std::atomic<size_t> initCount_; VectorPtr maskVec_; - std::vector<bool> mask_; + real sparsityRatio_; }; IParameterUpdaterHook::IParameterUpdaterHook() {} @@ -145,7 +117,7 @@ IParameterUpdaterHook::~IParameterUpdaterHook() {} */ class StringIntPairHasher { public: - size_t operator()(const std::pair<std::string, int>& k) const { + size_t operator()(const std::pair<std::string, int> &k) const { return intHasher_(strHasher_(k.first) + k.second); } @@ -162,19 +134,19 @@ static WeakKVCache<std::pair<std::string, int>, /** * ParameterUpdaterHook actually factory method. */ -static IParameterUpdaterHook* createImpl( - const ParameterUpdaterHookConfig& config) { - auto& type = config.type(); +static IParameterUpdaterHook *createImpl( + const ParameterUpdaterHookConfig &config) { + auto &type = config.type(); if (type == "pruning") { - if (config.has_purning_mask_filename()) { - return new StaticPruningHook(config.purning_mask_filename()); - } + return new StaticPruningHook(config); } + + LOG(FATAL) << "Unknown Hook type: " << type; return nullptr; } std::shared_ptr<IParameterUpdaterHook> IParameterUpdaterHook::create( - const ParameterConfig& paramConfig, int idx) { + const ParameterConfig &paramConfig, int idx) { std::pair<std::string, int> key = {paramConfig.name(), idx}; return g_hookCache_.get( key, [&] { return createImpl(paramConfig.update_hooks(idx)); });
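To make the new pruning rule concrete: `generateMask` keeps the `para->getSize() * (1 - sparsity_ratio)` weights of largest magnitude and zeroes the rest. A standalone sketch of that selection using plain standard-library types (illustrative only; the real hook operates on Paddle's `Vector` buffers as shown above):

```cpp
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

// Return a 0/1 mask keeping the (1 - sparsity_ratio) fraction of weights
// with the largest absolute value, mirroring StaticPruningHook::generateMask.
std::vector<float> MakeMask(const std::vector<float>& w, float sparsity_ratio) {
  size_t keep = static_cast<size_t>(w.size() * (1 - sparsity_ratio));
  std::vector<std::pair<float, size_t>> mag;  // (|w_i|, i)
  for (size_t i = 0; i < w.size(); ++i) mag.emplace_back(std::fabs(w[i]), i);
  std::partial_sort(mag.begin(), mag.begin() + keep, mag.end(),
                    [](const std::pair<float, size_t>& a,
                       const std::pair<float, size_t>& b) {
                      return a.first > b.first;  // larger magnitudes first
                    });
  std::vector<float> mask(w.size(), 0.0f);
  for (size_t i = 0; i < keep; ++i) mask[mag[i].second] = 1.0f;
  return mask;
}
```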
diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h new file mode 100644 index 0000000000000000000000000000000000000000..8fe891f9ce6c3add1df48a8b1f79fd811c7a4362 --- /dev/null +++ b/paddle/platform/cuda.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifndef PADDLE_ONLY_CPU + +#include <thrust/system/cuda/error.h> +#include <thrust/system_error.h> + +namespace paddle { +namespace platform { + +inline void throw_on_error(cudaError_t e, const char* message) { + if (e) { + throw thrust::system_error(e, thrust::cuda_category(), message); + } +} + +inline int GetDeviceCount(void) { + int count; + throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed"); + return count; +} + +} // namespace platform +} // namespace paddle + +#endif // PADDLE_ONLY_CPU diff --git a/paddle/platform/place.cc b/paddle/platform/place.cc index 1afd03c01169d395b086c1da458ce25c66a12a51..0704820aa05079401eb56814d689d6e280311edb 100644 --- a/paddle/platform/place.cc +++ b/paddle/platform/place.cc @@ -8,8 +8,8 @@ namespace detail { class PlacePrinter : public boost::static_visitor<> { public: PlacePrinter(std::ostream &os) : os_(os) {} - void operator()(const CpuPlace &) { os_ << "CpuPlace"; } - void operator()(const GpuPlace &p) { os_ << "GpuPlace(" << p.device << ")"; } + void operator()(const CPUPlace &) { os_ << "CPUPlace"; } + void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; } private: std::ostream &os_; @@ -22,14 +22,14 @@ static Place the_default_place; void set_place(const Place &place) { the_default_place = place; } const Place &get_place() { return the_default_place; } -const GpuPlace default_gpu() { return GpuPlace(0); } -const CpuPlace default_cpu() { return CpuPlace(); } +const GPUPlace default_gpu() { return GPUPlace(0); } +const CPUPlace default_cpu() { return CPUPlace(); } bool is_gpu_place(const Place &p) { - return boost::apply_visitor(IsGpuPlace(), p); + return boost::apply_visitor(IsGPUPlace(), p); } bool is_cpu_place(const Place &p) { - return !boost::apply_visitor(IsGpuPlace(), p); + return !boost::apply_visitor(IsGPUPlace(), p); } bool places_are_same_class(const Place &p1, const Place &p2) { diff --git a/paddle/platform/place.h b/paddle/platform/place.h index 489572c526e162500c8f747f0ec8df10da9d86a2..7cead183884bc9379355cd931921b40d6c11ce90 100644 --- a/paddle/platform/place.h +++ b/paddle/platform/place.h @@ -1,43 +1,58 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + #pragma once + #include <boost/variant.hpp> #include <iostream> namespace paddle { namespace platform { -struct CpuPlace { +struct CPUPlace { // WORKAROUND: for some reason, omitting this constructor // causes errors with boost 1.59 and OSX - CpuPlace() {} + CPUPlace() {} // needed for variant equality comparison - inline bool operator==(const CpuPlace &) const { return true; } - inline bool operator!=(const CpuPlace &) const { return false; } + inline bool operator==(const CPUPlace &) const { return true; } + inline bool operator!=(const CPUPlace &) const { return false; } }; -struct GpuPlace { - GpuPlace() : GpuPlace(0) {} - GpuPlace(int d) : device(d) {} +struct GPUPlace { + GPUPlace() : GPUPlace(0) {} + GPUPlace(int d) : device(d) {} // needed for variant equality comparison - inline bool operator==(const GpuPlace &o) const { return device == o.device; } - inline bool operator!=(const GpuPlace &o) const { return !(*this == o); } + inline bool operator==(const GPUPlace &o) const { return device == o.device; } + inline bool operator!=(const GPUPlace &o) const { return !(*this == o); } int device; }; -struct IsGpuPlace : public boost::static_visitor<bool> { - bool operator()(const CpuPlace &) const { return false; } - bool operator()(const GpuPlace &gpu) const { return true; } +struct IsGPUPlace : public boost::static_visitor<bool> { + bool operator()(const CPUPlace &) const { return false; } + bool operator()(const GPUPlace &gpu) const { return true; } }; -typedef boost::variant<CpuPlace, GpuPlace> Place; +typedef boost::variant<CPUPlace, GPUPlace> Place; void set_place(const Place &); const Place &get_place(); -const GpuPlace default_gpu(); -const CpuPlace default_cpu(); +const GPUPlace default_gpu(); +const CPUPlace default_cpu(); bool is_gpu_place(const Place &); bool is_cpu_place(const Place &); diff --git a/paddle/platform/place_test.cc b/paddle/platform/place_test.cc index 73fccceedf6918148a26100f64cf322305c3ac20..33e2e5a439ce6801c02daba4bcbd462a74d7a614 100644 --- a/paddle/platform/place_test.cc +++ b/paddle/platform/place_test.cc @@ -3,8 +3,8 @@ #include "gtest/gtest.h" TEST(Place, Equality) { - paddle::platform::CpuPlace cpu; - paddle::platform::GpuPlace g0(0), g1(1), gg0(0); + paddle::platform::CPUPlace cpu; + paddle::platform::GPUPlace g0(0), g1(1), gg0(0); EXPECT_EQ(cpu, cpu); EXPECT_EQ(g0, g0); @@ -22,19 +22,19 @@ TEST(Place, Default) { EXPECT_TRUE(paddle::platform::is_gpu_place(paddle::platform::default_gpu())); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::default_cpu())); - paddle::platform::set_place(paddle::platform::CpuPlace()); + paddle::platform::set_place(paddle::platform::CPUPlace()); EXPECT_TRUE(paddle::platform::is_cpu_place(paddle::platform::get_place())); } TEST(Place, Print) { { std::stringstream ss; - ss << paddle::platform::GpuPlace(1); - EXPECT_EQ("GpuPlace(1)", ss.str()); + ss << paddle::platform::GPUPlace(1); + EXPECT_EQ("GPUPlace(1)", ss.str()); } { std::stringstream ss; - ss << paddle::platform::CpuPlace(); - EXPECT_EQ("CpuPlace", ss.str()); + ss << paddle::platform::CPUPlace(); + EXPECT_EQ("CPUPlace", ss.str()); } } diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt index b7f85ea1a6dfda2a37c315ba15c6ca1979cf4131..2245c7d88ca74922f9919db91977dfa6cb3ca468 100644 --- a/paddle/pserver/CMakeLists.txt +++ b/paddle/pserver/CMakeLists.txt @@ -17,7 +17,7 @@ add_library(paddle_network STATIC add_style_check_target(paddle_network ${NETWORK_SOURCES}) add_style_check_target(paddle_network ${NETWORK_HEADERS}) -add_dependencies(paddle_network gen_proto_cpp) +add_dependencies(paddle_network paddle_proto
${external_project_dependencies}) ################### paddle_pserver ###################### set(PSERVER_SOURCES @@ -40,7 +40,7 @@ add_library(paddle_pserver STATIC add_style_check_target(paddle_pserver ${PSERVER_SOURCES}) add_style_check_target(paddle_pserver ${PSERVER_HEADERS}) -add_dependencies(paddle_pserver gen_proto_cpp) +add_dependencies(paddle_pserver paddle_proto ${external_project_dependencies}) set(PSERVER_MAIN_SOURCES ParameterServer2Main.cpp) diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index edc2e0292378fea0cd904d7f017762c1dade6caf..43614b9779d21795f1f274589ea93639e923ce75 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -109,6 +109,10 @@ class DenseScanner(IScanner): if len(self.__shape__) > 3: raise ValueError( "The dimension of input cannot be greater than 3.") + if len(self.__shape__) == 0: + raise ValueError( + "The input should be a vector, please check your input data." + ) self.__dim__ = reduce(lambda x, y: x * y, self.__shape__) if len(self.__shape__) == 1 and self.__dim__ != self.input_type.dim: raise ValueError( @@ -140,7 +144,7 @@ class DenseScanner(IScanner): if len(self.__shape__) > 1: # The last-two dimenstions are the frame height and width. # For example, the layout is CHW for 3-D feature of image. - # The H and W are the fram height and width. + # The H and W are the frame height and width. h, w = self.__shape__[-2:] argument.setSlotFrameHeight(self.pos, h) argument.setSlotFrameWidth(self.pos, w) diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 2b48e4dc0f875be9a87797fa14885926999a5010..a182e5f4aef9de8c6f20681328d5ba6c0e6944ef 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -31,6 +31,7 @@ Configuring cmake in /paddle/build ... -DWITH_DOC=OFF -DWITH_GPU=${WITH_GPU:-OFF} -DWITH_AVX=${WITH_AVX:-OFF} + -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DWITH_SWIG_PY=ON -DCUDNN_ROOT=/usr/ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} @@ -43,6 +44,7 @@ cmake .. \ -DWITH_DOC=OFF \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ + -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DWITH_SWIG_PY=ON \ -DCUDNN_ROOT=/usr/ \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh deleted file mode 100755 index f2cbc561652a3c7502de94be37d75783fc40b9c1..0000000000000000000000000000000000000000 --- a/paddle/scripts/travis/build_and_test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -source ./common.sh - -NPROC=1 -export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages -export PYTHONHOME=/opt/python/2.7.12 -export PATH=/opt/python/2.7.12/bin:${PATH} -cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} -NRPOC=`nproc` -make -j $NPROC -make coveralls -sudo make install diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/build_doc.sh similarity index 84% rename from paddle/scripts/travis/docs.sh rename to paddle/scripts/travis/build_doc.sh index c784293695bf134b5e990639778b6e84ba45d00d..a44bd35357fde41c379134bed6b7fb242efe49e5 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -1,15 +1,19 @@ #!/bin/bash +set -e + +# Create the build directory for CMake. +mkdir -p $TRAVIS_BUILD_DIR/build +cd $TRAVIS_BUILD_DIR/build -# Add set -e, cd to directory. -source ./common.sh # Compile Documentation only. 
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS} +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF + mkdir output make -j `nproc` find .. -name '*whl' | xargs pip install # install all wheels. rm -rf * -cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS} -make paddle_docs paddle_docs_cn +cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON +make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links linkchecker doc/en/html/index.html diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/check_style.sh similarity index 54% rename from paddle/scripts/travis/precommit.sh rename to paddle/scripts/travis/check_style.sh index 7a59b1131d0a410be9c5cef08e3cc11633d2ba67..4754bdd4c80de9700d92b0e33ecfdfc582f42813 100755 --- a/paddle/scripts/travis/precommit.sh +++ b/paddle/scripts/travis/check_style.sh @@ -1,14 +1,14 @@ #!/bin/bash function abort(){ - echo "Your commit not fit PaddlePaddle code style" 1>&2 - echo "Please use pre-commit scripts to auto-format your code" 1>&2 + echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 + echo "Please use pre-commit to reformat your code and git push again." 1>&2 exit 1 } trap 'abort' 0 set -e -source common.sh -cd .. + +cd $TRAVIS_BUILD_DIR export PATH=/usr/bin:$PATH pre-commit install clang-format --version diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh deleted file mode 100755 index f05c7530a3b0632948e4b18c477d6dc6aad04c03..0000000000000000000000000000000000000000 --- a/paddle/scripts/travis/common.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -set -e -mkdir -p ../../../build -cd ../../../build -mkdir -p $HOME/third_party -EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party" diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh deleted file mode 100755 index 13f2552d29db38041a73edca0acd202945c67484..0000000000000000000000000000000000000000 --- a/paddle/scripts/travis/main.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/bin/bash -cd `dirname $0` - -if [ ${JOB} == "BUILD_AND_TEST" ]; then - ./build_and_test.sh -elif [ ${JOB} == "DOCS" ]; then - ./docs.sh -elif [ ${JOB} == "PRE_COMMIT" ]; then - ./precommit.sh -else - echo Unknown job ${JOB} - exit 1 -fi diff --git a/paddle/string/CMakeLists.txt b/paddle/string/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5becf62672d0c606c98ea1a1a4383df97088ab05 --- /dev/null +++ b/paddle/string/CMakeLists.txt @@ -0,0 +1,4 @@ +cc_library(stringpiece SRCS piece.cc) +cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) + +cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) diff --git a/paddle/string/piece.cc b/paddle/string/piece.cc new file mode 100644 index 0000000000000000000000000000000000000000..b80afdec82d642fd3a8245b96ce1bb2bea17cbae --- /dev/null +++ b/paddle/string/piece.cc @@ -0,0 +1,138 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +#include "paddle/string/piece.h" + +#include <string.h> + +#include <algorithm> +#include <iosfwd> +#include <stdexcept> + +namespace paddle { +namespace string { + +Piece::Piece() : data_(NULL), size_(0) {} + +Piece::Piece(const char* d, size_t n) : data_(d), size_(n) { + if (d == NULL && n != 0) + throw std::invalid_argument("Piece requires len to be 0 for NULL data"); +} + +Piece::Piece(const char* s) : data_(s) { size_ = (s == NULL) ? 0 : strlen(s); } + +Piece::Piece(const std::string& s) : data_(s.data()), size_(s.size()) {} + +char Piece::operator[](size_t n) const { + if (n >= len()) throw std::invalid_argument("index out of Piece length"); + return data_[n]; +} + +int Compare(Piece a, Piece b) { + const size_t min_len = (a.len() < b.len()) ? a.len() : b.len(); + int r = memcmp(a.data(), b.data(), min_len); + if (r == 0) { + if (a.len() < b.len()) + return -1; + else if (a.len() > b.len()) + return 1; + } + return r; +} + +bool operator==(Piece x, Piece y) { + return ((x.len() == y.len()) && + (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0)); +} + +bool operator!=(Piece x, Piece y) { return !(x == y); } + +bool operator<(Piece x, Piece y) { return Compare(x, y) < 0; } +bool operator>(Piece x, Piece y) { return Compare(x, y) > 0; } + +bool operator<=(Piece x, Piece y) { return Compare(x, y) <= 0; } +bool operator>=(Piece x, Piece y) { return Compare(x, y) >= 0; } + +bool HasPrefix(Piece s, Piece x) { + return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0)); +} + +bool HasSuffix(Piece s, Piece x) { + return ((s.len() >= x.len()) && + (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0)); +} + +Piece SkipPrefix(Piece s, size_t n) { + if (n > s.len()) + throw std::invalid_argument("Skip distance larger than Piece length"); + return Piece(s.data() + n, s.len() - n); +} + +Piece SkipSuffix(Piece s, size_t n) { + if (n > s.len()) + throw std::invalid_argument("Skip distance larger than Piece length"); + return Piece(s.data(), s.len() - n); +} + +Piece TrimPrefix(Piece s, Piece x) { + return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s; +} + +Piece TrimSuffix(Piece s, Piece x) { + return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s; +} + +bool Contains(Piece s, Piece sub) { + return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end(); +} + +size_t Index(Piece s, Piece sub) { + auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end()); + return e != s.end() ? e - s.data() : Piece::npos; +} + +size_t Find(Piece s, char c, size_t pos) { + if (pos >= s.len()) { + return Piece::npos; + } + const char* result = + reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos)); + return result != nullptr ?
result - s.data() : Piece::npos; +} + +size_t RFind(Piece s, char c, size_t pos) { + if (s.len() == 0) return Piece::npos; + for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data(); + p--) { + if (*p == c) { + return p - s.data(); + } + } + return Piece::npos; +} + +Piece SubStr(Piece s, size_t pos, size_t n) { + if (pos > s.len()) pos = s.len(); + if (n > s.len() - pos) n = s.len() - pos; + return Piece(s.data() + pos, n); +} + +std::ostream& operator<<(std::ostream& o, Piece piece) { + return o << piece.ToString(); +} + +} // namespace string +} // namespace paddle diff --git a/paddle/strings/stringpiece.h b/paddle/string/piece.h similarity index 57% rename from paddle/strings/stringpiece.h rename to paddle/string/piece.h index adff713e86f49349b8f189c1d24584bfc1bb8aa7..db7c3e69804a6a8f0510ba376432fe560ae74442 100644 --- a/paddle/strings/stringpiece.h +++ b/paddle/string/piece.h @@ -20,33 +20,34 @@ #include <string> namespace paddle { +namespace string { -// StringPiece points into a std::string object but doesn't own the +// Piece points into a std::string object but doesn't own the // string. It is for efficient access to strings. Like Go's string -// type. Not that StringPiece doesn't mutate the underlying string, +// type. Note that Piece doesn't mutate the underlying string, // so it is thread-safe given that the underlying string doesn't -// change. Because StringPiece contains a little data members, and +// change. Because Piece contains only a few data members, and // its syntax is simple as it doesn't own/manage the string, it is -// cheap to construct StringPieces and pass them around. -class StringPiece { +// cheap to construct Pieces and pass them around. +class Piece { public: static const size_t npos = static_cast<size_t>(-1); // We provide non-explicit singleton constructors so users can - // pass in a "const char*" or a "string" wherever a "StringPiece" + // pass in a "const char*" or a "string" wherever a "Piece" // is expected. These contructors ensure that if data_ is NULL, // size_ is 0. - StringPiece(); - StringPiece(const char* d, size_t n); - StringPiece(const char* d); - StringPiece(const std::string& s); + Piece(); + Piece(const char* d, size_t n); + Piece(const char* d); + Piece(const std::string& s); const char* data() const { return data_; } size_t len() const { return size_; } char operator[](size_t n) const; - // StringPiece doesn't own the string, so both iterator and const + // Piece doesn't own the string, so both iterator and const // iterator are const char* indeed.
typedef const char* const_iterator; typedef const char* iterator; @@ -63,43 +64,44 @@ private: // Intentionally copyable }; -int Compare(StringPiece a, StringPiece b); +int Compare(Piece a, Piece b); -bool operator==(StringPiece x, StringPiece y); -bool operator!=(StringPiece x, StringPiece y); -bool operator<(StringPiece x, StringPiece y); -bool operator>(StringPiece x, StringPiece y); -bool operator<=(StringPiece x, StringPiece y); -bool operator>=(StringPiece x, StringPiece y); +bool operator==(Piece x, Piece y); +bool operator!=(Piece x, Piece y); +bool operator<(Piece x, Piece y); +bool operator>(Piece x, Piece y); +bool operator<=(Piece x, Piece y); +bool operator>=(Piece x, Piece y); -bool HasPrefix(StringPiece s, StringPiece prefix); -bool HasSuffix(StringPiece s, StringPiece suffix); +bool HasPrefix(Piece s, Piece prefix); +bool HasSuffix(Piece s, Piece suffix); -StringPiece SkipPrefix(StringPiece s, size_t n); -StringPiece SkipSuffix(StringPiece s, size_t n); +Piece SkipPrefix(Piece s, size_t n); +Piece SkipSuffix(Piece s, size_t n); // Skip the prefix (or suffix) if it matches with the string. -StringPiece TrimPrefix(StringPiece s, StringPiece prefix); -StringPiece TrimSuffix(StringPiece s, StringPiece suffix); +Piece TrimPrefix(Piece s, Piece prefix); +Piece TrimSuffix(Piece s, Piece suffix); // Returns if s contains sub. Any s except for empty s contains an // empty sub. -bool Contains(StringPiece s, StringPiece sub); +bool Contains(Piece s, Piece sub); // Return the first occurrence of sub in s, or npos. If both s and // sub is empty, it returns npos; otherwise, if only sub is empty, it // returns 0. -size_t Index(StringPiece s, StringPiece sub); +size_t Index(Piece s, Piece sub); // Return the first occurrence of c in s[pos:end], or npos. -size_t Find(StringPiece s, char c, size_t pos); +size_t Find(Piece s, char c, size_t pos); // Search range is [0..pos] inclusive. If pos == npos, search everything. -size_t RFind(StringPiece s, char c, size_t pos); +size_t RFind(Piece s, char c, size_t pos); -StringPiece SubStr(StringPiece s, size_t pos, size_t n); +Piece SubStr(Piece s, size_t pos, size_t n); -// allow StringPiece to be logged -std::ostream& operator<<(std::ostream& o, StringPiece piece); +// allow Piece to be logged +std::ostream& operator<<(std::ostream& o, Piece piece); +} // namespace string } // namespace paddle diff --git a/paddle/strings/stringpiece_test.cc b/paddle/string/piece_test.cc similarity index 77% rename from paddle/strings/stringpiece_test.cc rename to paddle/string/piece_test.cc index 2ba66a04f641c3457efa713383484491a213668f..cf5152ff5a3cb0a2afae0c90b787abf291122fa3 100644 --- a/paddle/strings/stringpiece_test.cc +++ b/paddle/string/piece_test.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ -#include "paddle/strings/stringpiece.h" +#include "paddle/string/piece.h" #include @@ -22,42 +22,44 @@ TEST(StringPiece, Construct) { { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ(NULL, s.data()); EXPECT_EQ(0U, s.len()); } - { EXPECT_THROW(paddle::StringPiece s(NULL, 10000U), std::invalid_argument); } { - paddle::StringPiece s(NULL); + EXPECT_THROW(paddle::string::Piece s(NULL, 10000U), std::invalid_argument); + } + { + paddle::string::Piece s(NULL); EXPECT_EQ(0U, s.len()); } { std::string a; EXPECT_EQ(0U, a.size()); - paddle::StringPiece s(a); + paddle::string::Piece s(a); EXPECT_EQ(0U, s.len()); } } TEST(StringPiece, CopyAndAssign) { - paddle::StringPiece empty; + paddle::string::Piece empty; EXPECT_EQ(0U, empty.len()); - paddle::StringPiece a("hello"); - paddle::StringPiece b = a; + paddle::string::Piece a("hello"); + paddle::string::Piece b = a; EXPECT_EQ(b.len(), strlen("hello")); EXPECT_EQ(a, b); std::string storage("hello"); - paddle::StringPiece c(storage); + paddle::string::Piece c(storage); EXPECT_EQ(a, c); EXPECT_NE(a.data(), c.data()); } TEST(StringPiece, Compare) { { - paddle::StringPiece a("hello"); - paddle::StringPiece b("world"); + paddle::string::Piece a("hello"); + paddle::string::Piece b("world"); EXPECT_TRUE(a != b); EXPECT_FALSE(a == b); EXPECT_TRUE(a < b); @@ -68,7 +70,7 @@ TEST(StringPiece, Compare) { EXPECT_GT(Compare(b, a), 0); } { - paddle::StringPiece a, b; + paddle::string::Piece a, b; EXPECT_TRUE(a == b); EXPECT_FALSE(a != b); EXPECT_FALSE(a < b); @@ -82,31 +84,31 @@ TEST(StringPiece, Compare) { TEST(StringPiece, ToString) { { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ(std::string(""), s.ToString()); } { - paddle::StringPiece s(NULL); + paddle::string::Piece s(NULL); EXPECT_EQ(std::string(""), s.ToString()); } { - paddle::StringPiece s("hello"); + paddle::string::Piece s("hello"); EXPECT_EQ(std::string("hello"), s.ToString()); } } TEST(StringPiece, HasPrefixSuffix) { - using paddle::HasPrefix; - using paddle::HasSuffix; + using paddle::string::HasPrefix; + using paddle::string::HasSuffix; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_FALSE(HasPrefix(s, "something")); EXPECT_TRUE(HasPrefix(s, "")); EXPECT_FALSE(HasSuffix(s, "something")); EXPECT_TRUE(HasSuffix(s, "")); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_TRUE(HasPrefix(s, "")); EXPECT_TRUE(HasPrefix(s, "a")); EXPECT_TRUE(HasPrefix(s, "ap")); @@ -120,10 +122,10 @@ TEST(StringPiece, HasPrefixSuffix) { } TEST(StringPiece, SkipPrefixSuffix) { - using paddle::SkipPrefix; - using paddle::SkipSuffix; + using paddle::string::SkipPrefix; + using paddle::string::SkipSuffix; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ("", SkipPrefix(s, 0)); EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument); @@ -131,7 +133,7 @@ TEST(StringPiece, SkipPrefixSuffix) { EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_EQ("app", SkipPrefix(s, 0)); EXPECT_EQ("pp", SkipPrefix(s, 1)); EXPECT_EQ("p", SkipPrefix(s, 2)); @@ -147,10 +149,10 @@ TEST(StringPiece, SkipPrefixSuffix) { } TEST(StringPiece, TrimPrefixSuffix) { - using paddle::TrimPrefix; - using paddle::TrimSuffix; + using paddle::string::TrimPrefix; + using paddle::string::TrimSuffix; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ("", TrimPrefix(s, "")); EXPECT_EQ("", TrimPrefix(s, "something")); @@ -158,7 +160,7 @@ TEST(StringPiece, TrimPrefixSuffix) { EXPECT_EQ("", 
TrimSuffix(s, "something")); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_EQ("app", TrimPrefix(s, "")); EXPECT_EQ("pp", TrimPrefix(s, "a")); EXPECT_EQ("p", TrimPrefix(s, "ap")); @@ -174,14 +176,14 @@ TEST(StringPiece, TrimPrefixSuffix) { } TEST(StringPiece, Contains) { - using paddle::Contains; + using paddle::string::Contains; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_FALSE(Contains(s, "")); EXPECT_FALSE(Contains(s, "something")); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_TRUE(Contains(s, "")); EXPECT_TRUE(Contains(s, "a")); EXPECT_TRUE(Contains(s, "p")); @@ -193,15 +195,15 @@ TEST(StringPiece, Contains) { } TEST(StringPiece, Index) { - using paddle::Index; - auto npos = paddle::StringPiece::npos; + using paddle::string::Index; + auto npos = paddle::string::Piece::npos; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ(npos, Index(s, "")); EXPECT_EQ(npos, Index(s, "something")); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_EQ(0U, Index(s, "")); EXPECT_EQ(0U, Index(s, "a")); EXPECT_EQ(1U, Index(s, "p")); @@ -213,14 +215,14 @@ TEST(StringPiece, Index) { } TEST(StringPiece, Find) { - using paddle::Find; - auto npos = paddle::StringPiece::npos; + using paddle::string::Find; + auto npos = paddle::string::Piece::npos; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ(npos, Find(s, 'a', 0U)); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_EQ(0U, Find(s, 'a', 0U)); EXPECT_EQ(1U, Find(s, 'p', 0U)); EXPECT_EQ(1U, Find(s, 'p', 1U)); @@ -230,14 +232,14 @@ TEST(StringPiece, Find) { } TEST(StringPiece, RFind) { - using paddle::RFind; - auto npos = paddle::StringPiece::npos; + using paddle::string::RFind; + auto npos = paddle::string::Piece::npos; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ(npos, RFind(s, 'a', 0U)); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_EQ(2U, RFind(s, 'p', 2U)); EXPECT_EQ(0U, RFind(s, 'a', 2U)); EXPECT_EQ(1U, RFind(s, 'p', 1U)); @@ -247,15 +249,15 @@ TEST(StringPiece, RFind) { } TEST(StringPiece, SubStr) { - using paddle::SubStr; + using paddle::string::SubStr; { - paddle::StringPiece s; + paddle::string::Piece s; EXPECT_EQ("", SubStr(s, 0, 0)); EXPECT_EQ("", SubStr(s, 0, 1)); EXPECT_EQ("", SubStr(s, 1, 0)); } { - paddle::StringPiece s("app"); + paddle::string::Piece s("app"); EXPECT_EQ("", SubStr(s, 0, 0)); EXPECT_EQ("", SubStr(s, 1, 0)); EXPECT_EQ("", SubStr(s, 2, 0)); @@ -279,15 +281,15 @@ TEST(StringPiece, SubStr) { } TEST(StringPiece, StreamOutput) { - using paddle::StringPiece; + using paddle::string::Piece; std::stringstream o; - o << StringPiece(); + o << paddle::string::Piece(); EXPECT_EQ("", o.str()); - o << StringPiece("hello"); + o << paddle::string::Piece("hello"); EXPECT_EQ("hello", o.str()); - o << StringPiece(); + o << paddle::string::Piece(); EXPECT_EQ("hello", o.str()); } diff --git a/paddle/string/printf.h b/paddle/string/printf.h new file mode 100644 index 0000000000000000000000000000000000000000..8b5ce63a8e8dfe77962ff1e7415911d381a28aac --- /dev/null +++ b/paddle/string/printf.h @@ -0,0 +1,99 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+// Compared with std::stringstream, there are two primary purposes of
+// string::Printf:
+//
+// 1. Type-safe printing, with why and how explained in
+//    http://www.drdobbs.com/stringprintf-a-typesafe-printf-family-fo/184401999.
+//    Implementations include
+//
+//      https://github.com/c42f/tinyformat
+//      boost::format
+//      std::stringstream
+//
+//    std::stringstream is not convenient enough in many cases. For example:
+//
+//      std::cout << std::setprecision(2) << std::fixed << 1.23456 << "\n";
+//
+//    boost::format is the most convenient one. We can have
+//
+//      std::cout << format("%2% %1%") % 36 % 77;
+//
+//    or
+//
+//      format fmter("%2% %1%");
+//      fmter % 36; fmter % 77;
+//      std::cout << fmter.c_str();
+//
+//    But the overloading of % might be overkill, and it would be more
+//    efficient if it could write to std::cout directly.
+//
+//    tinyformat has an interface compatible with the C-printf style,
+//    and it can write to a stream or return a std::string:
+//
+//      std::cout << tfm::printf(
+//          "%s, %s %d, %.2d:%.2d\n",
+//          weekday, month, day, hour, min);
+//
+//    or
+//
+//      tfm::format(std::cout,
+//                  "%s, %s %d, %.2d:%.2d\n",
+//                  weekday, month, day, hour, min);
+//
+// 2. High-performance -- most printed strings are not too long and
+//    don't need dynamic memory allocation. Many StringPrintf
+//    implementations don't enforce type safety, but are
+//    high-performance, including
+//
+//      https://developers.google.com/optimization/reference/base/stringprintf/
+//      https://github.com/adobe/chromium/blob/master/base/stringprintf.h
+//      https://github.com/google/protobuf/blob/master/src/google/protobuf/stubs/stringprintf.h
+//
+// According to
+// https://github.com/c42f/tinyformat#compile-time-and-code-bloat,
+// boost::format runs too slowly and produces large executable binaries.
+// So here we port tinyformat.
+
+#pragma once
+
+#include <iostream>
+#include <sstream>
+
+#include "paddle/string/tinyformat/tinyformat.h"  // https://github.com/c42f/tinyformat
+
+namespace paddle {
+namespace string {
+
+template <typename... Args>
+void Fprintf(std::ostream& out, const char* fmt, const Args&... args) {
+  tinyformat::vformat(out, fmt, tinyformat::makeFormatList(args...));
+}
+
+template <typename... Args>
+std::string Sprintf(const char* fmt, const Args&... args) {
+  std::ostringstream oss;
+  Fprintf(oss, fmt, args...);
+  return oss.str();
+}
+
+template <typename... Args>
+void Printf(const char* fmt, const Args&... args) {
+  Fprintf(std::cout, fmt, args...);
+}
+
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/string/printf_test.cc b/paddle/string/printf_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8f2454165d741b3937f908dcfd87501940750d5
--- /dev/null
+++ b/paddle/string/printf_test.cc
@@ -0,0 +1,16 @@
+#include "paddle/string/printf.h"
+
+#include <string>
+
+#include "gtest/gtest.h"
+
+TEST(StringPrintf, StringPrintf) {
+  std::string weekday = "Wednesday";
+  const char* month = "July";
+  size_t day = 27;
+  long hour = 14;
+  int min = 44;
+  EXPECT_EQ(std::string("Wednesday, July 27, 14:44"),
+            paddle::string::Sprintf(
+                "%s, %s %d, %.2d:%.2d", weekday, month, day, hour, min));
+}
diff --git a/paddle/string/tinyformat/tinyformat.h b/paddle/string/tinyformat/tinyformat.h
new file mode 100644
index 0000000000000000000000000000000000000000..f0e5e0160fb018b813c1dade727da2861a295147
--- /dev/null
+++ b/paddle/string/tinyformat/tinyformat.h
@@ -0,0 +1,902 @@
+// tinyformat.h
+// Copyright (C) 2011, Chris Foster [chris42f (at) gmail (d0t) com]
+//
+// Boost Software License - Version 1.0
+//
+// Permission is hereby granted, free of charge, to any person or organization
+// obtaining a copy of the software and accompanying documentation covered by
+// this license (the "Software") to use, reproduce, display, distribute,
+// execute, and transmit the Software, and to prepare derivative works of the
+// Software, and to permit third-parties to whom the Software is furnished to
+// do so, all subject to the following:
+//
+// The copyright notices in the Software and this entire statement, including
+// the above license grant, this restriction and the following disclaimer,
+// must be included in all copies of the Software, in whole or in part, and
+// all derivative works of the Software, unless such copies or derivative
+// works are solely in the form of machine-executable object code generated by
+// a source language processor.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
+// SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
+// FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
+// ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+// DEALINGS IN THE SOFTWARE.
+
+//------------------------------------------------------------------------------
+// Tinyformat: A minimal type safe printf replacement
+//
+// tinyformat.h is a type safe printf replacement library in a single C++
+// header file. Design goals include:
+//
+// * Type safety and extensibility for user defined types.
+// * C99 printf() compatibility, to the extent possible using std::ostream
+// * Simplicity and minimalism. A single header file to include and distribute
+//   with your projects.
+// * Augment rather than replace the standard stream formatting mechanism +// * C++98 support, with optional C++11 niceties +// +// +// Main interface example usage +// ---------------------------- +// +// To print a date to std::cout: +// +// std::string weekday = "Wednesday"; +// const char* month = "July"; +// size_t day = 27; +// long hour = 14; +// int min = 44; +// +// tfm::printf("%s, %s %d, %.2d:%.2d\n", weekday, month, day, hour, min); +// +// The strange types here emphasize the type safety of the interface; it is +// possible to print a std::string using the "%s" conversion, and a +// size_t using the "%d" conversion. A similar result could be achieved +// using either of the tfm::format() functions. One prints on a user provided +// stream: +// +// tfm::format(std::cerr, "%s, %s %d, %.2d:%.2d\n", +// weekday, month, day, hour, min); +// +// The other returns a std::string: +// +// std::string date = tfm::format("%s, %s %d, %.2d:%.2d\n", +// weekday, month, day, hour, min); +// std::cout << date; +// +// These are the three primary interface functions. There is also a +// convenience function printfln() which appends a newline to the usual result +// of printf() for super simple logging. +// +// +// User defined format functions +// ----------------------------- +// +// Simulating variadic templates in C++98 is pretty painful since it requires +// writing out the same function for each desired number of arguments. To make +// this bearable tinyformat comes with a set of macros which are used +// internally to generate the API, but which may also be used in user code. +// +// The three macros TINYFORMAT_ARGTYPES(n), TINYFORMAT_VARARGS(n) and +// TINYFORMAT_PASSARGS(n) will generate a list of n argument types, +// type/name pairs and argument names respectively when called with an integer +// n between 1 and 16. We can use these to define a macro which generates the +// desired user defined function with n arguments. To generate all 16 user +// defined function bodies, use the macro TINYFORMAT_FOREACH_ARGNUM. For an +// example, see the implementation of printf() at the end of the source file. +// +// Sometimes it's useful to be able to pass a list of format arguments through +// to a non-template function. The FormatList class is provided as a way to do +// this by storing the argument list in a type-opaque way. Continuing the +// example from above, we construct a FormatList using makeFormatList(): +// +// FormatListRef formatList = tfm::makeFormatList(weekday, month, day, hour, +// min); +// +// The format list can now be passed into any non-template function and used +// via a call to the vformat() function: +// +// tfm::vformat(std::cout, "%s, %s %d, %.2d:%.2d\n", formatList); +// +// +// Additional API information +// -------------------------- +// +// Error handling: Define TINYFORMAT_ERROR to customize the error handling for +// format strings which are unsupported or have the wrong number of format +// specifiers (calls assert() by default). +// +// User defined types: Uses operator<< for user defined types by default. +// Overload formatValue() for more control. 
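To make the interface notes above concrete, here is a minimal self-contained sketch (an editor's illustration, not part of the ported header; it uses only the paddle::string wrappers from printf.h introduced earlier in this change, plus the FormatList path documented above):

    #include <iostream>
    #include <string>
    #include "paddle/string/printf.h"

    int main() {
      // Type-safe: %s accepts a std::string and %d a size_t, exactly as
      // the notes above promise.
      std::string weekday = "Wednesday";
      size_t day = 27;
      std::cout << paddle::string::Sprintf("%s %d, %.2d:%.2d\n",
                                           weekday, day, 14, 44);

      // The type-opaque path: capture the arguments once, then format them
      // later through the non-template vformat() function.
      namespace tf = paddle::string::tinyformat;
      tf::FormatListRef list = tf::makeFormatList(weekday, day);
      tf::vformat(std::cout, "%s %d\n", list);
      return 0;
    }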
+
+#pragma once
+
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <sstream>
+
+namespace paddle {
+namespace string {
+namespace tinyformat {
+
+#ifndef TINYFORMAT_ERROR
+#define TINYFORMAT_ERROR(reason) assert(0 && reason)
+#endif
+
+//------------------------------------------------------------------------------
+namespace detail {
+
+// Test whether type T1 is convertible to type T2
+template <typename T1, typename T2>
+struct is_convertible {
+private:
+  // two types of different size
+  struct fail {
+    char dummy[2];
+  };
+  struct succeed {
+    char dummy;
+  };
+  // Try to convert a T1 to a T2 by plugging into tryConvert
+  static fail tryConvert(...);
+  static succeed tryConvert(const T2 &);
+  static const T1 &makeT1();
+
+public:
+  // Standard trick: the (...) version of tryConvert will be chosen from
+  // the overload set only if the version taking a T2 doesn't match.
+  // Then we compare the sizes of the return types to check which
+  // function matched. Very neat, in a disgusting kind of way :)
+  static const bool value = sizeof(tryConvert(makeT1())) == sizeof(succeed);
+};
+
+// Format the value by casting to type fmtT. This default implementation
+// should never be called.
+template <typename T,
+          typename fmtT,
+          bool convertible = is_convertible<T, fmtT>::value>
+struct formatValueAsType {
+  static void invoke(std::ostream & /*out*/, const T & /*value*/) { assert(0); }
+};
+// Specialized version for types that can actually be converted to fmtT, as
+// indicated by the "convertible" template parameter.
+template <typename T, typename fmtT>
+struct formatValueAsType<T, fmtT, true> {
+  static void invoke(std::ostream &out, const T &value) {
+    out << static_cast<fmtT>(value);
+  }
+};
+
+// Convert an arbitrary type to integer. The version with convertible=false
+// throws an error.
+template <typename T, bool convertible = is_convertible<T, int>::value>
+struct convertToInt {
+  static int invoke(const T & /*value*/) {
+    TINYFORMAT_ERROR(
+        "tinyformat: Cannot convert from argument type to "
+        "integer for use as variable width or precision");
+    return 0;
+  }
+};
+// Specialization for convertToInt when conversion is possible
+template <typename T>
+struct convertToInt<T, true> {
+  static int invoke(const T &value) { return static_cast<int>(value); }
+};
+
+// Format at most ntrunc characters to the given stream.
+template <typename T>
+inline void formatTruncated(std::ostream &out, const T &value, int ntrunc) {
+  std::ostringstream tmp;
+  tmp << value;
+  std::string result = tmp.str();
+  out.write(result.c_str(),
+            (std::min)(ntrunc, static_cast<int>(result.size())));
+}
+#define TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(type)                       \
+  inline void formatTruncated(std::ostream &out, type *value, int ntrunc) { \
+    std::streamsize len = 0;                                                \
+    while (len < ntrunc && value[len] != 0) ++len;                          \
+    out.write(value, len);                                                  \
+  }
+// Overload for const char* and char*. Could overload for signed & unsigned
+// char too, but these are technically unneeded for printf compatibility.
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(const char)
+TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR(char)
+#undef TINYFORMAT_DEFINE_FORMAT_TRUNCATED_CSTR
+
+}  // namespace detail
+
+//------------------------------------------------------------------------------
+// Variable formatting functions. May be overridden for user-defined types if
+// desired.
+
+/// Format a value into a stream, delegating to operator<< by default.
+///
+/// Users may override this for their own types. When this function is called,
+/// the stream flags will have been modified according to the format string.
+/// The format specification is provided in the range [fmtBegin, fmtEnd).
For +/// truncating conversions, ntrunc is set to the desired maximum number of +/// characters, for example "%.7s" calls formatValue with ntrunc = 7. +/// +/// By default, formatValue() uses the usual stream insertion operator +/// operator<< to format the type T, with special cases for the %c and %p +/// conversions. +template +inline void formatValue(std::ostream &out, + const char * /*fmtBegin*/, + const char *fmtEnd, + int ntrunc, + const T &value) { + // The mess here is to support the %c and %p conversions: if these + // conversions are active we try to convert the type to a char or const + // void* respectively and format that instead of the value itself. For the + // %p conversion it's important to avoid dereferencing the pointer, which + // could otherwise lead to a crash when printing a dangling (const char*). + const bool canConvertToChar = detail::is_convertible::value; + const bool canConvertToVoidPtr = + detail::is_convertible::value; + if (canConvertToChar && *(fmtEnd - 1) == 'c') + detail::formatValueAsType::invoke(out, value); + else if (canConvertToVoidPtr && *(fmtEnd - 1) == 'p') + detail::formatValueAsType::invoke(out, value); + else if (ntrunc >= 0) { + // Take care not to overread C strings in truncating conversions like + // "%.4s" where at most 4 characters may be read. + detail::formatTruncated(out, value, ntrunc); + } else + out << value; +} + +// Overloaded version for char types to support printing as an integer +#define TINYFORMAT_DEFINE_FORMATVALUE_CHAR(charType) \ + inline void formatValue(std::ostream &out, \ + const char * /*fmtBegin*/, \ + const char *fmtEnd, \ + int /**/, \ + charType value) { \ + switch (*(fmtEnd - 1)) { \ + case 'u': \ + case 'd': \ + case 'i': \ + case 'o': \ + case 'X': \ + case 'x': \ + out << static_cast(value); \ + break; \ + default: \ + out << value; \ + break; \ + } \ + } +// per 3.9.1: char, signed char and unsigned char are all distinct types +TINYFORMAT_DEFINE_FORMATVALUE_CHAR(char) +TINYFORMAT_DEFINE_FORMATVALUE_CHAR(signed char) +TINYFORMAT_DEFINE_FORMATVALUE_CHAR(unsigned char) +#undef TINYFORMAT_DEFINE_FORMATVALUE_CHAR + +//------------------------------------------------------------------------------ +// Tools for emulating variadic templates in C++98. The basic idea here is +// stolen from the boost preprocessor metaprogramming library and cut down to +// be just general enough for what we need. + +#define TINYFORMAT_ARGTYPES(n) TINYFORMAT_ARGTYPES_##n +#define TINYFORMAT_VARARGS(n) TINYFORMAT_VARARGS_##n +#define TINYFORMAT_PASSARGS(n) TINYFORMAT_PASSARGS_##n +#define TINYFORMAT_PASSARGS_TAIL(n) TINYFORMAT_PASSARGS_TAIL_##n + +// To keep it as transparent as possible, the macros below have been generated +// using python via the excellent cog.py code generation script. This avoids +// the need for a bunch of complex (but more general) preprocessor tricks as +// used in boost.preprocessor. +// +// To rerun the code generation in place, use `cog.py -r tinyformat.h` +// (see http://nedbatchelder.com/code/cog). Alternatively you can just create +// extra versions by hand. 
+ +/*[[[cog +maxParams = 16 + +def makeCommaSepLists(lineTemplate, elemTemplate, startInd=1): + for j in range(startInd,maxParams+1): + list = ', '.join([elemTemplate % {'i':i} for i in range(startInd,j+1)]) + cog.outl(lineTemplate % {'j':j, 'list':list}) + +makeCommaSepLists('#define TINYFORMAT_ARGTYPES_%(j)d %(list)s', + 'class T%(i)d') + +cog.outl() +makeCommaSepLists('#define TINYFORMAT_VARARGS_%(j)d %(list)s', + 'const T%(i)d& v%(i)d') + +cog.outl() +makeCommaSepLists('#define TINYFORMAT_PASSARGS_%(j)d %(list)s', 'v%(i)d') + +cog.outl() +cog.outl('#define TINYFORMAT_PASSARGS_TAIL_1') +makeCommaSepLists('#define TINYFORMAT_PASSARGS_TAIL_%(j)d , %(list)s', + 'v%(i)d', startInd = 2) + +cog.outl() +cog.outl('#define TINYFORMAT_FOREACH_ARGNUM(m) \\\n ' + + ' '.join(['m(%d)' % (j,) for j in range(1,maxParams+1)])) +]]]*/ +#define TINYFORMAT_ARGTYPES_1 class T1 +#define TINYFORMAT_ARGTYPES_2 class T1, class T2 +#define TINYFORMAT_ARGTYPES_3 class T1, class T2, class T3 +#define TINYFORMAT_ARGTYPES_4 class T1, class T2, class T3, class T4 +#define TINYFORMAT_ARGTYPES_5 class T1, class T2, class T3, class T4, class T5 +#define TINYFORMAT_ARGTYPES_6 \ + class T1, class T2, class T3, class T4, class T5, class T6 +#define TINYFORMAT_ARGTYPES_7 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7 +#define TINYFORMAT_ARGTYPES_8 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8 +#define TINYFORMAT_ARGTYPES_9 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9 +#define TINYFORMAT_ARGTYPES_10 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10 +#define TINYFORMAT_ARGTYPES_11 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10, class T11 +#define TINYFORMAT_ARGTYPES_12 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10, class T11, class T12 +#define TINYFORMAT_ARGTYPES_13 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10, class T11, class T12, class T13 +#define TINYFORMAT_ARGTYPES_14 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10, class T11, class T12, class T13, \ + class T14 +#define TINYFORMAT_ARGTYPES_15 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10, class T11, class T12, class T13, \ + class T14, class T15 +#define TINYFORMAT_ARGTYPES_16 \ + class T1, class T2, class T3, class T4, class T5, class T6, class T7, \ + class T8, class T9, class T10, class T11, class T12, class T13, \ + class T14, class T15, class T16 + +#define TINYFORMAT_VARARGS_1 const T1 &v1 +#define TINYFORMAT_VARARGS_2 const T1 &v1, const T2 &v2 +#define TINYFORMAT_VARARGS_3 const T1 &v1, const T2 &v2, const T3 &v3 +#define TINYFORMAT_VARARGS_4 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4 +#define TINYFORMAT_VARARGS_5 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5 +#define TINYFORMAT_VARARGS_6 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6 +#define TINYFORMAT_VARARGS_7 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7 +#define TINYFORMAT_VARARGS_8 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, 
const T8 &v8 +#define TINYFORMAT_VARARGS_9 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9 +#define TINYFORMAT_VARARGS_10 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10 +#define TINYFORMAT_VARARGS_11 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \ + const T11 &v11 +#define TINYFORMAT_VARARGS_12 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \ + const T11 &v11, const T12 &v12 +#define TINYFORMAT_VARARGS_13 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \ + const T11 &v11, const T12 &v12, const T13 &v13 +#define TINYFORMAT_VARARGS_14 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \ + const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14 +#define TINYFORMAT_VARARGS_15 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \ + const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14, \ + const T15 &v15 +#define TINYFORMAT_VARARGS_16 \ + const T1 &v1, const T2 &v2, const T3 &v3, const T4 &v4, const T5 &v5, \ + const T6 &v6, const T7 &v7, const T8 &v8, const T9 &v9, const T10 &v10, \ + const T11 &v11, const T12 &v12, const T13 &v13, const T14 &v14, \ + const T15 &v15, const T16 &v16 + +#define TINYFORMAT_PASSARGS_1 v1 +#define TINYFORMAT_PASSARGS_2 v1, v2 +#define TINYFORMAT_PASSARGS_3 v1, v2, v3 +#define TINYFORMAT_PASSARGS_4 v1, v2, v3, v4 +#define TINYFORMAT_PASSARGS_5 v1, v2, v3, v4, v5 +#define TINYFORMAT_PASSARGS_6 v1, v2, v3, v4, v5, v6 +#define TINYFORMAT_PASSARGS_7 v1, v2, v3, v4, v5, v6, v7 +#define TINYFORMAT_PASSARGS_8 v1, v2, v3, v4, v5, v6, v7, v8 +#define TINYFORMAT_PASSARGS_9 v1, v2, v3, v4, v5, v6, v7, v8, v9 +#define TINYFORMAT_PASSARGS_10 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10 +#define TINYFORMAT_PASSARGS_11 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 +#define TINYFORMAT_PASSARGS_12 v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12 +#define TINYFORMAT_PASSARGS_13 \ + v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13 +#define TINYFORMAT_PASSARGS_14 \ + v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14 +#define TINYFORMAT_PASSARGS_15 \ + v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15 +#define TINYFORMAT_PASSARGS_16 \ + v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16 + +#define TINYFORMAT_PASSARGS_TAIL_1 +#define TINYFORMAT_PASSARGS_TAIL_2 , v2 +#define TINYFORMAT_PASSARGS_TAIL_3 , v2, v3 +#define TINYFORMAT_PASSARGS_TAIL_4 , v2, v3, v4 +#define TINYFORMAT_PASSARGS_TAIL_5 , v2, v3, v4, v5 +#define TINYFORMAT_PASSARGS_TAIL_6 , v2, v3, v4, v5, v6 +#define TINYFORMAT_PASSARGS_TAIL_7 , v2, v3, v4, v5, v6, v7 +#define TINYFORMAT_PASSARGS_TAIL_8 , v2, v3, v4, v5, v6, v7, v8 +#define TINYFORMAT_PASSARGS_TAIL_9 , v2, v3, v4, v5, v6, v7, v8, v9 +#define TINYFORMAT_PASSARGS_TAIL_10 , v2, v3, v4, v5, v6, v7, v8, v9, v10 +#define TINYFORMAT_PASSARGS_TAIL_11 , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11 +#define TINYFORMAT_PASSARGS_TAIL_12 \ + 
, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+#define TINYFORMAT_PASSARGS_TAIL_13 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13
+#define TINYFORMAT_PASSARGS_TAIL_14 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14
+#define TINYFORMAT_PASSARGS_TAIL_15 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
+#define TINYFORMAT_PASSARGS_TAIL_16 \
+  , v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16
+
+#define TINYFORMAT_FOREACH_ARGNUM(m) \
+  m(1) m(2) m(3) m(4) m(5) m(6) m(7) m(8) m(9) m(10) m(11) m(12) m(13) m(14) \
+      m(15) m(16)
+//[[[end]]]
+
+namespace detail {
+
+// Type-opaque holder for an argument to format(), with associated actions on
+// the type held as explicit function pointers. This allows FormatArgs for
+// each argument to be allocated as a homogeneous array inside FormatList,
+// whereas a naive implementation based on inheritance does not.
+class FormatArg {
+public:
+  FormatArg() {}
+
+  template <typename T>
+  FormatArg(const T &value)
+      : m_value(static_cast<const void *>(&value)),
+        m_formatImpl(&formatImpl<T>),
+        m_toIntImpl(&toIntImpl<T>) {}
+
+  void format(std::ostream &out,
+              const char *fmtBegin,
+              const char *fmtEnd,
+              int ntrunc) const {
+    m_formatImpl(out, fmtBegin, fmtEnd, ntrunc, m_value);
+  }
+
+  int toInt() const { return m_toIntImpl(m_value); }
+
+private:
+  template <typename T>
+  static void formatImpl(std::ostream &out,
+                         const char *fmtBegin,
+                         const char *fmtEnd,
+                         int ntrunc,
+                         const void *value) {
+    formatValue(out, fmtBegin, fmtEnd, ntrunc, *static_cast<const T *>(value));
+  }
+
+  template <typename T>
+  static int toIntImpl(const void *value) {
+    return convertToInt<T>::invoke(*static_cast<const T *>(value));
+  }
+
+  const void *m_value;
+  void (*m_formatImpl)(std::ostream &out,
+                       const char *fmtBegin,
+                       const char *fmtEnd,
+                       int ntrunc,
+                       const void *value);
+  int (*m_toIntImpl)(const void *value);
+};
+
+// Parse and return an integer from the string c, as atoi()
+// On return, c is set to one past the end of the integer.
+inline int parseIntAndAdvance(const char *&c) {
+  int i = 0;
+  for (; *c >= '0' && *c <= '9'; ++c) i = 10 * i + (*c - '0');
+  return i;
+}
+
+// Print literal part of format string and return next format spec
+// position.
+//
+// Skips over any occurrences of '%%', printing a literal '%' to the
+// output. The position of the first % character of the next
+// nontrivial format spec is returned, or the end of string.
+inline const char *printFormatStringLiteral(std::ostream &out,
+                                            const char *fmt) {
+  const char *c = fmt;
+  for (;; ++c) {
+    switch (*c) {
+      case '\0':
+        out.write(fmt, c - fmt);
+        return c;
+      case '%':
+        out.write(fmt, c - fmt);
+        if (*(c + 1) != '%') return c;
+        // for "%%", tack trailing % onto next literal section.
+        fmt = ++c;
+        break;
+      default:
+        break;
+    }
+  }
+}
+
+// Parse a format string and set the stream state accordingly.
+//
+// The format mini-language recognized here is meant to be the one from C99,
+// with the form "%[flags][width][.precision][length]type".
+//
+// Formatting options which can't be natively represented using the ostream
+// state are returned in spacePadPositive (for space padded positive numbers)
+// and ntrunc (for truncating conversions). argIndex is incremented if
+// necessary to pull out variable width and precision. The function returns a
+// pointer to the character after the end of the current format spec.
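+//
+// Worked example (editor's note, not in the original tinyformat source):
+// for the spec "%+8.3f", the parser below sets std::ios::showpos (flag '+'),
+// out.width(8), out.precision(3), and std::ios::fixed (type 'f'), so the
+// stream renders 3.14159 as "  +3.142".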
+inline const char *streamStateFromFormat(std::ostream &out, + bool &spacePadPositive, + int &ntrunc, + const char *fmtStart, + const detail::FormatArg *formatters, + int &argIndex, + int numFormatters) { + if (*fmtStart != '%') { + TINYFORMAT_ERROR( + "tinyformat: Not enough conversion specifiers in format string"); + return fmtStart; + } + // Reset stream state to defaults. + out.width(0); + out.precision(6); + out.fill(' '); + // Reset most flags; ignore irrelevant unitbuf & skipws. + out.unsetf(std::ios::adjustfield | std::ios::basefield | + std::ios::floatfield | std::ios::showbase | std::ios::boolalpha | + std::ios::showpoint | std::ios::showpos | std::ios::uppercase); + bool precisionSet = false; + bool widthSet = false; + int widthExtra = 0; + const char *c = fmtStart + 1; + // 1) Parse flags + for (;; ++c) { + switch (*c) { + case '#': + out.setf(std::ios::showpoint | std::ios::showbase); + continue; + case '0': + // overridden by left alignment ('-' flag) + if (!(out.flags() & std::ios::left)) { + // Use internal padding so that numeric values are + // formatted correctly, eg -00010 rather than 000-10 + out.fill('0'); + out.setf(std::ios::internal, std::ios::adjustfield); + } + continue; + case '-': + out.fill(' '); + out.setf(std::ios::left, std::ios::adjustfield); + continue; + case ' ': + // overridden by show positive sign, '+' flag. + if (!(out.flags() & std::ios::showpos)) spacePadPositive = true; + continue; + case '+': + out.setf(std::ios::showpos); + spacePadPositive = false; + widthExtra = 1; + continue; + default: + break; + } + break; + } + // 2) Parse width + if (*c >= '0' && *c <= '9') { + widthSet = true; + out.width(parseIntAndAdvance(c)); + } + if (*c == '*') { + widthSet = true; + int width = 0; + if (argIndex < numFormatters) + width = formatters[argIndex++].toInt(); + else + TINYFORMAT_ERROR( + "tinyformat: Not enough arguments to read variable width"); + if (width < 0) { + // negative widths correspond to '-' flag set + out.fill(' '); + out.setf(std::ios::left, std::ios::adjustfield); + width = -width; + } + out.width(width); + ++c; + } + // 3) Parse precision + if (*c == '.') { + ++c; + int precision = 0; + if (*c == '*') { + ++c; + if (argIndex < numFormatters) + precision = formatters[argIndex++].toInt(); + else + TINYFORMAT_ERROR( + "tinyformat: Not enough arguments to read variable precision"); + } else { + if (*c >= '0' && *c <= '9') + precision = parseIntAndAdvance(c); + else if (*c == '-') // negative precisions ignored, treated as zero. + parseIntAndAdvance(++c); + } + out.precision(precision); + precisionSet = true; + } + // 4) Ignore any C99 length modifier + while (*c == 'l' || *c == 'h' || *c == 'L' || *c == 'j' || *c == 'z' || + *c == 't') + ++c; + // 5) We're up to the conversion specifier character. + // Set stream flags based on conversion specifier (thanks to the + // boost::format class for forging the way here). 
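+  // (Editor's note: e.g. 'd' selects std::ios::dec, 'o' oct, 'x'/'p' hex,
+  // 'e' scientific and 'f' fixed; the uppercase variants 'X', 'E', 'F' and
+  // 'G' first set std::ios::uppercase and then fall through to the
+  // corresponding lowercase handling below.)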
+  bool intConversion = false;
+  switch (*c) {
+    case 'u':
+    case 'd':
+    case 'i':
+      out.setf(std::ios::dec, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'o':
+      out.setf(std::ios::oct, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'X':
+      out.setf(std::ios::uppercase);
+    case 'x':
+    case 'p':
+      out.setf(std::ios::hex, std::ios::basefield);
+      intConversion = true;
+      break;
+    case 'E':
+      out.setf(std::ios::uppercase);
+    case 'e':
+      out.setf(std::ios::scientific, std::ios::floatfield);
+      out.setf(std::ios::dec, std::ios::basefield);
+      break;
+    case 'F':
+      out.setf(std::ios::uppercase);
+    case 'f':
+      out.setf(std::ios::fixed, std::ios::floatfield);
+      break;
+    case 'G':
+      out.setf(std::ios::uppercase);
+    case 'g':
+      out.setf(std::ios::dec, std::ios::basefield);
+      // As in boost::format, let stream decide float format.
+      out.flags(out.flags() & ~std::ios::floatfield);
+      break;
+    case 'a':
+    case 'A':
+      TINYFORMAT_ERROR(
+          "tinyformat: the %a and %A conversion specs "
+          "are not supported");
+      break;
+    case 'c':
+      // Handled as special case inside formatValue()
+      break;
+    case 's':
+      if (precisionSet) ntrunc = static_cast<int>(out.precision());
+      // Make %s print booleans as "true" and "false"
+      out.setf(std::ios::boolalpha);
+      break;
+    case 'n':
+      // Not supported - will cause problems!
+      TINYFORMAT_ERROR("tinyformat: %n conversion spec not supported");
+      break;
+    case '\0':
+      TINYFORMAT_ERROR(
+          "tinyformat: Conversion spec incorrectly "
+          "terminated by end of string");
+      return c;
+    default:
+      break;
+  }
+  if (intConversion && precisionSet && !widthSet) {
+    // "precision" for integers gives the minimum number of digits (to be
+    // padded with zeros on the left). This isn't really supported by the
+    // iostreams, but we can approximately simulate it with the width if
+    // the width isn't otherwise used.
+    out.width(out.precision() + widthExtra);
+    out.setf(std::ios::internal, std::ios::adjustfield);
+    out.fill('0');
+  }
+  return c + 1;
+}
+
+//------------------------------------------------------------------------------
+inline void formatImpl(std::ostream &out,
+                       const char *fmt,
+                       const detail::FormatArg *formatters,
+                       int numFormatters) {
+  // Saved stream state
+  std::streamsize origWidth = out.width();
+  std::streamsize origPrecision = out.precision();
+  std::ios::fmtflags origFlags = out.flags();
+  char origFill = out.fill();
+
+  for (int argIndex = 0; argIndex < numFormatters; ++argIndex) {
+    // Parse the format string
+    fmt = printFormatStringLiteral(out, fmt);
+    bool spacePadPositive = false;
+    int ntrunc = -1;
+    const char *fmtEnd = streamStateFromFormat(out,
+                                               spacePadPositive,
+                                               ntrunc,
+                                               fmt,
+                                               formatters,
+                                               argIndex,
+                                               numFormatters);
+    if (argIndex >= numFormatters) {
+      // Check args remain after reading any variable width/precision
+      TINYFORMAT_ERROR("tinyformat: Not enough format arguments");
+      return;
+    }
+    const FormatArg &arg = formatters[argIndex];
+    // Format the arg into the stream.
+    if (!spacePadPositive)
+      arg.format(out, fmt, fmtEnd, ntrunc);
+    else {
+      // The following is a special case with no direct correspondence
+      // between stream formatting and the printf() behaviour. Simulate
+      // it crudely by formatting into a temporary string stream and
+      // munging the resulting string.
+      std::ostringstream tmpStream;
+      tmpStream.copyfmt(out);
+      tmpStream.setf(std::ios::showpos);
+      arg.format(tmpStream, fmt, fmtEnd, ntrunc);
+      std::string result = tmpStream.str();  // allocates... yuck.
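+      // (Editor's note: this realizes the ' ' flag, e.g. "% d" with 42:
+      // showpos renders "+42" into tmpStream, and the loop below rewrites
+      // the '+' to ' ', giving " 42".)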
+      for (size_t i = 0, iend = result.size(); i < iend; ++i)
+        if (result[i] == '+') result[i] = ' ';
+      out << result;
+    }
+    fmt = fmtEnd;
+  }
+
+  // Print remaining part of format string.
+  fmt = printFormatStringLiteral(out, fmt);
+  if (*fmt != '\0')
+    TINYFORMAT_ERROR(
+        "tinyformat: Too many conversion specifiers in format string");
+
+  // Restore stream state
+  out.width(origWidth);
+  out.precision(origPrecision);
+  out.flags(origFlags);
+  out.fill(origFill);
+}
+
+}  // namespace detail
+
+/// List of template arguments to format(), held in a type-opaque way.
+///
+/// A const reference to FormatList (typedef'd as FormatListRef) may be
+/// conveniently used to pass arguments to non-template functions: All type
+/// information has been stripped from the arguments, leaving just enough of a
+/// common interface to perform formatting as required.
+class FormatList {
+public:
+  FormatList(detail::FormatArg *formatters, int N)
+      : m_formatters(formatters), m_N(N) {}
+
+  friend void vformat(std::ostream &out,
+                      const char *fmt,
+                      const FormatList &list);
+
+private:
+  const detail::FormatArg *m_formatters;
+  int m_N;
+};
+
+/// Reference to type-opaque format list for passing to vformat()
+typedef const FormatList &FormatListRef;
+
+namespace detail {
+
+// Format list subclass with fixed storage to avoid dynamic allocation
+template <int N>
+class FormatListN : public FormatList {
+public:
+  template <typename... Args>
+  FormatListN(const Args &... args)
+      : FormatList(&m_formatterStore[0], N),
+        m_formatterStore{FormatArg(args)...} {
+    static_assert(sizeof...(args) == N, "Number of args must be N");
+  }
+
+private:
+  FormatArg m_formatterStore[N];
+};
+
+// Special 0-arg version - MSVC says zero-sized C array in struct is nonstandard
+template <>
+class FormatListN<0> : public FormatList {
+public:
+  FormatListN() : FormatList(0, 0) {}
+};
+
+}  // namespace detail
+
+//------------------------------------------------------------------------------
+// Primary API functions
+
+/// Make type-agnostic format list from list of template arguments.
+///
+/// The exact return type of this function is an implementation detail and
+/// shouldn't be relied upon. Instead it should be stored as a FormatListRef:
+///
+///   FormatListRef formatList = makeFormatList( /*...*/ );
+template <typename... Args>
+detail::FormatListN<sizeof...(Args)> makeFormatList(const Args &... args) {
+  return detail::FormatListN<sizeof...(Args)>(args...);
+}
+
+/// Format list of arguments to the stream according to the given format
+/// string.
+///
+/// The name vformat() is chosen for the semantic similarity to vprintf(): the
+/// list of format arguments is held in a single function argument.
+inline void vformat(std::ostream &out, const char *fmt, FormatListRef list) {
+  detail::formatImpl(out, fmt, list.m_formatters, list.m_N);
+}
+
+/// Format list of arguments to the stream according to given format string.
+template <typename... Args>
+void format(std::ostream &out, const char *fmt, const Args &... args) {
+  vformat(out, fmt, makeFormatList(args...));
+}
+
+/// Format list of arguments according to the given format string and return
+/// the result as a string.
+template <typename... Args>
+std::string format(const char *fmt, const Args &... args) {
+  std::ostringstream oss;
+  format(oss, fmt, args...);
+  return oss.str();
+}
+
+/// Format list of arguments to std::cout, according to the given format string
+template <typename... Args>
+void printf(const char *fmt, const Args &... args) {
+  format(std::cout, fmt, args...);
+}
+
+template <typename... Args>
+void printfln(const char *fmt, const Args &... args) {
+  format(std::cout, fmt, args...);
+  std::cout << '\n';
+}
+
+}  // namespace tinyformat
+}  // namespace string
+}  // namespace paddle
diff --git a/paddle/strings/CMakeLists.txt b/paddle/strings/CMakeLists.txt
deleted file mode 100644
index 4e55eecd484c0e218ecd51bbd19b3eb4f6f92a25..0000000000000000000000000000000000000000
--- a/paddle/strings/CMakeLists.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-cc_library(stringpiece SRCS stringpiece.cc)
-cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags)
diff --git a/paddle/strings/stringpiece.cc b/paddle/strings/stringpiece.cc
deleted file mode 100644
index 415b3558d5dfffde26275bcb16ea3922424ca9f3..0000000000000000000000000000000000000000
--- a/paddle/strings/stringpiece.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-/*
-  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-  Licensed under the Apache License, Version 2.0 (the "License");
-  you may not use this file except in compliance with the License.
-  You may obtain a copy of the License at
-
-  http://www.apache.org/licenses/LICENSE-2.0
-
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
-*/
-
-#include "paddle/strings/stringpiece.h"
-
-#include <string.h>
-
-#include <algorithm>
-#include <iosfwd>
-#include <stdexcept>
-
-namespace paddle {
-
-StringPiece::StringPiece() : data_(NULL), size_(0) {}
-
-StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) {
-  if (d == NULL && n != 0)
-    throw std::invalid_argument(
-        "StringPiece requires len to be 0 for NULL data");
-}
-
-StringPiece::StringPiece(const char* s) : data_(s) {
-  size_ = (s == NULL) ? 0 : strlen(s);
-}
-
-StringPiece::StringPiece(const std::string& s)
-    : data_(s.data()), size_(s.size()) {}
-
-char StringPiece::operator[](size_t n) const {
-  if (n >= len())
-    throw std::invalid_argument("index out of StringPiece length");
-  return data_[n];
-}
-
-int Compare(StringPiece a, StringPiece b) {
-  const size_t min_len = (a.len() < b.len()) ? a.len() : b.len();
-  int r = memcmp(a.data(), b.data(), min_len);
-  if (r == 0) {
-    if (a.len() < b.len())
-      return -1;
-    else if (a.len() > b.len())
-      return 1;
-  }
-  return r;
-}
-
-bool operator==(StringPiece x, StringPiece y) {
-  return ((x.len() == y.len()) &&
-          (x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0));
-}
-
-bool operator!=(StringPiece x, StringPiece y) { return !(x == y); }
-
-bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; }
-bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; }
-
-bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; }
-bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; }
-
-bool HasPrefix(StringPiece s, StringPiece x) {
-  return ((s.len() >= x.len()) && (memcmp(s.data(), x.data(), x.len()) == 0));
-}
-
-bool HasSuffix(StringPiece s, StringPiece x) {
-  return ((s.len() >= x.len()) &&
-          (memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0));
-}
-
-StringPiece SkipPrefix(StringPiece s, size_t n) {
-  if (n > s.len())
-    throw std::invalid_argument("Skip distance larger than StringPiece length");
-  return StringPiece(s.data() + n, s.len() - n);
-}
-
-StringPiece SkipSuffix(StringPiece s, size_t n) {
-  if (n > s.len())
-    throw std::invalid_argument("Skip distance larger than StringPiece length");
-  return StringPiece(s.data(), s.len() - n);
-}
-
-StringPiece TrimPrefix(StringPiece s, StringPiece x) {
-  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;
-}
-
-StringPiece TrimSuffix(StringPiece s, StringPiece x) {
-  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;
-}
-
-bool Contains(StringPiece s, StringPiece sub) {
-  return std::search(s.begin(), s.end(), sub.begin(), sub.end()) != s.end();
-}
-
-size_t Index(StringPiece s, StringPiece sub) {
-  auto e = std::search(s.begin(), s.end(), sub.begin(), sub.end());
-  return e != s.end() ? e - s.data() : StringPiece::npos;
-}
-
-size_t Find(StringPiece s, char c, size_t pos) {
-  if (pos >= s.len()) {
-    return StringPiece::npos;
-  }
-  const char* result =
-      reinterpret_cast<const char*>(memchr(s.data() + pos, c, s.len() - pos));
-  return result != nullptr ? 
result - s.data() : StringPiece::npos; -} - -size_t RFind(StringPiece s, char c, size_t pos) { - if (s.len() == 0) return StringPiece::npos; - for (const char* p = s.data() + std::min(pos, s.len() - 1); p >= s.data(); - p--) { - if (*p == c) { - return p - s.data(); - } - } - return StringPiece::npos; -} - -StringPiece SubStr(StringPiece s, size_t pos, size_t n) { - if (pos > s.len()) pos = s.len(); - if (n > s.len() - pos) n = s.len() - pos; - return StringPiece(s.data() + pos, n); -} - -std::ostream& operator<<(std::ostream& o, StringPiece piece) { - return o << piece.ToString(); -} - -} // namespace paddle diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index c47add04b081cbdf78b5a5d3bca3a71025b3d9ac..4245df5ab72bf0fd67261818b307f0babdb5d685 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -2,7 +2,7 @@ if(WITH_TESTING) add_library(paddle_test_main STATIC TestMain.cpp) - add_dependencies(paddle_test_main gen_proto_cpp) + add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) add_library(paddle_test_util STATIC TestUtil.cpp) - add_dependencies(paddle_test_util gen_proto_cpp) + add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) endif() diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index f34d53ae99f913a8aed8767b7271a538efce4778..6414c399561575c13074c41598184a78f84373ee 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -41,7 +41,8 @@ add_style_check_target(paddle_trainer_lib add_style_check_target(paddle_trainer_lib ${TRAINER_HEADERS}) add_dependencies(paddle_trainer_lib - gen_proto_cpp) + paddle_proto + ${external_project_dependencies}) macro(add_paddle_exe TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) diff --git a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf index d669fbc40cbc19df309d8bf20c942a9d8fc8f47d..741a0aa71df7866c180ab2513f28638117d0f1ca 100644 --- a/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_nest_rnn_gen.conf @@ -35,7 +35,7 @@ def outer_step(dummy_data): embedding_size=num_words)] def inner_step(dummy_memory, predict_word): - + # simplified RNN for testing with mixed_layer(size=num_words) as layer: layer += full_matrix_projection(input=predict_word, @@ -46,15 +46,15 @@ def outer_step(dummy_data): param_attr=ParamAttr(name="wordvec")) return out - + beam_gen = beam_search(name="rnn_gen", step=inner_step, input=gen_inputs, bos_id=0, eos_id=num_words-1, beam_size=2 if beam_flag else 1, - num_results_per_sample=2 if beam_flag else 1, - max_length=10) + num_results_per_sample=1, + max_length=10) return beam_gen beam_gen_concat = recurrent_group(name="rnn_gen_concat", diff --git a/paddle/trainer/tests/sample_trainer_rnn_gen.conf b/paddle/trainer/tests/sample_trainer_rnn_gen.conf index 2b337282f6285afb527e9bbf138d2e8184700d8d..58d27f15ae1c0a38885ee105a7963b6e7bd55906 100644 --- a/paddle/trainer/tests/sample_trainer_rnn_gen.conf +++ b/paddle/trainer/tests/sample_trainer_rnn_gen.conf @@ -33,7 +33,7 @@ gen_inputs = [StaticInput(input=dummy_data, size=2), embedding_size=num_words)] def step(dummy_memory, predict_word): - + # simplified RNN for testing with mixed_layer(size=num_words) as layer: layer += full_matrix_projection(input=predict_word, @@ -44,7 +44,7 @@ def step(dummy_memory, predict_word): param_attr=ParamAttr(name="wordvec")) return out - + beam_gen = beam_search(name="rnn_gen", 
step=step, input=gen_inputs, @@ -52,7 +52,7 @@ beam_gen = beam_search(name="rnn_gen", eos_id=num_words-1, beam_size=2 if beam_flag else 1, num_results_per_sample=2 if beam_flag else 1, - max_length=10) + max_length=10) seqtext_printer_evaluator(input=beam_gen, id_input=sent_id, diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt index af59951752d1799c95e293d3eae233e6aa26e5f3..7a4977935ede4878c07f4fb6ba0dd76bf50acd42 100644 --- a/paddle/utils/CMakeLists.txt +++ b/paddle/utils/CMakeLists.txt @@ -17,7 +17,7 @@ add_library(paddle_utils STATIC add_style_check_target(paddle_utils ${UTIL_HEADERS}) add_style_check_target(paddle_utils ${UTIL_SOURCES} ${UTIL_ARCH_SOURCES}) -add_dependencies(paddle_utils gen_proto_cpp) +add_dependencies(paddle_utils paddle_proto ${external_project_dependencies}) if(WITH_TESTING) add_subdirectory(tests) endif() diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h deleted file mode 100644 index cebca5a2a3766110b83231eb0705e48800a7bda6..0000000000000000000000000000000000000000 --- a/paddle/utils/Compiler.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -/** - * This header defines some useful attribute by each compiler. It is the - * abstract layer of compilers. - */ -#ifdef __GNUC__ -#define GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#else -#define GCC_VERSION -#endif - -/** - * __must_check macro. It make the function's return value must be used, - * otherwise it will raise a compile warning. And also Paddle treat all compile - * warnings as errors. - */ -#if GCC_VERSION >= 30400 -#define __must_check __attribute__((warn_unused_result)) -#else -#define __must_check -#endif diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h index 6992e856223494d6575ef3261d82cbdf4e375885..52a6df94979fd3d8d7d540ed0e3898bb3375d975 100644 --- a/paddle/utils/CustomStackTrace.h +++ b/paddle/utils/CustomStackTrace.h @@ -55,13 +55,17 @@ public: * Else, just set status to popping. */ void pop(const T& item) { - pushing() = false; auto& s = this->stack(); if (item == s.top()) { s.pop(); } } + /** + * @brief Indicate whether we are at forward or backward stage of computation + */ + void set_stage(bool isForward) { pushing() = isForward; } + /** * @brief clear current thread stack. */ diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h index cda1b5c37dada8d0c6c77fc2fb03bb614d5301b5..27ddaab3f003110a2684a871a2de17afb473d660 100644 --- a/paddle/utils/Error.h +++ b/paddle/utils/Error.h @@ -19,7 +19,21 @@ limitations under the License. */ #include #include #include -#include "Compiler.h" + +/** + * __must_check macro. It make the function's return value must be used, + * otherwise it will raise a compile warning. And also Paddle treat all compile + * warnings as errors. 
+ */ +#ifdef __GNUC__ +#if (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) >= 30400 +#define __must_check __attribute__((warn_unused_result)) +#else +#define __must_check +#endif +#else +#define __must_check +#endif namespace paddle { diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp index b5d9f93f1376048eabd726331006b0bb848bce11..c320074fbadab3e211ed72ce715d595c90673d6d 100644 --- a/paddle/utils/tests/test_CustomStackTrace.cpp +++ b/paddle/utils/tests/test_CustomStackTrace.cpp @@ -72,7 +72,6 @@ TEST(CustomStackTrace, normalTrain) { for (size_t i = 0; i < layerSize; ++i) { tracer.push("layer_" + paddle::str::to_string(i)); } - tracer.pop(""); for (size_t i = 0; i < layerSize; ++i) { tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i)); } diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt index c942620990765832f21c887d30f85a2d211a5f32..18584cafe7971bad281b498908c54780250791b7 100644 --- a/proto/CMakeLists.txt +++ b/proto/CMakeLists.txt @@ -1,43 +1,23 @@ -set(proto_filenames - DataConfig.proto - DataFormat.proto - ModelConfig.proto - ParameterConfig.proto - ParameterService.proto - TrainerConfig.proto - OptimizerConfig.proto - ParameterServerConfig.proto) +file(GLOB proto_filenames . *.proto) +include_directories(${CMAKE_CURRENT_BINARY_DIR}) +proto_library(paddle_proto SRCS ${proto_filenames}) set(PROTO_GEN) set(PROTO_GEN_PY) foreach(filename ${proto_filenames}) - get_filename_component(base_filename ${filename} NAME_WE) - set(CUR_PROTO_GEN - ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.h - ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.pb.cc) - set(PROTO_GEN - ${PROTO_GEN} - ${CUR_PROTO_GEN}) - add_custom_command(OUTPUT ${CUR_PROTO_GEN} - COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} - --cpp_out ${CMAKE_CURRENT_BINARY_DIR} - --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename} - DEPENDS ${filename} ${external_project_dependencies}) - + get_filename_component(ABS_FIL ${filename} ABSOLUTE) + get_filename_component(FIL_WE ${filename} NAME_WE) set(CUR_PROTO_GEN_PY - ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py) + ${PROJ_ROOT}/paddle/python/paddle/proto/${FIL_WE}_pb2.py) set(PROTO_GEN_PY - ${CUR_PROTO_GEN_PY} - ${PROTO_GEN_PY}) + ${CUR_PROTO_GEN_PY} + ${PROTO_GEN_PY}) add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY} - COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto - --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename} - DEPENDS ${filename} ${external_project_dependencies}) + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto" + "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL} + DEPENDS ${ABS_FIL} ${external_project_dependencies}) endforeach() -add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN}) add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY}) - -add_library(paddle_proto STATIC ${PROTO_GEN}) -target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR}) diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index 29270829bbc3af6990aaf03a5228ef7f6a892a5c..ebe4f5cbb569ff37a46eb44de6362a7df337fe38 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -489,6 +489,15 @@ message EvaluatorConfig { // Used by ClassificationErrorEvaluator // top # classification error optional int32 top_k = 13 [default = 1]; + + // Used by DetectionMAPEvaluator + optional double overlap_threshold = 14 [default = 0.5]; + + optional int32 background_id = 
15 [default = 0]; + + optional bool evaluate_difficult = 16 [default = false]; + + optional string ap_type = 17 [default = "11point"]; } message LinkConfig { diff --git a/proto/ParameterConfig.proto b/proto/ParameterConfig.proto index cbcd0af598df22c36c66767fdeb7add2aa49e87d..580d66324602df4c655dd2f1e1cd87159b5b346b 100644 --- a/proto/ParameterConfig.proto +++ b/proto/ParameterConfig.proto @@ -25,8 +25,10 @@ enum ParameterInitStrategy { } message ParameterUpdaterHookConfig { + // hook type such as 'pruning' required string type = 1; - optional string purning_mask_filename = 2; + // this represents the ratio of zero element to be set by the Parameter + optional double sparsity_ratio = 2 [default = 0.6]; } message ParameterConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index c11dc09a8b98bb8a3c8455f811b1435714e825d0..58e4902f57aa8018b820f48f6cbf659f1e5f5183 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -1280,20 +1280,23 @@ def parse_maxout(maxout, input_layer_name, maxout_conf): # Define an evaluator @config_func -def Evaluator( - name, - type, - inputs, - chunk_scheme=None, - num_chunk_types=None, - classification_threshold=None, - positive_label=None, - dict_file=None, - result_file=None, - num_results=None, - top_k=None, - delimited=None, - excluded_chunk_types=None, ): +def Evaluator(name, + type, + inputs, + chunk_scheme=None, + num_chunk_types=None, + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + top_k=None, + delimited=None, + excluded_chunk_types=None, + overlap_threshold=None, + background_id=None, + evaluate_difficult=None, + ap_type=None): evaluator = g_config.model_config.evaluators.add() evaluator.type = type evaluator.name = MakeLayerNameInSubmodel(name) @@ -1327,6 +1330,18 @@ def Evaluator( if excluded_chunk_types: evaluator.excluded_chunk_types.extend(excluded_chunk_types) + if overlap_threshold is not None: + evaluator.overlap_threshold = overlap_threshold + + if background_id is not None: + evaluator.background_id = background_id + + if evaluate_difficult is not None: + evaluator.evaluate_difficult = evaluate_difficult + + if ap_type is not None: + evaluator.ap_type = ap_type + class LayerBase(object): def __init__( @@ -3124,11 +3139,11 @@ def Layer(name, type, **xargs): @config_func def ParameterHook(type, **kwargs): if type == 'pruning': - mask_filename = kwargs.get('mask_filename', None) - assert mask_filename is not None hook = ParameterUpdaterHookConfig() hook.type = type - hook.purning_mask_filename = mask_filename + sparsity_ratio = kwargs.get('sparsity_ratio', None) + if sparsity_ratio is not None: + hook.sparsity_ratio = sparsity_ratio return hook else: return None @@ -3236,13 +3251,13 @@ def Parameter(name, if update_hooks is not None: if hasattr(update_hooks, '__call__'): - update_hooks = update_hooks(para.name) + update_hooks = update_hooks() if isinstance(update_hooks, list): for hook in update_hooks: para.update_hooks.extend([hook]) else: - para.update_hooks.extend(update_hooks) + para.update_hooks.extend([update_hooks]) g_parameter_map[name] = para if initializer is not None: diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 4100697c9c3770f1b748ea630d5f8193167fe7fc..9b9f979bb615f37ec1dc9baa154d28741b1400d5 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -14,7 
+14,8 @@ from paddle.trainer.config_parser import * __all__ = [ - 'ParamAttr', 'ExtraAttr', 'ParameterAttribute', 'ExtraLayerAttribute' + 'HookAttr', 'ParamAttr', 'ExtraAttr', 'ParameterAttribute', + 'ExtraLayerAttribute' ] @@ -55,6 +56,40 @@ def is_compatible_with(x, Type): return False +class HookAttribute(object): + """ + Hook Attribute object. As a member of the ParameterAttribute class, the hook is an auxiliary operation that occurs + during the training process of a layer with parameters, such as an img_conv layer or an fc layer. + + :param type: Hook type, currently supported types: + 'pruning' : the user specifies a sparsity_ratio before training starts, and the + network will prune the parameters based on the sparsity_ratio. + e.g. a Hook object can be defined as hk = HookAttribute('pruning', 0.6) + and used as paddle.layer.img_conv(input=img, filter_size=3, + num_channels=3, num_filters=64, + param_attr=ParameterAttribute(update_hooks=hk)) + The pruning details can be found at https://arxiv.org/pdf/1506.02626.pdf + :type type: string + + :param sparsity_ratio: Must be specified if the hook type is 'pruning'; + it is the ratio of parameter elements to be set to zero. + :type sparsity_ratio: float or None + + """ + + def __init__(self, type, sparsity_ratio=None): + self.type = type + self.sparsity_ratio = sparsity_ratio + if self.sparsity_ratio is not None: + assert is_compatible_with( + self.sparsity_ratio, + float), 'sparsity_ratio must be float type' + assert self.sparsity_ratio <= 1 and self.sparsity_ratio >= 0, 'sparsity_ratio must be a float in [0, 1]' + + def __call__(self): + return ParameterHook(self.type, sparsity_ratio=self.sparsity_ratio) + + class ParameterAttribute(object): """ Parameter Attributes object. To fine-tuning network training process, user @@ -114,6 +149,7 @@ class ParameterAttribute(object): momentum=None, gradient_clipping_threshold=None, sparse_update=False, + update_hooks=None, initializer=None): self.attr = {} @@ -169,6 +205,9 @@ class ParameterAttribute(object): if initializer is not None: self.attr['initializer'] = initializer + + if update_hooks: + self.attr['update_hooks'] = update_hooks + def set_default_parameter_name(self, name): """ Set default parameter name. If parameter not set, then will use default @@ -244,5 +283,6 @@ class ExtraLayerAttribute(object): return attr.attr +HookAttr = HookAttribute ParamAttr = ParameterAttribute ExtraAttr = ExtraLayerAttribute
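As a usage sketch (not part of the patch itself), the new HookAttribute is meant to be wired into a layer through ParameterAttribute; the layer names and sizes below are illustrative only, and a real trainer config would provide the surrounding parsing context:

.. code-block:: python

    # Illustrative trainer-config sketch; 'pixel' and the sizes are made up.
    from paddle.trainer_config_helpers.layers import data_layer, fc_layer
    from paddle.trainer_config_helpers.attrs import HookAttribute, ParameterAttribute

    img = data_layer(name='pixel', size=784)
    hk = HookAttribute('pruning', sparsity_ratio=0.6)  # zero out 60% of the weights
    fc = fc_layer(input=img,
                  size=128,
                  param_attr=ParameterAttribute(update_hooks=hk))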
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py index a5234f3e47f6caa4b365de593648e0ee5ad6e4a2..44d52edfa7bae49bea196eba9387391b171840d8 100644 --- a/python/paddle/trainer_config_helpers/evaluators.py +++ b/python/paddle/trainer_config_helpers/evaluators.py @@ -21,7 +21,8 @@ __all__ = [ "chunk_evaluator", "sum_evaluator", "column_sum_evaluator", "value_printer_evaluator", "gradient_printer_evaluator", "maxid_printer_evaluator", "maxframe_printer_evaluator", - "seqtext_printer_evaluator", "classification_error_printer_evaluator" + "seqtext_printer_evaluator", "classification_error_printer_evaluator", + "detection_map_evaluator" ] @@ -31,10 +32,11 @@ class EvaluatorAttribute(object): FOR_RANK = 1 << 2 FOR_PRINT = 1 << 3 FOR_UTILS = 1 << 4 + FOR_DETECTION = 1 << 5 KEYS = [ "for_classification", "for_regression", "for_rank", "for_print", - "for_utils" + "for_utils", "for_detection" ] @staticmethod @@ -57,22 +59,25 @@ def evaluator(*attrs): return impl -def evaluator_base( - input, - type, - label=None, - weight=None, - name=None, - chunk_scheme=None, - num_chunk_types=None, - classification_threshold=None, - positive_label=None, - dict_file=None, - result_file=None, - num_results=None, - delimited=None, - top_k=None, - excluded_chunk_types=None, ): +def evaluator_base(input, + type, + label=None, + weight=None, + name=None, + chunk_scheme=None, + num_chunk_types=None, + classification_threshold=None, + positive_label=None, + dict_file=None, + result_file=None, + num_results=None, + delimited=None, + top_k=None, + excluded_chunk_types=None, + overlap_threshold=None, + background_id=None, + evaluate_difficult=None, + ap_type=None): """ Evaluator will evaluate the network status while training/testing. @@ -107,6 +112,14 @@ def evaluator_base( :type weight: LayerOutput. :param top_k: number k in top-k error rate :type top_k: int + :param overlap_threshold: The overlap threshold used to filter detection results in detection tasks + :type overlap_threshold: float + :param background_id: Identifier of the background class + :type background_id: int + :param evaluate_difficult: Whether to evaluate difficult objects + :type evaluate_difficult: bool + :param ap_type: How to calculate the average precision + :type ap_type: str """ # inputs type assertions. assert classification_threshold is None or isinstance( @@ -136,7 +149,61 @@ def evaluator_base( delimited=delimited, num_results=num_results, top_k=top_k, - excluded_chunk_types=excluded_chunk_types, ) + excluded_chunk_types=excluded_chunk_types, + overlap_threshold=overlap_threshold, + background_id=background_id, + evaluate_difficult=evaluate_difficult, + ap_type=ap_type) + + +@evaluator(EvaluatorAttribute.FOR_DETECTION) +@wrap_name_default() +def detection_map_evaluator(input, + label, + overlap_threshold=0.5, + background_id=0, + evaluate_difficult=False, + ap_type="11point", + name=None): + """ + Detection mAP Evaluator. It will print the mean Average Precision (mAP) for detection. + + Based on the output of the detection_output layer, the detection mAP evaluator + counts the true positive and the false positive bboxes and integrates them to + get the mAP. + + The simple usage is: + + .. code-block:: python + + eval = detection_map_evaluator(input=det_output, label=lbl) + + :param input: Input layer.
+ :type input: LayerOutput + :param label: Label layer. + :type label: LayerOutput + :param overlap_threshold: The bbox overlap threshold of a true positive. + :type overlap_threshold: float + :param background_id: The background class index. + :type background_id: int + :param evaluate_difficult: Whether to evaluate difficult ground truths. + :type evaluate_difficult: bool + :param ap_type: How to calculate the average precision, '11point' by default. + :type ap_type: str + """ + if not isinstance(input, list): + input = [input] + + if label: + input.append(label) + + evaluator_base( + name=name, + type="detection_map", + input=input, + label=label, + overlap_threshold=overlap_threshold, + background_id=background_id, + evaluate_difficult=evaluate_difficult, + ap_type=ap_type) @evaluator(EvaluatorAttribute.FOR_CLASSIFICATION)
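For context, a hedged sketch of attaching the new detection mAP evaluator in a model config; det_out and lbl stand in for the output of a detection_output layer and its ground-truth label layer, neither of which is defined here:

.. code-block:: python

    from paddle.trainer_config_helpers.evaluators import detection_map_evaluator

    # det_out and lbl are hypothetical LayerOutput objects.
    detection_map_evaluator(
        input=det_out,
        label=lbl,
        overlap_threshold=0.5,  # IoU above which a detection is a true positive
        background_id=0,
        evaluate_difficult=False,
        ap_type="11point")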
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b8ce0373c0e9524518e42ad911fd2cd728facec4..a601d5c84ad222785e68b9fa81c51b1e120b4f29 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -1149,10 +1149,10 @@ def pooling_layer(input, @layer_support(DROPOUT) def lstmemory(input, name=None, + size=None, reverse=False, act=None, gate_act=None, - size=None, state_act=None, bias_attr=None, param_attr=None, @@ -1194,6 +1194,8 @@ def lstmemory(input, :param name: The lstmemory layer name. :type name: basestring + :param size: DEPRECATED. size of the lstm cell + :type size: int :param input: input layer name. :type input: LayerOutput :param reverse: is sequence process reversed or not. @@ -1220,15 +1222,15 @@ def lstmemory(input, assert state_act.support_hppl assert act.support_hppl assert input.size is not None and input.size % 4 == 0 + if size is not None: if input.size / 4 == size: plog = logger.warning else: plog = logger.fatal - - plog("NOTE: The lstmemory layer[%s]'s size is set by previous input " "layer. The lstm size should be equal with input layer size/4. The" " size which is set explicitly will be ignored." % name) + plog("size of lstmemory layer: %s is automatically set to " + "size of input layer / 4. The parameter size passed to " + "this layer is ignored." % (name)) Layer( name=name, @@ -1255,11 +1257,11 @@ def lstmemory(input, @wrap_name_default("gru") @layer_support(DROPOUT) def grumemory(input, + size=None, name=None, reverse=False, act=None, gate_act=None, - size=None, bias_attr=None, param_attr=None, layer_attr=None): @@ -1318,6 +1320,8 @@ def grumemory(input, :type name: None|basestring :param input: input layer. :type input: LayerOutput. + :param size: DEPRECATED. size of the gru cell + :type size: int :param reverse: Whether sequence process is reversed or not. :type reverse: bool :param act: activation type, TanhActivation by default. This activation @@ -1334,9 +1338,6 @@ def grumemory(input, :type param_attr: ParameterAttribute|None|False :param layer_attr: Extra Layer attribute :type layer_attr: ExtraLayerAttribute|None - :param size: Stub parameter of size, but actually not used. If set this size - will get a warning. - :type size: None :return: LayerOutput object. :rtype: LayerOutput """ @@ -1348,9 +1349,9 @@ def grumemory(input, plog = logger.warning else: plog = logger.fatal - plog("NOTE: the gru memory layer's size is set by previous input layer," " and should be input size / 3. Set size explicitly will be " "ignored.") + plog("size of grumemory layer: %s is automatically set to " + "size of input layer / 3. The parameter size passed to this " + "layer is ignored." % (name)) Layer( name=name, @@ -2524,8 +2525,8 @@ def img_cmrnorm_layer(input, @wrap_bias_attr_default() -@wrap_param_attr_default(default_factory=lambda _: ParamAttr(initial_mean=1.0, - initial_std=0.)) +@wrap_param_attr_default( + default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.)) @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") @layer_support(DROPOUT) @@ -3013,25 +3014,25 @@ def lstm_step_layer(input, bias_attr=None, layer_attr=None): """ - LSTM Step Layer. It used in recurrent_group. The lstm equations are shown - as follow. + LSTM Step Layer. This function is used only in recurrent_group. + The lstm equations are shown as follows. .. math:: - i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i) - f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f) - c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c) - o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o) h_t & = o_t tanh(c_t) The input of lstm step is :math:`Wx_t + Wh_{t-1}`, and user should use :code:`mixed_layer` and :code:`full_matrix_projection` to calculate these - input vector. + input vectors. The state of lstm step is :math:`c_{t-1}`. And lstm step layer will do @@ -3042,14 +3043,14 @@ def lstm_step_layer(input, ... - This layer contains two outputs. Default output is :math:`h_t`. The other - output is :math:`o_t`, which name is 'state' and can use + This layer has two outputs. The default output is :math:`h_t`. The other + output is :math:`o_t`, whose name is 'state', and one can use :code:`get_output_layer` to extract this output. :param name: Layer's name. :type name: basestring - :param size: Layer's size. NOTE: lstm layer's size, should be equal as - :code:`input.size/4`, and should be equal as + :param size: Layer's size. NOTE: lstm layer's size, should be equal to + :code:`input.size/4`, and should be equal to :code:`state.size`. :type size: int :param input: input layer. :math:`Wx_t + Wh_{t-1}` @@ -3839,7 +3840,8 @@ def classification_cost(input, weight=None, name=None, evaluator=classification_error_evaluator, - layer_attr=None): + layer_attr=None, + coeff=1.): """ classification cost Layer. @@ -3855,6 +3857,8 @@ def classification_cost(input, :param evaluator: Evaluator method. :param layer_attr: layer's extra attribute. :type layer_attr: ExtraLayerAttribute + :param coeff: The coefficient that scales the gradient in the backward pass. + :type coeff: float :return: LayerOutput object. :rtype: LayerOutput """ @@ -3868,6 +3872,7 @@ def classification_cost(input, name=name, type="multi-class-cross-entropy", inputs=ipts, + coeff=coeff, **ExtraLayerAttribute.to_kwargs(layer_attr)) def __add_evaluator__(e):
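A short sketch of the new coeff argument on classification_cost; prediction and lbl are hypothetical layers, and scaling a cost is mostly useful when several costs are combined:

.. code-block:: python

    from paddle.trainer_config_helpers.layers import classification_cost

    # coeff=0.5 halves this cost's gradient contribution in the backward pass.
    cost = classification_cost(input=prediction, label=lbl, coeff=0.5)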
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 1bf59ed4840ae69afc5bce49c86a08b60e9603ee..b77932ce5f09470329a97cc0a6273942a9155c6a 100755 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -614,6 +614,7 @@ def simple_lstm(input, @wrap_name_default('lstm_unit') def lstmemory_unit(input, + memory_boot=None, name=None, size=None, param_attr=None, @@ -626,9 +627,9 @@ def lstmemory_unit(input, lstm_layer_attr=None, get_output_layer_attr=None): """ - Define calculations that a LSTM unit performs in a single time step. - This function itself is not a recurrent layer, so that it can not be - directly applied to sequence input. This function is always used in + Define calculations that an LSTM unit performs during a single time step. + This function itself is not a recurrent layer, so it cannot be + directly used to process sequence inputs. This function is always used in recurrent_group (see layers.py for more details) to implement attention mechanism. @@ -638,13 +639,13 @@ def lstmemory_unit(input, .. math:: - i_t & = \\sigma(W_{xi}x_{t} + W_{hi}h_{t-1} + W_{ci}c_{t-1} + b_i) + i_t & = \\sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + W_{c_i}c_{t-1} + b_i) - f_t & = \\sigma(W_{xf}x_{t} + W_{hf}h_{t-1} + W_{cf}c_{t-1} + b_f) + f_t & = \\sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + W_{c_f}c_{t-1} + b_f) - c_t & = f_tc_{t-1} + i_t tanh (W_{xc}x_t+W_{hc}h_{t-1} + b_c) + c_t & = f_tc_{t-1} + i_t tanh (W_{x_c}x_t+W_{h_c}h_{t-1} + b_c) - o_t & = \\sigma(W_{xo}x_{t} + W_{ho}h_{t-1} + W_{co}c_t + b_o) + o_t & = \\sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + W_{c_o}c_t + b_o) h_t & = o_t tanh(c_t) @@ -661,6 +662,8 @@ def lstmemory_unit(input, :param input: input layer name. :type input: LayerOutput + :param memory_boot: the initialization state of the LSTM cell. + :type memory_boot: LayerOutput | None :param name: lstmemory unit name. :type name: basestring :param size: lstmemory unit size. @@ -692,7 +695,8 @@ def lstmemory_unit(input, assert input.size % 4 == 0 size = input.size / 4 out_mem = memory(name=name, size=size) - state_mem = memory(name="%s_state" % name, size=size) + state_mem = memory( + name="%s_state" % name, size=size, boot_layer=memory_boot) with mixed_layer( name="%s_input_recurrent" % name, @@ -726,6 +730,7 @@ def lstmemory_unit(input, def lstmemory_group(input, size=None, name=None, + memory_boot=None, reverse=False, param_attr=None, act=None, @@ -737,7 +742,7 @@ def lstmemory_group(input, lstm_layer_attr=None, get_output_layer_attr=None): """ - lstm_group is a recurrent layer group version of Long Short Term Memory. It + lstm_group is a recurrent_group version of Long Short Term Memory. It does exactly the same calculation as the lstmemory layer (see lstmemory in layers.py for the maths) does.
A promising benefit is that LSTM memory cell states, or hidden states in every time step are accessible to the @@ -748,8 +753,8 @@ def lstmemory_group(input, NOTE: In PaddlePaddle's implementation, the following input-to-hidden multiplications: - :math:`W_{xi}x_{t}` , :math:`W_{xf}x_{t}`, - :math:`W_{xc}x_t`, :math:`W_{xo}x_{t}` are not done in lstmemory_unit to + :math:`W_{x_i}x_{t}` , :math:`W_{x_f}x_{t}`, + :math:`W_{x_c}x_t`, :math:`W_{x_o}x_{t}` are not done in lstmemory_unit to speed up the calculations. Consequently, an additional mixed_layer with full_matrix_projection must be included before lstmemory_unit is called. @@ -765,10 +770,12 @@ def lstmemory_group(input, :param input: input layer name. :type input: LayerOutput - :param name: lstmemory group name. - :type name: basestring :param size: lstmemory group size. :type size: int + :param name: name of the lstmemory group. + :type name: basestring + :param memory_boot: the initialization state of the LSTM cell. + :type memory_boot: LayerOutput | None :param reverse: is lstm reversed :type reverse: bool :param param_attr: Parameter config, None if use default. @@ -798,6 +805,7 @@ def lstmemory_group(input, def __lstm_step__(ipt): return lstmemory_unit( input=ipt, + memory_boot=memory_boot, name=name, size=size, mixed_bias_attr=mixed_bias_attr, @@ -819,6 +827,7 @@ def lstmemory_group(input, @wrap_name_default('gru_unit') def gru_unit(input, + memory_boot=None, size=None, name=None, gru_bias_attr=None, @@ -829,8 +838,8 @@ def gru_unit(input, naive=False): """ Define calculations that a gated recurrent unit performs in a single time - step. This function itself is not a recurrent layer, so that it can not be - directly applied to sequence input. This function is almost always used in + step. This function itself is not a recurrent layer, so it cannot be + directly used to process sequence inputs. This function is always used in the recurrent_group (see layers.py for more details) to implement attention mechanism. @@ -838,6 +847,8 @@ def gru_unit(input, :param input: input layer name. :type input: LayerOutput + :param memory_boot: the initialization state of the GRU cell. + :type memory_boot: LayerOutput | None :param name: name of the gru group. :type name: basestring :param size: hidden size of the gru. @@ -856,7 +867,7 @@ def gru_unit(input, if size is None: size = input.size / 3 - out_mem = memory(name=name, size=size) + out_mem = memory(name=name, size=size, boot_layer=memory_boot) if naive: __step__ = gru_step_naive_layer @@ -878,6 +889,7 @@ def gru_unit(input, @wrap_name_default('gru_group') def gru_group(input, + memory_boot=None, size=None, name=None, reverse=False, @@ -888,7 +900,7 @@ def gru_group(input, gru_layer_attr=None, naive=False): """ - gru_group is a recurrent layer group version of Gated Recurrent Unit. It + gru_group is a recurrent_group version of Gated Recurrent Unit. It does exactly the same calculation as the grumemory layer does. A promising benefit is that gru hidden states are accessible to the user. This is especially useful in attention model. If you do not need to access @@ -908,6 +920,8 @@ def gru_group(input, :param input: input layer name. :type input: LayerOutput + :param memory_boot: the initialization state of the GRU cell. + :type memory_boot: LayerOutput | None :param name: name of the gru group. :type name: basestring :param size: hidden size of the gru.
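A sketch, under assumed layer names, of what the new memory_boot argument enables: seeding a recurrent group's cell state from another layer, for example an encoder's final state in a seq2seq model:

.. code-block:: python

    from paddle.trainer_config_helpers.networks import lstmemory_group

    # enc_last and dec_input are hypothetical LayerOutput objects; per the
    # docstring above, dec_input should be a mixed_layer projection whose
    # size is 4 * 256.
    decoder = lstmemory_group(
        input=dec_input,
        size=256,
        memory_boot=enc_last)  # cell state starts from enc_last instead of zeros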
@@ -929,6 +943,7 @@ def gru_group(input, def __gru_step__(ipt): return gru_unit( input=ipt, + memory_boot=memory_boot, name=name, size=size, gru_bias_attr=gru_bias_attr, @@ -1083,7 +1098,6 @@ def simple_gru2(input, return grumemory( name=name, - size=size, input=m, reverse=reverse, bias_attr=gru_bias_attr, @@ -1381,7 +1395,7 @@ def inputs(layers, *args): if len(args) != 0: layers.extend(args) - Inputs(*[l.name for l in layers]) + Inputs(* [l.name for l in layers]) def outputs(layers, *args): @@ -1424,7 +1438,7 @@ def outputs(layers, *args): assert len(layers) > 0 if HasInputsSet(): # input already set - Outputs(*[l.name for l in layers]) + Outputs(* [l.name for l in layers]) return # just return outputs. if len(layers) != 1: diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py index 32f78614e7f8abe7cffdc7a50a9fa77f1fc1a780..5d23894d735c463d469f842b875ecbec1dbaf476 100644 --- a/python/paddle/v2/attr.py +++ b/python/paddle/v2/attr.py @@ -17,10 +17,12 @@ import paddle.trainer_config_helpers.attrs __all__ = [ "Param", "Extra", + "Hook", ] Param = paddle.trainer_config_helpers.attrs.ParameterAttribute Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute +Hook = paddle.trainer_config_helpers.attrs.HookAttribute for each in paddle.trainer_config_helpers.attrs.__all__: globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each) diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py index 26252d5bbd77ddb70b4f03843679e4737e2f96d3..2e4beb6882789249db09705f3f4d6c5c19e492cd 100644 --- a/python/paddle/v2/dataset/__init__.py +++ b/python/paddle/v2/dataset/__init__.py @@ -25,8 +25,9 @@ import uci_housing import sentiment import wmt14 import mq2007 +import flowers __all__ = [ 'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment', - 'uci_housing', 'wmt14', 'mq2007' + 'uci_housing', 'wmt14', 'mq2007', 'flowers' ] diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index 81af0a8e66a44a3476206147684d81bcac1be372..f885b2834e8ad502b752c6fd53daf7ef1693433f 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -31,10 +31,10 @@ images per class. import cPickle import itertools import numpy -from common import download +import paddle.v2.dataset.common import tarfile -__all__ = ['train100', 'test100', 'train10', 'test10'] +__all__ = ['train100', 'test100', 'train10', 'test10', 'convert'] URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/' CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz' @@ -75,7 +75,8 @@ def train100(): :rtype: callable """ return reader_creator( - download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train') + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'train') def test100(): @@ -88,7 +89,9 @@ def test100(): :return: Test reader creator.
:rtype: callable """ - return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test') + return reader_creator( + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), + 'test') def train10(): @@ -102,7 +105,8 @@ def train10(): :rtype: callable """ return reader_creator( - download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch') + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'data_batch') def test10(): @@ -116,9 +120,20 @@ def test10(): :rtype: callable """ return reader_creator( - download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch') + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), + 'test_batch') def fetch(): - download(CIFAR10_URL, 'cifar', CIFAR10_MD5) - download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train100(), 10, "cifar_train100") + paddle.v2.dataset.common.convert(path, test100(), 10, "cifar_test100") + paddle.v2.dataset.common.convert(path, train10(), 10, "cifar_train10") + paddle.v2.dataset.common.convert(path, test10(), 10, "cifar_test10")
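Each dataset module now follows the same pattern: a convert(path) entry point that serializes its readers into RecordIO files under the given directory. A hedged usage sketch (the output path is arbitrary):

.. code-block:: python

    import paddle.v2.dataset.cifar as cifar

    # Writes cifar_train100, cifar_test100, cifar_train10 and cifar_test10
    # RecordIO files under /tmp/paddle_recordio.
    cifar.convert('/tmp/paddle_recordio')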
""" - word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)) - verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)) - label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)) + word_dict = load_dict( + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', + WORDDICT_MD5)) + verb_dict = load_dict( + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', + VERBDICT_MD5)) + label_dict = load_dict( + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', + TRGDICT_MD5)) return word_dict, verb_dict, label_dict @@ -192,7 +198,7 @@ def get_embedding(): """ Get the trained word vector based on Wikipedia corpus. """ - return download(EMB_URL, 'conll05st', EMB_MD5) + return paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) def test(): @@ -209,15 +215,23 @@ def test(): """ word_dict, verb_dict, label_dict = get_dict() reader = corpus_reader( - download(DATA_URL, 'conll05st', DATA_MD5), + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5), words_name='conll05st-release/test.wsj/words/test.wsj.words.gz', props_name='conll05st-release/test.wsj/props/test.wsj.props.gz') return reader_creator(reader, word_dict, verb_dict, label_dict) def fetch(): - download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) - download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) - download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) - download(EMB_URL, 'conll05st', EMB_MD5) - download(DATA_URL, 'conll05st', DATA_MD5) + paddle.v2.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) + paddle.v2.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) + paddle.v2.dataset.common.download(TRGDICT_URL, 'conll05st', TRGDICT_MD5) + paddle.v2.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) + paddle.v2.dataset.common.download(DATA_URL, 'conll05st', DATA_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, test(), 10, "conl105_train") + paddle.v2.dataset.common.convert(path, test(), 10, "conl105_test") diff --git a/python/paddle/v2/dataset/flowers.py b/python/paddle/v2/dataset/flowers.py index 07c13cf719ae0c864c23fef51f0bd7d47f265759..158cfe158c4f1c8d82d157301adcfbe0351c55df 100644 --- a/python/paddle/v2/dataset/flowers.py +++ b/python/paddle/v2/dataset/flowers.py @@ -13,18 +13,18 @@ # limitations under the License. """ This module will download dataset from -http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html +http://www.robots.ox.ac.uk/~vgg/data/flowers/102/index.html and parse train/test set intopaddle reader creators. -This set contains images of flowers belonging to 102 different categories. +This set contains images of flowers belonging to 102 different categories. The images were acquired by searching the web and taking pictures. There are a minimum of 40 images for each category. The database was used in: Nilsback, M-E. and Zisserman, A. Automated flower classification over a large - number of classes.Proceedings of the Indian Conference on Computer Vision, -Graphics and Image Processing (2008) + number of classes.Proceedings of the Indian Conference on Computer Vision, +Graphics and Image Processing (2008) http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}. 
""" @@ -34,9 +34,9 @@ from common import download import tarfile import scipy.io as scio from paddle.v2.image import * +from paddle.v2.reader import * import os import numpy as np -import paddle.v2 as paddle from multiprocessing import cpu_count __all__ = ['train', 'test', 'valid'] @@ -46,6 +46,12 @@ SETID_URL = 'http://www.robots.ox.ac.uk/~vgg/data/flowers/102/setid.mat' DATA_MD5 = '52808999861908f626f3c1f4e79d11fa' LABEL_MD5 = 'e0620be6f572b9609742df49c70aed4d' SETID_MD5 = 'a5357ecc9cb78c4bef273ce3793fc85c' +# In official 'readme', tstid is the flag of test data +# and trnid is the flag of train data. But test data is more than train data. +# So we exchange the train data and test data. +TRAIN_FLAG = 'tstid' +TEST_FLAG = 'trnid' +VALID_FLAG = 'valid' def default_mapper(sample): @@ -53,8 +59,8 @@ def default_mapper(sample): map image bytes data to type needed by model input layer ''' img, label = sample - img = paddle.image.load_image_bytes(img) - img = paddle.image.simple_transform(img, 256, 224, True) + img = load_image_bytes(img) + img = simple_transform(img, 256, 224, True) return img.flatten().astype('float32'), label @@ -63,22 +69,23 @@ def reader_creator(data_file, setid_file, dataset_name, mapper=default_mapper, - buffered_size=1024): + buffered_size=1024, + use_xmap=True): ''' - 1. read images from tar file and + 1. read images from tar file and merge images into batch files in 102flowers.tgz_batch/ 2. get a reader to read sample from batch file - - :param data_file: downloaded data file + + :param data_file: downloaded data file :type data_file: string - :param label_file: downloaded label file + :param label_file: downloaded label file :type label_file: string :param setid_file: downloaded setid file containing information about how to split dataset :type setid_file: string :param dataset_name: data set name (tstid|trnid|valid) :type dataset_name: string - :param mapper: a function to map image bytes data to type + :param mapper: a function to map image bytes data to type needed by model input layer :type mapper: callable :param buffered_size: the size of buffer used to process images @@ -105,15 +112,17 @@ def reader_creator(data_file, for sample, label in itertools.izip(data, batch['label']): yield sample, int(label) - return paddle.reader.xmap_readers(mapper, reader, - cpu_count(), buffered_size) + if use_xmap: + return xmap_readers(mapper, reader, cpu_count(), buffered_size) + else: + return map_readers(mapper, reader) -def train(mapper=default_mapper, buffered_size=1024): +def train(mapper=default_mapper, buffered_size=1024, use_xmap=True): ''' - Create flowers training set reader. - It returns a reader, each sample in the reader is - image pixels in [0, 1] and label in [1, 102] + Create flowers training set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] translated from original color image by steps: 1. resize to 256*256 2. random crop to 224*224 @@ -128,15 +137,15 @@ def train(mapper=default_mapper, buffered_size=1024): return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), 'trnid', mapper, - buffered_size) + download(SETID_URL, 'flowers', SETID_MD5), TRAIN_FLAG, mapper, + buffered_size, use_xmap) -def test(mapper=default_mapper, buffered_size=1024): +def test(mapper=default_mapper, buffered_size=1024, use_xmap=True): ''' - Create flowers test set reader. 
- It returns a reader, each sample in the reader is - image pixels in [0, 1] and label in [1, 102] + Create flowers test set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] translated from original color image by steps: 1. resize to 256*256 2. random crop to 224*224 @@ -151,15 +160,15 @@ def test(mapper=default_mapper, buffered_size=1024): return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), 'tstid', mapper, - buffered_size) + download(SETID_URL, 'flowers', SETID_MD5), TEST_FLAG, mapper, + buffered_size, use_xmap) -def valid(mapper=default_mapper, buffered_size=1024): +def valid(mapper=default_mapper, buffered_size=1024, use_xmap=True): ''' - Create flowers validation set reader. - It returns a reader, each sample in the reader is - image pixels in [0, 1] and label in [1, 102] + Create flowers validation set reader. + It returns a reader, each sample in the reader is + image pixels in [0, 1] and label in [1, 102] translated from original color image by steps: 1. resize to 256*256 2. random crop to 224*224 @@ -174,8 +183,8 @@ def valid(mapper=default_mapper, buffered_size=1024): return reader_creator( download(DATA_URL, 'flowers', DATA_MD5), download(LABEL_URL, 'flowers', LABEL_MD5), - download(SETID_URL, 'flowers', SETID_MD5), 'valid', mapper, - buffered_size) + download(SETID_URL, 'flowers', SETID_MD5), VALID_FLAG, mapper, + buffered_size, use_xmap) def fetch(): diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py index 5dc5abfe53d90ec3adc9a27a49ed086953146497..c0ec5992e0e6b0a2fd2359910d0f7a6c690c2ec3 100644 --- a/python/paddle/v2/dataset/imdb.py +++ b/python/paddle/v2/dataset/imdb.py @@ -28,7 +28,7 @@ import re import string import threading -__all__ = ['build_dict', 'train', 'test'] +__all__ = ['build_dict', 'train', 'test', 'convert'] URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz' MD5 = '7c2ac02c03563afcf9b574c7e56c153a' @@ -166,3 +166,12 @@ def word_dict(): def fetch(): paddle.v2.dataset.common.download(URL, 'imdb', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + w = word_dict() + paddle.v2.dataset.common.convert(path, lambda: train(w), 10, "imdb_train") + paddle.v2.dataset.common.convert(path, lambda: test(w), 10, "imdb_test") diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py index dd3a4552d2e1a2b00dde5ddb7ac1d78445bdca51..b18ee8e9ba91e0e8ccf061223b3c0d4636442956 100644 --- a/python/paddle/v2/dataset/imikolov.py +++ b/python/paddle/v2/dataset/imikolov.py @@ -22,7 +22,7 @@ import paddle.v2.dataset.common import collections import tarfile -__all__ = ['train', 'test', 'build_dict'] +__all__ = ['train', 'test', 'build_dict', 'convert'] URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz' MD5 = '30177ea32e27c525793142b6bf2c8e2d' @@ -146,3 +146,15 @@ def test(word_idx, n, data_type=DataType.NGRAM): def fetch(): paddle.v2.dataset.common.download(URL, "imikolov", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + N = 5 + word_dict = build_dict() + paddle.v2.dataset.common.convert(path, + train(word_dict, N), 10, "imikolov_train") + paddle.v2.dataset.common.convert(path, + test(word_dict, N), 10, "imikolov_test") diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 
435556b2921b7976bbc61160ce3812949981c9e7..ea5891f4f3f6ee1c5023cccee9732cbd9d78b881 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -21,7 +21,7 @@ import paddle.v2.dataset.common import subprocess import numpy import platform -__all__ = ['train', 'test'] +__all__ = ['train', 'test', 'convert'] URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/' TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' @@ -113,3 +113,11 @@ def fetch(): paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 10, "mnist_train") + paddle.v2.dataset.common.convert(path, test(), 10, "mnist_test") diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index 837a85912663826f0483aff4f6a38f3945375d82..d9372d422a3293eddeb7c0d5b7c8980f55c44690 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -23,14 +23,15 @@ set and test set into paddle reader creators. """ import zipfile -from common import download +import paddle.v2.dataset.common import re import random import functools __all__ = [ 'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id', - 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info' + 'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info', + 'convert' ] age_table = [1, 18, 25, 35, 45, 50, 56] @@ -99,7 +100,7 @@ USER_INFO = None def __initialize_meta_info__(): - fn = download(URL, "movielens", MD5) + fn = paddle.v2.dataset.common.download(URL, "movielens", MD5) global MOVIE_INFO if MOVIE_INFO is None: pattern = re.compile(r'^(.*)\((\d+)\)$') @@ -246,7 +247,15 @@ def unittest(): def fetch(): - download(URL, "movielens", MD5) + paddle.v2.dataset.common.download(URL, "movielens", MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 10, "movielens_train") + paddle.v2.dataset.common.convert(path, test(), 10, "movielens_test") if __name__ == '__main__': diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index 4dd34e7383fe2a290fcf61474914183a383e2b9c..e33f120c8734621fd60497298d993e6e43bd06e0 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -26,9 +26,9 @@ from itertools import chain import nltk from nltk.corpus import movie_reviews -import common +import paddle.v2.dataset.common -__all__ = ['train', 'test', 'get_word_dict'] +__all__ = ['train', 'test', 'get_word_dict', 'convert'] NUM_TRAINING_INSTANCES = 1600 NUM_TOTAL_INSTANCES = 2000 @@ -39,12 +39,13 @@ def download_data_if_not_yet(): """ try: # make sure that nltk can find the data - if common.DATA_HOME not in nltk.data.path: - nltk.data.path.append(common.DATA_HOME) + if paddle.v2.dataset.common.DATA_HOME not in nltk.data.path: + nltk.data.path.append(paddle.v2.dataset.common.DATA_HOME) movie_reviews.categories() except LookupError: print "Downloading movie_reviews data set, please wait....." - nltk.download('movie_reviews', download_dir=common.DATA_HOME) + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) print "Download data set success....."
print "Path is " + nltk.data.find('corpora/movie_reviews').path @@ -128,4 +129,13 @@ def test(): def fetch(): - nltk.download('movie_reviews', download_dir=common.DATA_HOME) + nltk.download( + 'movie_reviews', download_dir=paddle.v2.dataset.common.DATA_HOME) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train, 10, "sentiment_train") + paddle.v2.dataset.common.convert(path, test, 10, "sentiment_test") diff --git a/python/paddle/v2/dataset/tests/flowers_test.py b/python/paddle/v2/dataset/tests/flowers_test.py index cc0626f4feae287d18dfb227cc69a4174da055da..a8ae9a07acc22eb9d3c0cc5ebb07f8f11ed21233 100644 --- a/python/paddle/v2/dataset/tests/flowers_test.py +++ b/python/paddle/v2/dataset/tests/flowers_test.py @@ -31,13 +31,13 @@ class TestFlowers(unittest.TestCase): def test_train(self): instances, max_label_value = self.check_reader( paddle.v2.dataset.flowers.train()) - self.assertEqual(instances, 1020) + self.assertEqual(instances, 6149) self.assertEqual(max_label_value, 102) def test_test(self): instances, max_label_value = self.check_reader( paddle.v2.dataset.flowers.test()) - self.assertEqual(instances, 6149) + self.assertEqual(instances, 1020) self.assertEqual(max_label_value, 102) def test_valid(self): diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 3469fd9ce12dd4d934004f90286979b73048a5c8..ec10ce646ebf3eca2c2a6423b69ee11b6a2b99cf 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -21,7 +21,7 @@ parse training set and test set into paddle reader creators. import numpy as np import os -from common import download +import paddle.v2.dataset.common __all__ = ['train', 'test'] @@ -29,7 +29,7 @@ URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing MD5 = 'd4accdce7a25600298819f8e28e8d593' feature_names = [ 'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', - 'PTRATIO', 'B', 'LSTAT' + 'PTRATIO', 'B', 'LSTAT', 'convert' ] UCI_TRAIN_DATA = None @@ -82,7 +82,7 @@ def train(): :rtype: callable """ global UCI_TRAIN_DATA - load_data(download(URL, 'uci_housing', MD5)) + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) def reader(): for d in UCI_TRAIN_DATA: @@ -102,7 +102,7 @@ def test(): :rtype: callable """ global UCI_TEST_DATA - load_data(download(URL, 'uci_housing', MD5)) + load_data(paddle.v2.dataset.common.download(URL, 'uci_housing', MD5)) def reader(): for d in UCI_TEST_DATA: @@ -112,4 +112,12 @@ def test(): def fetch(): - download(URL, 'uci_housing', MD5) + paddle.v2.dataset.common.download(URL, 'uci_housing', MD5) + + +def convert(path): + """ + Converts dataset to recordio format + """ + paddle.v2.dataset.common.convert(path, train(), 10, "uci_housing_train") + paddle.v2.dataset.common.convert(path, test(), 10, "uci_houseing_test") diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index 0902f87741c342b237439081703081b467dc6f35..e1dc4f4c30051202e8fd077087679c4fd6cbd7a0 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -22,10 +22,10 @@ parse training set and test set into paddle reader creators. 
import tarfile import gzip -from paddle.v2.dataset.common import download +import paddle.v2.dataset.common from paddle.v2.parameters import Parameters -__all__ = ['train', 'test', 'build_dict'] +__all__ = ['train', 'test', 'build_dict', 'convert'] URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' @@ -115,7 +115,8 @@ def train(dict_size): :rtype: callable """ return reader_creator( - download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'train/train', dict_size) def test(dict_size): @@ -130,16 +131,18 @@ def test(dict_size): :rtype: callable """ return reader_creator( - download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'test/test', dict_size) def gen(dict_size): return reader_creator( - download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), + 'gen/gen', dict_size) def model(): - tar_file = download(URL_MODEL, 'wmt14', MD5_MODEL) + tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) with gzip.open(tar_file, 'r') as f: parameters = Parameters.from_tar(f) return parameters @@ -148,7 +151,7 @@ def model(): def get_dict(dict_size, reverse=True): # if reverse = False, return dict = {'a':'001', 'b':'002', ...} # else reverse = true, return dict = {'001':'a', '002':'b', ...} - tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN) + tar_file = paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) src_dict, trg_dict = __read_to_dict__(tar_file, dict_size) if reverse: src_dict = {v: k for k, v in src_dict.items()} @@ -157,5 +160,14 @@ def get_dict(dict_size, reverse=True): def fetch(): - download(URL_TRAIN, 'wmt14', MD5_TRAIN) - download(URL_MODEL, 'wmt14', MD5_MODEL) + paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) + paddle.v2.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) + + +def convert(path): + """ + Converts dataset to recordio format + """ + dict_size = 30000 + paddle.v2.dataset.common.convert(path, train(dict_size), 10, "wmt14_train") + paddle.v2.dataset.common.convert(path, test(dict_size), 10, "wmt14_test") diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index bbb9c3ea8c1b389f0ec9fd5ec7be52bd0449f52d..4ade1c6f329ae39769279963af6809f938807bdd 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -45,12 +45,12 @@ __all__ = ['data', 'parse_network'] def __need_to_keep__(name): return name in [ 'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType', - 'layer_support' + 'layer_support', 'BaseGeneratedInput' ] def __need_to_wrap__(name): - return name not in ['AggregateLevel', 'ExpandLevel'] + return name not in ['AggregateLevel', 'ExpandLevel', 'BaseGeneratedInput'] def __convert_name__(inname): @@ -199,6 +199,15 @@ def __get_used_submodels__(layer_names): return submodel_names +def __get_submodel_data_out_links__(): + data_links = set() + for submodel in cp.g_config.model_config.sub_models: + for link in submodel.out_links: + if cp.g_layer_map[link.link_name].type == 'data': + data_links.add(link.link_name) + return data_links + + def __get_used_evaluators__(layer_names): evaluator_names = set() for e in cp.g_config.model_config.evaluators: @@ -264,6 +273,7 @@ def parse_network(output_layers, extra_layers=None): submodel_names = 
__get_used_submodels__(layer_names) submodel_names.add('root') evaluator_names = __get_used_evaluators__(layer_names) + data_out_links = __get_submodel_data_out_links__() input_layer_names = set() output_layer_names = set() @@ -279,7 +289,7 @@ def parse_network(output_layers, extra_layers=None): continue model_config.layers.extend([l]) if l.type == 'data': - if l.name in model_config.output_layer_names: + if l.name in data_out_links: """ In text generation, the outlink to save the generated word indices is a data_layer defined in recurrent_group. This diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index ad20241b98302f136326ae491c6723a6c12ae284..bbaf8bfa979fbbf460561ebf7077b75b9c41a11a 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -51,7 +51,7 @@ class Parameters(object): def __init__(self): self.__param_conf__ = dict() self.__gradient_machines__ = [] - self.__tmp_params__ = [] + self.__tmp_params__ = dict() def __append_config__(self, param_conf): """ @@ -128,13 +128,10 @@ class Parameters(object): if len(self.__gradient_machines__) == 0: # create new parameter in python numpy. - if len(self.__tmp_params__) != 0: - ret_list = [ - mat for name, mat in self.__tmp_params__ if name == key - ] - if len(ret_list) == 1: - return ret_list[0] - return np.ndarray(shape=shape, dtype=np.float32) + if key in self.__tmp_params__: + return self.__tmp_params__[key] + else: + return np.ndarray(shape=shape, dtype=np.float32) else: for each_gradient_machine in self.__gradient_machines__: param = __get_parameter_in_gradient_machine__( @@ -187,7 +184,7 @@ class Parameters(object): (shape, value.shape)) if len(self.__gradient_machines__) == 0: - self.__tmp_params__.append((key, value)) + self.__tmp_params__[key] = value else: for each_gradient_machine in self.__gradient_machines__: __copy_parameter_to_gradient_machine__(each_gradient_machine, @@ -231,7 +228,7 @@ class Parameters(object): raise ValueError("gradient_machine should be api.GradientMachine") if len(self.__tmp_params__) != 0: - for name, val in self.__tmp_params__: + for name, val in self.__tmp_params__.iteritems(): try: __copy_parameter_to_gradient_machine__(gradient_machine, name, val) @@ -287,6 +284,18 @@ class Parameters(object): @staticmethod def from_tar(f): + """ + Create a `Parameters` object from the given file. The + `Parameters` object only contains the parameters in this + file. It assumes the parameters in the file are the same as + those defined in the network, so it can be used, for example, + in inference. + + :param f: the saved model file. + :type f: tar file + :return: A Parameters object. + :rtype: Parameters. + """ params = Parameters() tar = tarfile.TarFile(fileobj=f, mode='r') for finfo in tar: @@ -302,6 +311,21 @@ class Parameters(object): params.deserialize(param_name, f) return params + def init_from_tar(self, f): + """ + Different from `from_tar`, this interface can be used to + init partial network parameters from another saved model. + + :param f: the saved model file. + :type f: tar file + :return: Nothing. + """ + + tar_param = Parameters.from_tar(f) + for pname in tar_param.names(): + if pname in self.names(): + self.set(pname, tar_param.get(pname)) + def __get_parameter_in_gradient_machine__(gradient_machine, name): """
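A sketch of the intended use of init_from_tar: warm-starting part of a network from a previously saved model. cost and 'model.tar' are placeholders here, not names from the patch:

.. code-block:: python

    import paddle.v2 as paddle

    parameters = paddle.parameters.create(cost)  # cost: hypothetical topology
    with open('model.tar', 'rb') as f:
        parameters.init_from_tar(f)  # copies only parameters whose names match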
+ """ + + tar_param = Parameters.from_tar(f) + for pname in tar_param.names(): + if pname in self.names(): + self.set(pname, tar_param.get(pname)) + def __get_parameter_in_gradient_machine__(gradient_machine, name): """ diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py index 07142056f872db5113acdd296b17c52b343c1be6..9f888b16d6b2fbf457ee4f4fe94fcb51b6f37fc9 100644 --- a/python/paddle/v2/reader/creator.py +++ b/python/paddle/v2/reader/creator.py @@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could be used in user program. """ -__all__ = ['np_array', 'text_file'] +__all__ = ['np_array', 'text_file', "recordio"] def np_array(x): @@ -55,3 +55,24 @@ def text_file(path): f.close() return reader + + +def recordio(path): + """ + Creates a data reader that outputs record one one by one from given recordio file + :path: path of recordio file + :returns: data reader of recordio file + """ + + import recordio as rec + + def reader(): + f = rec.reader(path) + while True: + r = f.read() + if r is None: + break + yield r + f.close() + + return reader diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py index e432003129d2b8dea60138d08f13ec5e9d29a7ad..45a4288751e37b99dd1005ec78f30a98044926ff 100644 --- a/python/paddle/v2/reader/decorator.py +++ b/python/paddle/v2/reader/decorator.py @@ -166,12 +166,12 @@ def buffered(reader, size): The buffered data reader will read and save data entries into a buffer. Reading from the buffered data reader will proceed as long as the buffer is not empty. - + :param reader: the data reader to read from. :type reader: callable :param size: max buffer size. :type size: int - + :returns: the buffered data reader. """ @@ -238,7 +238,7 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): :type mapper: callable :param reader: the data reader to read from :type reader: callable - :param process_num: process number to handle original sample + :param process_num: process number to handle original sample :type process_num: int :param buffer_size: max buffer size :type buffer_size: int @@ -248,9 +248,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): :rtype: callable """ end = XmapEndSignal() - in_queue = Queue(buffer_size) - out_queue = Queue(buffer_size) - out_order = [0] # define a worker to read samples from reader to in_queue def read_worker(reader, in_queue): @@ -266,12 +263,6 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): in_order += 1 in_queue.put(end) - # start a read worker in a thread - target = order_read_worker if order else read_worker - t = Thread(target=target, args=(reader, in_queue)) - t.daemon = True - t.start() - # define a worker to handle samples from in_queue by mapper # and put mapped samples into out_queue def handle_worker(in_queue, out_queue, mapper): @@ -298,19 +289,27 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): in_queue.put(end) out_queue.put(end) - # start several handle_workers - target = order_handle_worker if order else handle_worker - args = (in_queue, out_queue, mapper, out_order) if order else ( - in_queue, out_queue, mapper) - workers = [] - for i in xrange(process_num): - worker = Thread(target=target, args=args) - worker.daemon = True - workers.append(worker) - for w in workers: - w.start() - def xreader(): + in_queue = Queue(buffer_size) + out_queue = Queue(buffer_size) + out_order = [0] + # start a read worker in a thread + target = 
order_read_worker if order else read_worker + t = Thread(target=target, args=(reader, in_queue)) + t.daemon = True + t.start() + # start several handle_workers + target = order_handle_worker if order else handle_worker + args = (in_queue, out_queue, mapper, out_order) if order else ( + in_queue, out_queue, mapper) + workers = [] + for i in xrange(process_num): + worker = Thread(target=target, args=args) + worker.daemon = True + workers.append(worker) + for w in workers: + w.start() + sample = out_queue.get() while not isinstance(sample, XmapEndSignal): yield sample diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py index 9f8d7133b8694aae5541eff9576eaba8a31e77dc..ba4f558874a0155d276fcb0e0d2d9258f0903f0e 100644 --- a/python/paddle/v2/reader/tests/creator_test.py +++ b/python/paddle/v2/reader/tests/creator_test.py @@ -13,9 +13,7 @@ # limitations under the License. import os import unittest - import numpy as np - import paddle.v2.reader.creator @@ -36,5 +34,14 @@ class TestTextFile(unittest.TestCase): self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1)) +class TestRecordIO(unittest.TestCase): + def test_recordio(self): + path = os.path.join( + os.path.dirname(__file__), "test_recordio_creator.dat") + reader = paddle.v2.reader.creator.recordio(path) + for idx, r in enumerate(reader()): + self.assertSequenceEqual(r, str(idx)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py index bb3c5d220b9ce1552d2fc429abb1863930cd4d17..5a92951b100fa51ab6df7039d9c6b54d1f9d963e 100644 --- a/python/paddle/v2/reader/tests/decorator_test.py +++ b/python/paddle/v2/reader/tests/decorator_test.py @@ -132,15 +132,17 @@ class TestXmap(unittest.TestCase): for order in orders: for tNum in thread_nums: for size in buffered_size: - result = [] - for i in paddle.v2.reader.xmap_readers(mapper, + reader = paddle.v2.reader.xmap_readers(mapper, reader_creator_10(0), - tNum, size, order)(): - result.append(i) - if not order: - result.sort() - for idx, e in enumerate(result): - self.assertEqual(e, mapper(idx)) + tNum, size, order) + for n in xrange(3): + result = [] + for i in reader(): + result.append(i) + if not order: + result.sort() + for idx, e in enumerate(result): + self.assertEqual(e, mapper(idx)) if __name__ == '__main__': diff --git a/python/paddle/v2/reader/tests/test_recordio_creator.dat b/python/paddle/v2/reader/tests/test_recordio_creator.dat new file mode 100644 index 0000000000000000000000000000000000000000..17aa89b6796184407e83246d3f342a55a66b4a69 Binary files /dev/null and b/python/paddle/v2/reader/tests/test_recordio_creator.dat differ diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py index 45372e7dd0ec7cbdd6a2eb5c0397ef7e74284cd0..7ba8a939fbd1a949d61a007b40c054e7543c0cbc 100644 --- a/python/paddle/v2/tests/test_parameters.py +++ b/python/paddle/v2/tests/test_parameters.py @@ -20,14 +20,17 @@ import cStringIO import numpy -def __rand_param_config__(name): +def __rand_param_config__(name, psize=None): conf = ParameterConfig() conf.name = name size = 1 - for i in xrange(2): - dim = random.randint(1, 1000) - conf.dims.append(dim) - size *= dim + if psize is None: + for i in xrange(2): + dim = random.randint(1, 1000) + conf.dims.append(dim) + size *= dim + else: + size = psize conf.size = size assert conf.IsInitialized() return conf @@ -77,6 +80,50 @@ class TestParameters(unittest.TestCase): expected 
= numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32) assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6)) + def test_init_from_tar(self): + def get_param(names, size): + p = parameters.Parameters() + for k, v in zip(names, size): + p.__append_config__(__rand_param_config__(k, v)) + for name in p.names(): + param = p.get(name) + param[:] = numpy.random.uniform( + -1.0, 1.0, size=p.get_shape(name)) + p.set(name, param) + return p + + def get_params(): + name1 = ['param_0', 'param_1'] + size1 = [128, 256] + p1 = get_param(name1, size1) + file1 = cStringIO.StringIO() + p1.to_tar(file1) + file1.seek(0) + + name2 = ['param_0', 'param_1', 'param_2'] + size2 = [128, 256, 288] + p2 = get_param(name2, size2) + file2 = cStringIO.StringIO() + p2.to_tar(file2) + file2.seek(0) + return p1, file1, p2, file2 + + p1, file1, p2, file2 = get_params() + p2.init_from_tar(file1) + for name in p1.names(): + self.assertEqual(p1.get_shape(name), p2.get_shape(name)) + v1 = p1.get(name) + v2 = p2.get(name) + self.assertTrue(numpy.isclose(v1, v2).all()) + + p1, file1, p2, file2 = get_params() + p1.init_from_tar(file2) + for name in p1.names(): + self.assertEqual(p1.get_shape(name), p2.get_shape(name)) + v1 = p1.get(name) + v2 = p2.get(name) + self.assertTrue(numpy.isclose(v1, v2).all()) + if __name__ == '__main__': unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index 2e22f640cb55677b6814b7f26a71457a96449de7..aa6771709cad0bb4dd4ce39c81de7e6ab1ad4c73 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -13,8 +13,10 @@ packages=['paddle', setup_requires=["requests", "numpy", "protobuf==3.1", + "recordio", "matplotlib", - "rarfile"] + "rarfile", + "scipy>=0.19.0"] if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']: setup_requires+=["opencv-python"]
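Finally, a note on the decorator.py change above: moving queue creation and thread startup into xreader() makes the returned reader re-iterable, which is what the updated decorator_test now exercises. A hedged sketch:

.. code-block:: python

    import paddle.v2.reader as reader

    r = reader.xmap_readers(
        lambda x: x * 2,          # mapper
        lambda: iter(range(10)),  # a trivial reader creator
        2,                        # process_num: number of worker threads
        4)                        # buffer_size
    first = sorted(r())   # fresh queues and workers are created per call,
    second = sorted(r())  # so a second iteration now works
    assert first == second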