Commit b783e08e authored by dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into row_conv

...@@ -3,8 +3,8 @@
 hooks:
 - id: remove-crlf
   files: (?!.*third_party)^.*$ | (?!.*book)^.*$
-- repo: https://github.com/reyoung/mirrors-yapf.git
-  sha: v0.13.2
+- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
+  sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
 hooks:
 - id: yapf
   files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -26,7 +26,7 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})

ExternalProject_Add(
-    gflags
+    extern_gflags
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
    PREFIX          ${GFLAGS_SOURCES_DIR}
...@@ -44,4 +44,8 @@ ExternalProject_Add(
    -DCMAKE_BUILD_TYPE:STRING=Release
)

+ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
+ADD_DEPENDENCIES(gflags extern_gflags)
+
LIST(APPEND external_project_dependencies gflags)
...@@ -27,7 +27,7 @@ ENDIF(WIN32)
INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})

ExternalProject_Add(
-    glog
+    extern_glog
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
    GIT_REPOSITORY  "https://github.com/google/glog.git"
...@@ -48,4 +48,8 @@ ExternalProject_Add(
    -DCMAKE_BUILD_TYPE:STRING=Release
)

+ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
+ADD_DEPENDENCIES(glog extern_glog)
+
LIST(APPEND external_project_dependencies glog)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -35,7 +35,7 @@ IF(WITH_TESTING)
    ENDIF(WIN32)

    ExternalProject_Add(
-        gtest
+        extern_gtest
        ${EXTERNAL_PROJECT_LOG_ARGS}
        GIT_REPOSITORY  "https://github.com/google/googletest.git"
        GIT_TAG         "release-1.8.0"
...@@ -55,5 +55,14 @@ IF(WITH_TESTING)
        -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
        -DCMAKE_BUILD_TYPE:STRING=Release
    )
-    LIST(APPEND external_project_dependencies gtest)
+
+    ADD_LIBRARY(gtest STATIC IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET gtest PROPERTY IMPORTED_LOCATION ${GTEST_LIBRARIES})
+    ADD_DEPENDENCIES(gtest extern_gtest)
+
+    ADD_LIBRARY(gtest_main STATIC IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
+    ADD_DEPENDENCIES(gtest_main extern_gtest)
+
+    LIST(APPEND external_project_dependencies gtest gtest_main)
ENDIF(WITH_TESTING)
...@@ -41,7 +41,7 @@ IF(NOT ${CBLAS_FOUND})
    ENDIF()

    ExternalProject_Add(
-        openblas
+        extern_openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}
        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
        GIT_TAG             ${OPENBLAS_COMMIT}
...@@ -53,8 +53,14 @@ IF(NOT ${CBLAS_FOUND})
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""
    )
-    LIST(APPEND external_project_dependencies openblas)
ENDIF(NOT ${CBLAS_FOUND})

MESSAGE(STATUS "BLAS library: ${CBLAS_LIBRARIES}")
INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
+
+ADD_LIBRARY(cblas STATIC IMPORTED)
+SET_PROPERTY(TARGET cblas PROPERTY IMPORTED_LOCATION ${CBLAS_LIBRARIES})
+
+IF(NOT ${CBLAS_FOUND})
+    ADD_DEPENDENCIES(cblas extern_openblas)
+    LIST(APPEND external_project_dependencies cblas)
+ENDIF(NOT ${CBLAS_FOUND})
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -43,7 +43,7 @@ ELSE()
ENDIF()

ExternalProject_Add(
-    warpctc
+    extern_warpctc
    ${EXTERNAL_PROJECT_LOG_ARGS}
    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
    PREFIX          ${WARPCTC_SOURCES_DIR}
...@@ -65,4 +65,8 @@ ExternalProject_Add(
    -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
)

+ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
+ADD_DEPENDENCIES(warpctc extern_warpctc)
+
LIST(APPEND external_project_dependencies warpctc)
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -16,7 +16,7 @@
# To simplify the build process of PaddlePaddle, we defined a couple of
# fundamental abstractions, e.g., how to build library, binary and
# test in C++, CUDA and Go.
#
# -------------------------------------------
#     C++        CUDA C++        Go
# -------------------------------------------
...@@ -29,6 +29,11 @@
# https://cmake.org/cmake/help/v3.0/module/CMakeParseArguments.html
#

+if(NOT APPLE)
+    find_package(Threads REQUIRED)
+    link_libraries(${CMAKE_THREAD_LIBS_INIT})
+endif(NOT APPLE)
+
# cc_library parses tensor.cc and figures out that the target also depends on tensor.h.
# cc_library(tensor
#   SRCS
...@@ -45,7 +50,9 @@ function(cc_library TARGET_NAME)
  else()
    add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
  endif()
-  add_dependencies(${TARGET_NAME} ${cc_library_DEPS} ${external_project_dependencies})
+  if (cc_library_DEPS)
+    add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
+  endif()
endfunction(cc_library)

# cc_binary parses tensor.cc and figures out that the target also depends on tensor.h.
...@@ -58,8 +65,7 @@ function(cc_binary TARGET_NAME)
  set(multiValueArgs SRCS DEPS)
  cmake_parse_arguments(cc_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
  add_executable(${TARGET_NAME} ${cc_binary_SRCS})
-  link_paddle_exe(${TARGET_NAME})
  if(cc_binary_DEPS)
    target_link_libraries(${TARGET_NAME} ${cc_binary_DEPS})
    add_dependencies(${TARGET_NAME} ${cc_binary_DEPS})
  endif()
...@@ -73,17 +79,16 @@ endfunction(cc_binary)
#   DEPS
#   tensor)
function(cc_test TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_executable(${TARGET_NAME} ${cc_test_SRCS})
-  link_paddle_test(${TARGET_NAME})
-  if(cc_test_DEPS)
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS})
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS})
-  endif()
-  add_test(${TARGET_NAME} ${TARGET_NAME})
+  if(WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    add_executable(${TARGET_NAME} ${cc_test_SRCS})
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
endfunction(cc_test)

# Suppose that ops.cu includes global functions that take Tensor as
...@@ -95,28 +100,33 @@ endfunction(cc_test)
#   DEPS
#   ops)
function(nv_library TARGET_NAME)
-  set(options OPTIONAL)
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  if (${nv_library_OPTIONAL} STREQUAL "SHARED")
-    cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
-  else()
-    cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
-  endif()
-  add_dependencies(${TARGET_NAME} ${nv_library_DEPS} ${external_project_dependencies})
+  if (WITH_GPU)
+    set(options OPTIONAL)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(nv_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    if (${nv_library_OPTIONAL} STREQUAL "SHARED")
+      cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
+    else()
+      cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
+    endif()
+    if (nv_library_DEPS)
+      add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
+    endif()
+  endif()
endfunction(nv_library)

function(nv_binary TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
-  link_paddle_exe(${TARGET_NAME})
-  if(nv_binary_DEPS)
-    target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
-    add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
-  endif()
+  if (WITH_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(nv_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cuda_add_executable(${TARGET_NAME} ${nv_binary_SRCS})
+    if(nv_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${nv_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${nv_binary_DEPS})
+    endif()
+  endif()
endfunction(nv_binary)
...@@ -128,17 +138,16 @@ endfunction(nv_binary)
#   DEPS
#   ops)
function(nv_test TARGET_NAME)
-  set(options "")
-  set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
-  cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-  link_paddle_test(${TARGET_NAME})
-  if(nv_test_DEPS)
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS})
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS})
-  endif()
-  add_test(${TARGET_NAME} ${TARGET_NAME})
+  if (WITH_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
endfunction(nv_test)

set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
...@@ -164,7 +173,7 @@ function(go_library TARGET_NAME)
    set(LIB_NAME "lib${TARGET_NAME}.dylib")
  else()
    set(LIB_NAME "lib${TARGET_NAME}.so")
  endif()
else()
  set(BUILD_MODE "-buildmode=c-archive")
  set(LIB_NAME "lib${TARGET_NAME}.a")
...@@ -190,8 +199,8 @@ function(go_binary TARGET_NAME)
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
    -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ${go_library_SRCS}
    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_binary_DEPS})
  install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
endfunction(go_binary)
...@@ -204,8 +213,8 @@ function(go_test TARGET_NAME)
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ${go_test_SRCS}
    WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
endfunction(go_test)
......
......
...@@ -10,7 +10,7 @@ if(WITH_RDMA)
  function(generate_rdma_links)
    # redirect to the current DIR to isolate the pollution from the system
    # runtime environment; this gives us unified control over different gcc
    # environments. e.g., by default gcc48 does not refer to /usr/lib64, which
    # could contain low-version runtime libraries that crash the process while
    # being loaded. This redirect trick fixes that.
...@@ -19,7 +19,9 @@ if(WITH_RDMA)
      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
+     COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
+     COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      )
  endfunction(generate_rdma_links)
...@@ -44,7 +46,7 @@ if(WITH_RDMA)
      RDMA_INC_XIO AND
      RDMA_INC_EVENT AND
      RDMA_INC_NUMA AND
      RDMA_LIB_SXISOCK AND
      RDMA_LIB_XIO AND
      RDMA_LIB_EVENT AND
      RDMA_LIB_EVENT_CORE AND
...@@ -53,19 +55,19 @@ if(WITH_RDMA)
      RDMA_LIB_NUMA
      )
    set(RDMA_INC_DIR
      ${RDMA_INC_SXISOCK}
      ${RDMA_INC_XIO}
      ${RDMA_INC_EVENT}
      ${RDMA_INC_NUMA})
    set(RDMA_LIBS
      ${RDMA_LIB_SXISOCK}
      ${RDMA_LIB_XIO}
      ${RDMA_LIB_EVENT}
      ${RDMA_LIB_EVENT_CORE}
      ${RDMA_LIB_EVENT_EXTRA}
      ${RDMA_LIB_EVENT_PTHREADS}
      ${RDMA_LIB_NUMA}
      )
    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
    include_directories("${RDMA_INC_DIR}")
......
...@@ -12,13 +12,13 @@ all the build tools PaddlePaddle needs; the built PaddlePaddle is also packaged into an
image, called the production image, which contains all the environments
PaddlePaddle needs at runtime. Every time a new PaddlePaddle version is
released, matching production and development images are published with it.
The runtime images include a CPU-only version, a GPU version, and their
corresponding no-AVX versions. We publish the latest Docker images on
-`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_, where the
+`dockerhub.com <https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_, where the
latest Paddle image versions can be found under the "tags" tab. To make
downloading easier for developers in China, we also provide a domestic image
registry. If you are in China, please replace paddlepaddle/paddle in the
commands in this document with docker.paddlepaddle.org/paddle.

-1. Development image: :code:`paddlepaddle/paddle:<version>-dev`
+1. Development image: :code:`paddlepaddle/paddle:0.10.0-dev`

   This image contains Paddle's development tools together with the build and
   runtime environments. Instead of configuring a local environment, users can
   rely on the development image to develop, build, release, and write
   documentation. Since different Paddle versions may need different
   dependencies and tools, take the version into account if you set up a
   development environment yourself.
...@@ -37,13 +37,13 @@ docker.paddlepaddle.org/paddle.

   .. code-block:: bash

-      docker run -it --rm paddlepaddle/paddle:<version>-dev /bin/bash
+      docker run -it --rm paddlepaddle/paddle:0.10.0-dev /bin/bash

   Alternatively, the container can run as a background process:

   .. code-block:: bash

-      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
+      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0-dev

   Then SSH into the container with the password :code:`root`:
...@@ -73,7 +73,7 @@ docker.paddlepaddle.org/paddle.

   .. code-block:: bash

-      nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
+      nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash

   Note: if nvidia-docker does not work for you, you can try the older method
   below, although we do not recommend it:
...@@ -81,7 +81,7 @@ docker.paddlepaddle.org/paddle.

      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
+      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu

3. Run and publish your AI program
...@@ -98,7 +98,7 @@ docker.paddlepaddle.org/paddle.

      nvidia-docker run -it -v $PWD:/work paddle /work/a.py

-   Here we assume every dependency of `a.py` is available inside Paddle's runtime container. If you need extra dependencies, or want to publish an image of your own application, you can write a `Dockerfile` based on `FROM paddledev/paddle:<version>`
+   Here we assume every dependency of `a.py` is available inside Paddle's runtime container. If you need extra dependencies, or want to publish an image of your own application, you can write a `Dockerfile` based on `FROM paddledev/paddle:0.10.0`
   to create and publish an image for your own AI program.

Run the PaddlePaddle Book
...@@ -177,7 +177,7 @@ Paddle's Docker development image ships with a website generated by `woboq code browser

   .. code-block:: bash

-      docker run -d --name paddle-cpu-doc paddle:<version>-dev
+      docker run -d --name paddle-cpu-doc paddle:0.10.0-dev
      docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx

Then we can open a browser at http://localhost:8088/paddle/ to browse the code.
...@@ -23,7 +23,7 @@ Docker is simple as long as we understand a few basic concepts:

   .. code-block:: bash

-      docker pull paddlepaddle/paddle:0.10.0rc2
+      docker pull paddlepaddle/paddle:0.10.0

   to download a Docker image, paddlepaddle/paddle in this example,
   from Dockerhub.com.
...@@ -35,7 +35,7 @@ Docker is simple as long as we understand a few basic concepts:

   .. code-block:: bash

-      docker run paddlepaddle/paddle:0.10.0rc2
+      docker run paddlepaddle/paddle:0.10.0

   to start a container to run a Docker image, paddlepaddle/paddle in this example.
...@@ -62,7 +62,7 @@ of PaddlePaddle, we release both of them. Production image includes
CPU-only version and a CUDA GPU version and their no-AVX versions.

We put the docker images on `dockerhub.com
-<https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
+<https://hub.docker.com/r/paddlepaddle/paddle/tags/>`_. You can find the
latest versions under "tags" tab at dockerhub.com. If you are in
China, you can use our Docker image registry mirror to speed up the
download process. To use it, please replace all paddlepaddle/paddle in
...@@ -89,7 +89,7 @@ the commands to docker.paddlepaddle.org/paddle.

   .. code-block:: bash

-      docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
+      docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash

The above method works with the GPU image too -- the recommended way is
using `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
...@@ -101,7 +101,7 @@ the commands to docker.paddlepaddle.org/paddle.

   .. code-block:: bash

-      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash

2. development image :code:`paddlepaddle/paddle:<version>-dev`
...@@ -149,13 +149,13 @@ Run the program using docker:

   .. code-block:: bash

-      docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
+      docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py

Or if you are using GPU for training:

   .. code-block:: bash

-      nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
+      nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py

The above commands will start a docker container by running :code:`python
/workspace/example.py`. It will stop once :code:`python
...@@ -166,7 +166,7 @@ run PaddlePaddle program interactively:

   .. code-block:: bash

-      docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
+      docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
      # now we are inside docker container
      cd /workspace
      python example.py
...@@ -175,7 +175,7 @@ Running with GPU is identical:

   .. code-block:: bash

-      nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+      nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
      # now we are inside docker container
      cd /workspace
      python example.py
......
...@@ -28,17 +28,17 @@ An example of pooling is shown below; see the :ref:`api_v2.layer_pooling` API docs for details.

      seq_pool = pooling(input=layer,
                         pooling_type=pooling.Max(),
-                        agg_level=AggregateLevel.EACH_SEQUENCE)
+                        agg_level=AggregateLevel.TO_SEQUENCE)

- `pooling_type` currently supports two values: pooling.Max() and pooling.Avg().

-- With `agg_level=AggregateLevel.EACH_TIMESTEP` (the default):
+- With `agg_level=AggregateLevel.TO_NO_SEQUENCE` (the default):

  - Effect: a double-level sequence is reduced to a 0-level sequence, or a single-level sequence is reduced to a 0-level sequence
  - Input: a double-level sequence or a single-level sequence
  - Output: a 0-level sequence, i.e., the average (or maximum) over the whole input sequence (single- or double-level)

-- With `agg_level=AggregateLevel.EACH_SEQUENCE`:
+- With `agg_level=AggregateLevel.TO_SEQUENCE`:

  - Effect: a double-level sequence is reduced to a single-level sequence
  - Input: must be a double-level sequence
...@@ -52,15 +52,15 @@ An example of last_seq is shown below ( :ref:`api_v2.layer_first_seq` is similar); see the API docs for details.

   .. code-block:: bash

      last = last_seq(input=layer,
-                     agg_level=AggregateLevel.EACH_SEQUENCE)
+                     agg_level=AggregateLevel.TO_SEQUENCE)

-- With `agg_level=AggregateLevel.EACH_TIMESTEP` (the default):
+- With `agg_level=AggregateLevel.TO_NO_SEQUENCE` (the default):

  - Effect: a double-level sequence is reduced to a 0-level sequence, or a single-level sequence is reduced to a 0-level sequence
  - Input: a double-level sequence or a single-level sequence
  - Output: a 0-level sequence, i.e., the last (or first) element of the whole input sequence (double- or single-level).

-- With `agg_level=AggregateLevel.EACH_SEQUENCE`:
+- With `agg_level=AggregateLevel.TO_SEQUENCE`:

  - Effect: a double-level sequence is reduced to a single-level sequence
  - Input: must be a double-level sequence
  - Output: a single-level sequence whose elements are the last (or first) element of each subsequence of the double-level sequence.
...@@ -74,9 +74,9 @@ An example of expand is shown below; see the :ref:`api_v2.layer_expand` API docs for details.

      ex = expand(input=layer1,
                  expand_as=layer2,
-                 expand_level=ExpandLevel.FROM_TIMESTEP)
+                 expand_level=ExpandLevel.FROM_NO_SEQUENCE)

-- With `expand_level=ExpandLevel.FROM_TIMESTEP` (the default):
+- With `expand_level=ExpandLevel.FROM_NO_SEQUENCE` (the default):

  - Effect: a 0-level sequence is expanded into a single-level sequence, or into a double-level sequence
  - Input: layer1 must be a 0-level sequence holding the data to expand; layer2 may be a single-level or a double-level sequence that provides the expansion lengths
......
...@@ -81,7 +81,7 @@

* In this example, each group in the raw data is decomposed via :code:`recurrent_group`, and every resulting sentence is fed through an LSTM network. This is equivalent to the single-level RNN configuration.

-* As with the single-level RNN configuration, we only need the last vector the LSTM encodes, so we apply :code:`last_seq` to :code:`recurrent_group`. Unlike the single-level RNN, we take the last element of every subsequence, hence :code:`agg_level=AggregateLevel.EACH_SEQUENCE`.
+* As with the single-level RNN configuration, we only need the last vector the LSTM encodes, so we apply :code:`last_seq` to :code:`recurrent_group`. Unlike the single-level RNN, we take the last element of every subsequence, hence :code:`agg_level=AggregateLevel.TO_SEQUENCE`.

* At this point, :code:`lstm_last` produces the same result as :code:`lstm_last` in the single-level RNN configuration.
......
...@@ -14,7 +14,7 @@ import (
	"github.com/namsral/flag"

	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/Paddle/go/recordio"
+	"github.com/PaddlePaddle/recordio"
)

func main() {
......
...@@ -6,7 +6,7 @@ import (
	"sync"
	"time"

-	"github.com/PaddlePaddle/Paddle/go/recordio"
+	"github.com/PaddlePaddle/recordio"
)

const (
......
...@@ -4,5 +4,8 @@ include_directories(${CMAKE_BINARY_DIR})
add_executable(main main.c)
add_dependencies(main client)
set (CMAKE_EXE_LINKER_FLAGS "-pthread")
+if(APPLE)
+    set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+endif()
target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
...@@ -17,7 +17,7 @@ retry:
  paddle_parameter param;
  char name_a[] = "param_a";
  char name_b[] = "param_b";
-  char content[] = {0x00, 0x11, 0x22};
+  unsigned char content[] = {0x00, 0x11, 0x22};
  param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
  param.name = name_a;
  param.content = content;
...@@ -39,7 +39,7 @@ retry:
    fail();
  }

-  char content[] = {0x00, 0x11, 0x22};
+  unsigned char content[] = {0x00, 0x11, 0x22};
  paddle_gradient grads[2] = {
      {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
......
# RecordIO
## Write
```go
f, e := os.Create("a_file.recordio")
w := recordio.NewWriter(f, -1, -1) // -1, -1 pick the default chunk size and compressor
w.Write([]byte("Hello"))
w.Write([]byte("World!"))
w.Close()
f.Close()
```
## Read
1. Load chunk index:
```go
f, e := os.Open("a_file.recordio")
idx, e := recordio.LoadIndex(f)
fmt.Println("Total records: ", idx.Len())
f.Close()
```
2. Create one or more scanners to read a range of records. The
   following example reads the range
   [1, 3), i.e., the second and the third records:
```go
f, e := os.Open("a_file.recordio")
s := recordio.NewRangeScanner(f, idx, 1, 2) // start=1, len=2, i.e., records [1, 3)
for s.Scan() {
fmt.Println(string(s.Record()))
}
if s.Err() != nil {
log.Fatalf("Something wrong with scanning: %v", s.Err())
}
f.Close()
```
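
The commit also adds a multi-file `Scanner` (see `scanner.go` further below) whose constructor accepts glob patterns and walks every matched file in turn. A minimal sketch, assuming a file written as above:

```go
package main

import (
	"fmt"
	"log"

	"github.com/PaddlePaddle/Paddle/go/recordio"
)

func main() {
	// NewScanner globs the given paths and scans the matched files in order.
	s, err := recordio.NewScanner("a_file.recordio")
	if err != nil {
		log.Fatal(err)
	}
	defer s.Close()

	for s.Scan() {
		fmt.Println(string(s.Record()))
	}
	if err := s.Err(); err != nil {
		log.Fatalf("scanning failed: %v", err)
	}
}
```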
cmake_minimum_required(VERSION 3.0)
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
project(cxx_go C Go)
include(golang)
include(flags)
go_library(recordio STATIC)
add_subdirectory(test)
package main
/*
#include <string.h>
typedef int reader;
typedef int writer;
*/
import "C"
import (
"log"
"os"
"strings"
"unsafe"
"github.com/PaddlePaddle/Paddle/go/recordio"
)
var nullPtr = unsafe.Pointer(uintptr(0))
type writer struct {
w *recordio.Writer
f *os.File
}
type reader struct {
scanner *recordio.Scanner
}
func cArrayToSlice(p unsafe.Pointer, len int) []byte {
if p == nullPtr {
return nil
}
// create a Go slice backed by a C array, reference:
// https://github.com/golang/go/wiki/cgo#turning-c-arrays-into-go-slices
//
// The Go garbage collector will not manage this data; it needs
// to be freed explicitly.
return (*[1 << 30]byte)(p)[:len:len]
}
//export create_recordio_writer
func create_recordio_writer(path *C.char) C.writer {
p := C.GoString(path)
f, err := os.Create(p)
if err != nil {
log.Println(err)
return -1
}
w := recordio.NewWriter(f, -1, -1)
writer := &writer{f: f, w: w}
return addWriter(writer)
}
//export recordio_write
func recordio_write(writer C.writer, buf *C.uchar, size C.int) C.int {
w := getWriter(writer)
b := cArrayToSlice(unsafe.Pointer(buf), int(size))
c, err := w.w.Write(b)
if err != nil {
log.Println(err)
return -1
}
return C.int(c)
}
//export release_recordio_writer
func release_recordio_writer(writer C.writer) {
w := removeWriter(writer)
w.w.Close()
w.f.Close()
}
//export create_recordio_reader
func create_recordio_reader(path *C.char) C.reader {
p := C.GoString(path)
s, err := recordio.NewScanner(strings.Split(p, ",")...)
if err != nil {
log.Println(err)
return -1
}
r := &reader{scanner: s}
return addReader(r)
}
//export recordio_read
func recordio_read(reader C.reader, record **C.uchar) C.int {
r := getReader(reader)
if r.scanner.Scan() {
buf := r.scanner.Record()
if len(buf) == 0 {
*record = (*C.uchar)(nullPtr)
return 0
}
size := C.int(len(buf))
*record = (*C.uchar)(C.malloc(C.size_t(len(buf))))
C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&buf[0]), C.size_t(len(buf)))
return size
}
return -1
}
//export release_recordio_reader
func release_recordio_reader(reader C.reader) {
r := removeReader(reader)
r.scanner.Close()
}
func main() {} // Required but ignored
package main
/*
typedef int reader;
typedef int writer;
*/
import "C"
import "sync"
var mu sync.Mutex
var handleMap = make(map[C.reader]*reader)
var curHandle C.reader
var writerMap = make(map[C.writer]*writer)
var curWriterHandle C.writer
func addReader(r *reader) C.reader {
mu.Lock()
defer mu.Unlock()
reader := curHandle
curHandle++
handleMap[reader] = r
return reader
}
func getReader(reader C.reader) *reader {
mu.Lock()
defer mu.Unlock()
return handleMap[reader]
}
func removeReader(reader C.reader) *reader {
mu.Lock()
defer mu.Unlock()
r := handleMap[reader]
delete(handleMap, reader)
return r
}
func addWriter(w *writer) C.writer {
mu.Lock()
defer mu.Unlock()
writer := curWriterHandle
curWriterHandle++
writerMap[writer] = w
return writer
}
func getWriter(writer C.writer) *writer {
mu.Lock()
defer mu.Unlock()
return writerMap[writer]
}
func removeWriter(writer C.writer) *writer {
mu.Lock()
defer mu.Unlock()
w := writerMap[writer]
delete(writerMap, writer)
return w
}
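
The file above implements the usual cgo handle pattern: Go pointers must not be handed across the C boundary, so small integer keys are returned instead and mapped back to the Go objects under a mutex. A standalone, pure-Go sketch of the same pattern (hypothetical `resource` type, no cgo involved):

```go
package main

import (
	"fmt"
	"sync"
)

// handle is what crosses the language boundary instead of a Go pointer.
type handle int

type resource struct{ name string }

var (
	mu      sync.Mutex
	table   = make(map[handle]*resource)
	nextKey handle
)

func add(r *resource) handle {
	mu.Lock()
	defer mu.Unlock()
	h := nextKey
	nextKey++
	table[h] = r
	return h
}

func get(h handle) *resource {
	mu.Lock()
	defer mu.Unlock()
	return table[h]
}

func remove(h handle) *resource {
	mu.Lock()
	defer mu.Unlock()
	r := table[h]
	delete(table, h)
	return r
}

func main() {
	h := add(&resource{name: "demo"})
	fmt.Println(get(h).name) // prints "demo"
	remove(h)
}
```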
cmake_minimum_required(VERSION 3.0)
include_directories(${CMAKE_BINARY_DIR})
add_executable(recordio_test test.c)
add_dependencies(recordio_test recordio)
set (CMAKE_EXE_LINKER_FLAGS "-pthread")
target_link_libraries(recordio_test ${CMAKE_BINARY_DIR}/librecordio.a)
#include <stdio.h>
#include <stdlib.h>
#include <string.h> /* strcmp */

#include "librecordio.h"
void fail() {
// TODO(helin): fix: gtest using cmake is not working, using this
// hacky way for now.
printf("test failed.\n");
exit(-1);
}
int main() {
writer w = create_recordio_writer("/tmp/test_recordio_0");
recordio_write(w, "hello", 6);
recordio_write(w, "hi", 3);
release_recordio_writer(w);
w = create_recordio_writer("/tmp/test_recordio_1");
recordio_write(w, "dog", 4);
recordio_write(w, "cat", 4);
release_recordio_writer(w);
reader r = create_recordio_reader("/tmp/test_recordio_*");
unsigned char* item = NULL;
int size = recordio_read(r, &item);
if (strcmp(item, "hello") || size != 6) {
fail();
}
free(item);
size = recordio_read(r, &item);
if (strcmp(item, "hi") || size != 3) {
fail();
}
free(item);
size = recordio_read(r, &item);
if (strcmp(item, "dog") || size != 4) {
fail();
}
free(item);
size = recordio_read(r, &item);
if (strcmp(item, "cat") || size != 4) {
fail();
}
free(item);
size = recordio_read(r, &item);
if (size != -1) {
fail();
}
release_recordio_reader(r);
}
package recordio
import (
"bytes"
"compress/gzip"
"encoding/binary"
"fmt"
"hash/crc32"
"io"
"github.com/golang/snappy"
)
// A Chunk contains the Header and optionally compressed records. To
// create a chunk, just use ch := &Chunk{}.
type Chunk struct {
records [][]byte
numBytes int // sum of record lengths.
}
func (ch *Chunk) add(record []byte) {
ch.records = append(ch.records, record)
ch.numBytes += len(record)
}
// dump writes the chunk into w, then clears the chunk, making it
// ready for the next add invocation.
func (ch *Chunk) dump(w io.Writer, compressorIndex int) error {
// NOTE: don't check ch.numBytes == 0 instead; empty records are
// allowed, so a chunk can hold records while numBytes stays 0.
if len(ch.records) == 0 {
return nil
}
// Write raw records and their lengths into data buffer.
var data bytes.Buffer
for _, r := range ch.records {
var rs [4]byte
binary.LittleEndian.PutUint32(rs[:], uint32(len(r)))
if _, e := data.Write(rs[:]); e != nil {
return fmt.Errorf("Failed to write record length: %v", e)
}
if _, e := data.Write(r); e != nil {
return fmt.Errorf("Failed to write record: %v", e)
}
}
compressed, e := compressData(&data, compressorIndex)
if e != nil {
return e
}
// Write chunk header and compressed data.
hdr := &Header{
checkSum: crc32.ChecksumIEEE(compressed.Bytes()),
compressor: uint32(compressorIndex),
compressedSize: uint32(compressed.Len()),
numRecords: uint32(len(ch.records)),
}
if _, e := hdr.write(w); e != nil {
return fmt.Errorf("Failed to write chunk header: %v", e)
}
if _, e := w.Write(compressed.Bytes()); e != nil {
return fmt.Errorf("Failed to write chunk data: %v", e)
}
// Clear the current chunk.
ch.records = nil
ch.numBytes = 0
return nil
}
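// On-disk layout written by dump (a descriptive sketch, not new API):
//
//	Header, 20 bytes: magic number | CRC32 of the compressed payload |
//	                  compressor id | compressed payload size | record count
//	Payload: for every record, a little-endian uint32 length followed by
//	         the raw record bytes; the whole payload is compressed with
//	         the selected compressor.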
type noopCompressor struct {
*bytes.Buffer
}
func (c *noopCompressor) Close() error {
return nil
}
func compressData(src io.Reader, compressorIndex int) (*bytes.Buffer, error) {
compressed := new(bytes.Buffer)
var compressor io.WriteCloser
switch compressorIndex {
case NoCompression:
compressor = &noopCompressor{compressed}
case Snappy:
compressor = snappy.NewBufferedWriter(compressed)
case Gzip:
compressor = gzip.NewWriter(compressed)
default:
return nil, fmt.Errorf("Unknown compression algorithm: %d", compressorIndex)
}
if _, e := io.Copy(compressor, src); e != nil {
return nil, fmt.Errorf("Failed to compress chunk data: %v", e)
}
compressor.Close()
return compressed, nil
}
// parse the specified chunk from r.
func parseChunk(r io.ReadSeeker, chunkOffset int64) (*Chunk, error) {
var e error
var hdr *Header
if _, e = r.Seek(chunkOffset, io.SeekStart); e != nil {
return nil, fmt.Errorf("Failed to seek chunk: %v", e)
}
hdr, e = parseHeader(r)
if e != nil {
return nil, fmt.Errorf("Failed to parse chunk header: %v", e)
}
var buf bytes.Buffer
if _, e = io.CopyN(&buf, r, int64(hdr.compressedSize)); e != nil {
return nil, fmt.Errorf("Failed to read chunk data: %v", e)
}
if hdr.checkSum != crc32.ChecksumIEEE(buf.Bytes()) {
return nil, fmt.Errorf("Checksum checking failed.")
}
deflated, e := deflateData(&buf, int(hdr.compressor))
if e != nil {
return nil, e
}
ch := &Chunk{}
for i := 0; i < int(hdr.numRecords); i++ {
var rs [4]byte
if _, e = deflated.Read(rs[:]); e != nil {
return nil, fmt.Errorf("Failed to read record length: %v", e)
}
r := make([]byte, binary.LittleEndian.Uint32(rs[:]))
if _, e = deflated.Read(r); e != nil {
return nil, fmt.Errorf("Failed to read a record: %v", e)
}
ch.records = append(ch.records, r)
ch.numBytes += len(r)
}
return ch, nil
}
func deflateData(src io.Reader, compressorIndex int) (*bytes.Buffer, error) {
var e error
var deflator io.Reader
switch compressorIndex {
case NoCompression:
deflator = src
case Snappy:
deflator = snappy.NewReader(src)
case Gzip:
deflator, e = gzip.NewReader(src)
if e != nil {
return nil, fmt.Errorf("Failed to create gzip reader: %v", e)
}
default:
return nil, fmt.Errorf("Unknown compression algorithm: %d", compressorIndex)
}
deflated := new(bytes.Buffer)
if _, e = io.Copy(deflated, deflator); e != nil {
return nil, fmt.Errorf("Failed to deflate chunk data: %v", e)
}
return deflated, nil
}
package recordio
import (
"encoding/binary"
"fmt"
"io"
)
const (
// NoCompression means writing raw chunk data into files.
// With other choices, chunks are compressed before written.
NoCompression = iota
// Snappy has been the default compression algorithm widely
// used at Google. It strikes a balance between speed and
// compression ratio.
Snappy

// Gzip is a well-known compression algorithm. It is
// recommended only if you mainly care about compression ratio.
Gzip
magicNumber uint32 = 0x01020304
defaultCompressor = Snappy
)
// Header is the metadata of Chunk.
type Header struct {
checkSum uint32
compressor uint32
compressedSize uint32
numRecords uint32
}
func (c *Header) write(w io.Writer) (int, error) {
var buf [20]byte
binary.LittleEndian.PutUint32(buf[0:4], magicNumber)
binary.LittleEndian.PutUint32(buf[4:8], c.checkSum)
binary.LittleEndian.PutUint32(buf[8:12], c.compressor)
binary.LittleEndian.PutUint32(buf[12:16], c.compressedSize)
binary.LittleEndian.PutUint32(buf[16:20], c.numRecords)
return w.Write(buf[:])
}
func parseHeader(r io.Reader) (*Header, error) {
var buf [20]byte
// io.ReadFull makes a short read surface as an error instead of
// silently yielding a garbled header.
if _, e := io.ReadFull(r, buf[:]); e != nil {
return nil, e
}
if v := binary.LittleEndian.Uint32(buf[0:4]); v != magicNumber {
return nil, fmt.Errorf("Failed to parse magic number")
}
return &Header{
checkSum: binary.LittleEndian.Uint32(buf[4:8]),
compressor: binary.LittleEndian.Uint32(buf[8:12]),
compressedSize: binary.LittleEndian.Uint32(buf[12:16]),
numRecords: binary.LittleEndian.Uint32(buf[16:20]),
}, nil
}
package recordio
import "io"
// Index consists of the offsets and sizes of the consecutive chunks in a RecordIO file.
type Index struct {
chunkOffsets []int64
chunkLens []uint32
numRecords int // the number of all records in a file.
chunkRecords []int // the number of records in chunks.
}
// LoadIndex scans the file and parses chunkOffsets, chunkLens, and numRecords.
func LoadIndex(r io.ReadSeeker) (*Index, error) {
f := &Index{}
offset := int64(0)
var e error
var hdr *Header
for {
hdr, e = parseHeader(r)
if e != nil {
break
}
f.chunkOffsets = append(f.chunkOffsets, offset)
f.chunkLens = append(f.chunkLens, hdr.numRecords)
f.chunkRecords = append(f.chunkRecords, int(hdr.numRecords))
f.numRecords += int(hdr.numRecords)
offset, e = r.Seek(int64(hdr.compressedSize), io.SeekCurrent)
if e != nil {
break
}
}
if e == io.EOF {
return f, nil
}
return nil, e
}
// NumRecords returns the total number of records in a RecordIO file.
func (r *Index) NumRecords() int {
return r.numRecords
}
// NumChunks returns the total number of chunks in a RecordIO file.
func (r *Index) NumChunks() int {
return len(r.chunkLens)
}
// ChunkIndex returns the Index of the i-th Chunk.
func (r *Index) ChunkIndex(i int) *Index {
idx := &Index{}
idx.chunkOffsets = []int64{r.chunkOffsets[i]}
idx.chunkLens = []uint32{r.chunkLens[i]}
idx.chunkRecords = []int{r.chunkRecords[i]}
idx.numRecords = idx.chunkRecords[0]
return idx
}
// Locate returns the index of chunk that contains the given record,
// and the record index within the chunk. It returns (-1, -1) if the
// record is out of range.
func (r *Index) Locate(recordIndex int) (int, int) {
sum := 0
for i, l := range r.chunkLens {
sum += int(l)
if recordIndex < sum {
return i, recordIndex - sum + int(l)
}
}
return -1, -1
}
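
// Example (a sketch for intuition, not part of the API): with
// chunkLens{2, 1}, records 0 and 1 live in chunk 0 and record 2 lives
// in chunk 1, so
//
//	idx := &Index{chunkLens: []uint32{2, 1}}
//	ci, ri := idx.Locate(2) // ci == 1, ri == 0
//	idx.Locate(3)           // out of range: returns (-1, -1)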
// RangeScanner scans records in a specified range within [0, numRecords).
type RangeScanner struct {
reader io.ReadSeeker
index *Index
start, end, cur int
chunkIndex int
chunk *Chunk
err error
}
// NewRangeScanner creates a scanner that sequentially reads records in the
// range [start, start+len). If start < 0, it scans from the
// beginning. If len < 0, it scans till the end of file.
func NewRangeScanner(r io.ReadSeeker, index *Index, start, len int) *RangeScanner {
if start < 0 {
start = 0
}
if len < 0 || start+len >= index.NumRecords() {
len = index.NumRecords() - start
}
return &RangeScanner{
reader: r,
index: index,
start: start,
end: start + len,
cur: start - 1, // The initial status required by Scan.
chunkIndex: -1,
chunk: &Chunk{},
}
}
// Scan moves the cursor forward by one record and loads the chunk
// containing the record if it is not loaded yet.
func (s *RangeScanner) Scan() bool {
s.cur++
if s.cur >= s.end {
s.err = io.EOF
} else {
if ci, _ := s.index.Locate(s.cur); s.chunkIndex != ci {
s.chunkIndex = ci
s.chunk, s.err = parseChunk(s.reader, s.index.chunkOffsets[ci])
}
}
return s.err == nil
}
// Record returns the record under the current cursor.
func (s *RangeScanner) Record() []byte {
_, ri := s.index.Locate(s.cur)
return s.chunk.records[ri]
}
// Err returns the first non-EOF error that was encountered by the
// Scanner.
func (s *RangeScanner) Err() error {
if s.err == io.EOF {
return nil
}
return s.err
}
package recordio
import (
"bytes"
"testing"
"unsafe"
"github.com/stretchr/testify/assert"
)
func TestChunkHead(t *testing.T) {
assert := assert.New(t)
c := &Header{
checkSum: 123,
compressor: 456,
compressedSize: 789,
}
var buf bytes.Buffer
_, e := c.write(&buf)
assert.Nil(e)
cc, e := parseHeader(&buf)
assert.Nil(e)
assert.Equal(c, cc)
}
func TestWriteAndRead(t *testing.T) {
assert := assert.New(t)
data := []string{
"12345",
"1234",
"12"}
var buf bytes.Buffer
w := NewWriter(&buf, 10, NoCompression) // use a small maxChunkSize.
n, e := w.Write([]byte(data[0])) // not exceed chunk size.
assert.Nil(e)
assert.Equal(5, n)
n, e = w.Write([]byte(data[1])) // not exceed chunk size.
assert.Nil(e)
assert.Equal(4, n)
n, e = w.Write([]byte(data[2])) // exceeds chunk size, dump and create a new chunk.
assert.Nil(e)
assert.Equal(n, 2)
assert.Nil(w.Close()) // flush the second chunk.
assert.Nil(w.Writer)
n, e = w.Write([]byte("anything")) // not effective after close.
assert.NotNil(e)
assert.Equal(n, 0)
idx, e := LoadIndex(bytes.NewReader(buf.Bytes()))
assert.Nil(e)
assert.Equal([]uint32{2, 1}, idx.chunkLens)
assert.Equal(
[]int64{0,
int64(4 + // magic number
unsafe.Sizeof(Header{}) +
5 + // first record
4 + // second record
2*4)}, // two record lengths
idx.chunkOffsets)
s := NewRangeScanner(bytes.NewReader(buf.Bytes()), idx, -1, -1)
i := 0
for s.Scan() {
assert.Equal(data[i], string(s.Record()))
i++
}
}
func TestWriteEmptyFile(t *testing.T) {
assert := assert.New(t)
var buf bytes.Buffer
w := NewWriter(&buf, 10, NoCompression) // use a small maxChunkSize.
assert.Nil(w.Close())
assert.Equal(0, buf.Len())
idx, e := LoadIndex(bytes.NewReader(buf.Bytes()))
assert.Nil(e)
assert.Equal(0, idx.NumRecords())
}
package recordio_test
import (
"bytes"
"reflect"
"testing"
"github.com/PaddlePaddle/Paddle/go/recordio"
)
func TestWriteRead(t *testing.T) {
const total = 1000
var buf bytes.Buffer
w := recordio.NewWriter(&buf, 0, -1)
for i := 0; i < total; i++ {
_, err := w.Write(make([]byte, i))
if err != nil {
t.Fatal(err)
}
}
w.Close()
idx, err := recordio.LoadIndex(bytes.NewReader(buf.Bytes()))
if err != nil {
t.Fatal(err)
}
if idx.NumRecords() != total {
t.Fatal("num record does not match:", idx.NumRecords(), total)
}
s := recordio.NewRangeScanner(bytes.NewReader(buf.Bytes()), idx, -1, -1)
i := 0
for s.Scan() {
if !reflect.DeepEqual(s.Record(), make([]byte, i)) {
t.Fatal("not equal:", len(s.Record()), len(make([]byte, i)))
}
i++
}
if i != total {
t.Fatal("total count not match:", i, total)
}
}
func TestChunkIndex(t *testing.T) {
const total = 1000
var buf bytes.Buffer
w := recordio.NewWriter(&buf, 0, -1)
for i := 0; i < total; i++ {
_, err := w.Write(make([]byte, i))
if err != nil {
t.Fatal(err)
}
}
w.Close()
idx, err := recordio.LoadIndex(bytes.NewReader(buf.Bytes()))
if err != nil {
t.Fatal(err)
}
if idx.NumChunks() != total {
t.Fatal("unexpected chunk num:", idx.NumChunks(), total)
}
for i := 0; i < total; i++ {
newIdx := idx.ChunkIndex(i)
s := recordio.NewRangeScanner(bytes.NewReader(buf.Bytes()), newIdx, -1, -1)
j := 0
for s.Scan() {
if !reflect.DeepEqual(s.Record(), make([]byte, i)) {
t.Fatal("not equal:", len(s.Record()), len(make([]byte, i)))
}
j++
}
if j != 1 {
t.Fatal("unexpected record per chunk:", j)
}
}
}
package recordio
import (
"fmt"
"os"
"path/filepath"
)
// Scanner is a scanner for multiple recordio files.
type Scanner struct {
paths []string
curFile *os.File
curScanner *RangeScanner
pathIdx int
end bool
err error
}
// NewScanner creates a new Scanner.
func NewScanner(paths ...string) (*Scanner, error) {
var ps []string
for _, s := range paths {
match, err := filepath.Glob(s)
if err != nil {
return nil, err
}
ps = append(ps, match...)
}
if len(ps) == 0 {
return nil, fmt.Errorf("no valid path provided: %v", paths)
}
return &Scanner{paths: ps}, nil
}
// Scan moves the cursor forward by one record and loads the chunk
// containing the record if it is not loaded yet.
func (s *Scanner) Scan() bool {
if s.err != nil {
return false
}
if s.end {
return false
}
if s.curScanner == nil {
more, err := s.nextFile()
if err != nil {
s.err = err
return false
}
if !more {
s.end = true
return false
}
}
curMore := s.curScanner.Scan()
s.err = s.curScanner.Err()
if s.err != nil {
return curMore
}
if !curMore {
err := s.curFile.Close()
if err != nil {
s.err = err
return false
}
s.curFile = nil
more, err := s.nextFile()
if err != nil {
s.err = err
return false
}
if !more {
s.end = true
return false
}
return s.Scan()
}
return true
}
// Err returns the first non-EOF error that was encountered by the
// Scanner.
func (s *Scanner) Err() error {
return s.err
}
// Record returns the record under the current cursor.
func (s *Scanner) Record() []byte {
if s.curScanner == nil {
return nil
}
return s.curScanner.Record()
}
// Close releases the resources.
func (s *Scanner) Close() error {
s.curScanner = nil
if s.curFile != nil {
err := s.curFile.Close()
s.curFile = nil
return err
}
return nil
}
func (s *Scanner) nextFile() (bool, error) {
if s.pathIdx >= len(s.paths) {
return false, nil
}
path := s.paths[s.pathIdx]
s.pathIdx++
f, err := os.Open(path)
if err != nil {
return false, err
}
idx, err := LoadIndex(f)
if err != nil {
f.Close()
return false, err
}
s.curFile = f
s.curScanner = NewRangeScanner(f, idx, 0, -1)
return true, nil
}
package recordio
import (
"fmt"
"io"
)
const (
defaultMaxChunkSize = 32 * 1024 * 1024
)
// Writer creates a RecordIO file.
type Writer struct {
io.Writer // Set to nil to mark a closed writer.
chunk *Chunk
maxChunkSize int // total records size, excluding metadata, before compression.
compressor int
}
// NewWriter creates a RecordIO file writer. Each chunk is compressed
// with the selected compressor (NoCompression, Snappy, or Gzip). A
// negative maxChunkSize or compressor selects the default value.
func NewWriter(w io.Writer, maxChunkSize, compressor int) *Writer {
if maxChunkSize < 0 {
maxChunkSize = defaultMaxChunkSize
}
if compressor < 0 {
compressor = defaultCompressor
}
return &Writer{
Writer: w,
chunk: &Chunk{},
maxChunkSize: maxChunkSize,
compressor: compressor}
}
// Write writes a record. It returns an error if Close has been called.
func (w *Writer) Write(record []byte) (int, error) {
if w.Writer == nil {
return 0, fmt.Errorf("Cannot write since writer had been closed")
}
if w.chunk.numBytes+len(record) > w.maxChunkSize {
if e := w.chunk.dump(w.Writer, w.compressor); e != nil {
return 0, e
}
}
w.chunk.add(record)
return len(record), nil
}
// Close flushes the current chunk and makes the writer invalid.
func (w *Writer) Close() error {
e := w.chunk.dump(w.Writer, w.compressor)
w.Writer = nil
return e
}
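
Putting the pieces of this package together — `NewWriter`, `LoadIndex`, and `NewRangeScanner` — a minimal end-to-end sketch (names are from the files above; an in-memory buffer stands in for a file):

```go
package main

import (
	"bytes"
	"fmt"
	"log"

	"github.com/PaddlePaddle/Paddle/go/recordio"
)

func main() {
	var buf bytes.Buffer

	// maxChunkSize = -1 picks the default; Gzip compresses every chunk.
	w := recordio.NewWriter(&buf, -1, recordio.Gzip)
	if _, err := w.Write([]byte("hello")); err != nil {
		log.Fatal(err)
	}
	if err := w.Close(); err != nil { // Close flushes the pending chunk.
		log.Fatal(err)
	}

	idx, err := recordio.LoadIndex(bytes.NewReader(buf.Bytes()))
	if err != nil {
		log.Fatal(err)
	}

	// start = -1, len = -1 scans every record.
	s := recordio.NewRangeScanner(bytes.NewReader(buf.Bytes()), idx, -1, -1)
	for s.Scan() {
		fmt.Println(string(s.Record()))
	}
	if s.Err() != nil {
		log.Fatal(s.Err())
	}
}
```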
...@@ -217,10 +217,10 @@ void SmoothL1CostLayer::forwardImp(Matrix& output,
    targetCpu->copyFrom(target);
    outputCpu->copyFrom(output);
    labelCpu->copyFrom(*label.value);
-   targetCpu->smoothL1(*outputCpu, *labelCpu);
+   targetCpu->smoothL1(*outputCpu, *labelCpu, 1.0);
    target.copyFrom(*targetCpu);
  } else {
-   target.smoothL1(output, *label.value);
+   target.smoothL1(output, *label.value, 1.0);
  }
}
...@@ -238,10 +238,10 @@ void SmoothL1CostLayer::backwardImp(Matrix& output,
    outputGCpu->copyFrom(outputG);
    outputCpu->copyFrom(output);
    labelCpu->copyFrom(*label.value);
-   outputGCpu->smoothL1Bp(*outputCpu, *labelCpu);
+   outputGCpu->smoothL1Bp(*outputCpu, *labelCpu, 1.0);
    outputG.copyFrom(*outputGCpu);
  } else {
-   outputG.smoothL1Bp(output, *label.value);
+   outputG.smoothL1Bp(output, *label.value, 1.0);
  }
}
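
For reference, the element-wise cost these calls accumulate (see the CpuMatrix::smoothL1 hunk further below) is the standard smooth L1 function of the residual $x = \mathrm{out}_j - \mathrm{lbl}_j$:

$$
\mathrm{smoothL1}(x) =
\begin{cases}
0.5\,x^2 & \text{if } |x| < 1 \\
|x| - 0.5 & \text{otherwise}
\end{cases}
$$

The new third argument, `destScale`, rescales the previously accumulated `cost[i]` before each update; both call sites above pass 1.0, which preserves the old behavior.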
......
...@@ -59,7 +59,7 @@ lstm_nest_group = recurrent_group(
    input=SubsequenceInput(emb_group), step=lstm_group, name="lstm_nest_group")
# hasSubseq ->(seqlastins) seq
lstm_last = last_seq(
-    input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
+    input=lstm_nest_group, agg_level=AggregateLevel.TO_SEQUENCE)

# seq ->(expand) hasSubseq
lstm_expand = expand_layer(
...@@ -71,7 +71,7 @@ lstm_expand = expand_layer(
lstm_average = pooling_layer(
    input=lstm_expand,
    pooling_type=AvgPooling(),
-    agg_level=AggregateLevel.EACH_SEQUENCE)
+    agg_level=AggregateLevel.TO_SEQUENCE)

with mixed_layer(
    size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
......
 cc_library(place SRCS place.cc)
+cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
 cc_library(ddim SRCS ddim.cc)
+cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
-if(WITH_TESTING)
-  add_subdirectory(test)
-endif()
+nv_test(cuda_test SRCS cuda_test.cu)
+nv_test(dim_test SRCS dim_test.cu DEPS ddim)
#include "paddle/majel/place.h" #include "paddle/majel/place.h"
#include <sstream> #include <sstream>
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/utils/Logging.h"
TEST(Place, Equality) { TEST(Place, Equality) {
majel::CpuPlace cpu; majel::CpuPlace cpu;
...@@ -38,5 +37,4 @@ TEST(Place, Print) { ...@@ -38,5 +37,4 @@ TEST(Place, Print) {
ss << majel::CpuPlace(); ss << majel::CpuPlace();
EXPECT_EQ("CpuPlace", ss.str()); EXPECT_EQ("CpuPlace", ss.str());
} }
LOG(INFO) << "\n[----------] Done \n";
} }
cc_test(place_test
SRCS place_test.cc
DEPS place)
cc_test(ddim_test
SRCS ddim_test.cc
DEPS ddim)
if(WITH_GPU)
  nv_test(cuda_test SRCS cuda_test.cu)
  nv_test(dim_test SRCS dim_test.cu DEPS ddim)
endif()
@@ -3606,7 +3606,7 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
   }
 }

-void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
+void CpuMatrix::smoothL1(Matrix& output, Matrix& label, real destScale) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
@@ -3624,6 +3624,7 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
+    // scale the destination cost once per sample, before accumulating
+    cost[i] *= destScale;
     for (size_t j = 0; j < dim; ++j) {
       real absVal = std::fabs(out[j] - lbl[j]);
       if (absVal < 1.0)
         cost[i] += 0.5 * absVal * absVal;
       else
@@ -3632,7 +3633,7 @@ void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
   }
 }

-void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
+void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label, real destScale) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
@@ -3650,6 +3651,7 @@ void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
   for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
     for (size_t j = 0; j < dim; ++j) {
       real val = out[j] - lbl[j];
+      grad[j] *= destScale;
       if (std::fabs(val) < 1) {
         grad[j] += val;
       } else {
......
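For reference, the terms accumulated above are the standard smooth L1 cost and its gradient; `destScale` rescales the destination buffer (per sample for the cost, per element for the gradient) before accumulation. Writing $x = \mathrm{out}_j - \mathrm{lbl}_j$, summed over dimensions $j$ into the per-sample cost:

\[
\mathrm{smoothL1}(x) =
  \begin{cases}
    \tfrac{1}{2}x^{2} & \text{if } |x| < 1 \\
    |x| - \tfrac{1}{2} & \text{otherwise,}
  \end{cases}
\qquad
\mathrm{smoothL1}'(x) =
  \begin{cases}
    x & \text{if } |x| < 1 \\
    \operatorname{sign}(x) & \text{otherwise.}
  \end{cases}
\]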
@@ -789,11 +789,11 @@ public:
     LOG(FATAL) << "Not implemented";
   }

-  virtual void smoothL1(Matrix& output, Matrix& label) {
+  virtual void smoothL1(Matrix& output, Matrix& label, real destScale) {
     LOG(FATAL) << "Not implemented";
   }

-  virtual void smoothL1Bp(Matrix& outputV, Matrix& label) {
+  virtual void smoothL1Bp(Matrix& outputV, Matrix& label, real destScale) {
     LOG(FATAL) << "Not implemented";
   }
@@ -1736,8 +1736,8 @@ public:
   /// gradient of sumOfSquares.
   void sumOfSquaresBp(Matrix& outputV, Matrix& label);

-  void smoothL1(Matrix& output, Matrix& label);
-  void smoothL1Bp(Matrix& output, Matrix& label);
+  void smoothL1(Matrix& output, Matrix& label, real destScale);
+  void smoothL1Bp(Matrix& output, Matrix& label, real destScale);

   void tanh(Matrix& output);
   void tanhDerivative(Matrix& output);
......
@@ -21,6 +21,8 @@ cd /paddle/build
 # build script will not fail if *.deb does not exist
 rm *.deb 2>/dev/null || true
+# delete previously built whl packages
+rm -rf /paddle/paddle/dist 2>/dev/null || true

 cat <<EOF
 ========================================
......
@@ -237,16 +237,19 @@ class AggregateLevel(object):
     Accordingly, AggregateLevel supports two modes:

-    - :code:`AggregateLevel.EACH_TIMESTEP` means the aggregation acts on each
+    - :code:`AggregateLevel.TO_NO_SEQUENCE` means the aggregation acts on each
       timestep of a sequence, both :code:`SUB_SEQUENCE` and :code:`SEQUENCE` will
       be aggregated to :code:`NO_SEQUENCE`.

-    - :code:`AggregateLevel.EACH_SEQUENCE` means the aggregation acts on each
+    - :code:`AggregateLevel.TO_SEQUENCE` means the aggregation acts on each
       sequence of a nested sequence, :code:`SUB_SEQUENCE` will be aggregated to
       :code:`SEQUENCE`.
     """
-    EACH_TIMESTEP = 'non-seq'
-    EACH_SEQUENCE = 'seq'
+    TO_NO_SEQUENCE = 'non-seq'
+    TO_SEQUENCE = 'seq'
+    # compatible with previous configuration
+    EACH_TIMESTEP = TO_NO_SEQUENCE
+    EACH_SEQUENCE = TO_SEQUENCE


 class LayerOutput(object):
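The renamed constants alias the old spellings, so existing configurations keep parsing. A minimal sketch of the equivalence (layer names are illustrative):

seq = data_layer(name='seq_in', size=128)
# Both calls select the same aggregation mode; the TO_* names are the
# documented spelling, EACH_TIMESTEP survives only as an alias.
new_style = pooling_layer(input=seq, pooling_type=AvgPooling(),
                          agg_level=AggregateLevel.TO_NO_SEQUENCE)
old_style = pooling_layer(input=seq, pooling_type=AvgPooling(),
                          agg_level=AggregateLevel.EACH_TIMESTEP)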
@@ -482,7 +485,7 @@ def table_projection(input, size=0, param_attr=None):
     return proj


-def identity_projection(input, offset=None):
+def identity_projection(input, offset=None, size=None):
     """
     1. IdentityProjection if offset=None. It performs:
@@ -523,8 +526,10 @@ def identity_projection(input, offset=None):
         proj = IdentityProjection(input_layer_name=input.name)
         proj.origin = input
     else:
+        if size is None:
+            size = input.size - offset
         proj = IdentityOffsetProjection(
-            input_layer_name=input.name, offset=offset)
+            input_layer_name=input.name, offset=offset, size=size)
         proj.origin = input
     return proj
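The new `size` argument lets a projection copy a slice that stops before the end of its input. A hypothetical sketch (the 100-wide `vec` layer is assumed):

vec = data_layer(name='vec', size=100)
# Copies dimensions [10, 30) of vec. Omitting size= keeps the old
# behaviour of input.size - offset, i.e. the remaining 90 dimensions.
slice_proj = identity_projection(input=vec, offset=10, size=20)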
@@ -1081,7 +1086,7 @@ def pooling_layer(input,
                   pooling_type=None,
                   name=None,
                   bias_attr=None,
-                  agg_level=AggregateLevel.EACH_TIMESTEP,
+                  agg_level=AggregateLevel.TO_NO_SEQUENCE,
                   layer_attr=None):
     """
     Pooling layer for sequence inputs, not used for Image.
@@ -1092,10 +1097,10 @@ def pooling_layer(input,
        seq_pool = pooling_layer(input=layer,
                                 pooling_type=AvgPooling(),
-                                agg_level=AggregateLevel.EACH_SEQUENCE)
+                                agg_level=AggregateLevel.TO_NO_SEQUENCE)

-    :param agg_level: AggregateLevel.EACH_TIMESTEP or
-                      AggregateLevel.EACH_SEQUENCE
+    :param agg_level: AggregateLevel.TO_NO_SEQUENCE or
+                      AggregateLevel.TO_SEQUENCE
     :type agg_level: AggregateLevel
     :param name: layer name.
     :type name: basestring
@@ -1365,7 +1370,7 @@ def grumemory(input,
 @layer_support()
 def last_seq(input,
              name=None,
-             agg_level=AggregateLevel.EACH_TIMESTEP,
+             agg_level=AggregateLevel.TO_NO_SEQUENCE,
              stride=-1,
              layer_attr=None):
     """
@@ -1400,7 +1405,7 @@ def last_seq(input,
                 " series information at all. Maybe you want to use"
                 " first_seq instead.")

-    if agg_level == AggregateLevel.EACH_SEQUENCE:
+    if agg_level == AggregateLevel.TO_SEQUENCE:
         assert stride == -1

     Layer(
@layer_support() @layer_support()
def first_seq(input, def first_seq(input,
name=None, name=None,
agg_level=AggregateLevel.EACH_TIMESTEP, agg_level=AggregateLevel.TO_NO_SEQUENCE,
stride=-1, stride=-1,
layer_attr=None): layer_attr=None):
""" """
...@@ -1457,7 +1462,7 @@ def first_seq(input, ...@@ -1457,7 +1462,7 @@ def first_seq(input,
' time series information at all. Maybe you want to use' ' time series information at all. Maybe you want to use'
' last_seq instead.') ' last_seq instead.')
if agg_level == AggregateLevel.EACH_SEQUENCE: if agg_level == AggregateLevel.TO_SEQUENCE:
assert stride == -1 assert stride == -1
Layer( Layer(
@@ -1480,16 +1485,18 @@ class ExpandLevel(object):
     ExpandLevel supports two modes:

-    - :code:`ExpandLevel.FROM_TIMESTEP` means the expandation acts on each
-      timestep of a sequence, :code:`NO_SEQUENCE` will be expanded to
+    - :code:`ExpandLevel.FROM_NO_SEQUENCE` means the expansion acts on
+      :code:`NO_SEQUENCE`, which will be expanded to
       :code:`SEQUENCE` or :code:`SUB_SEQUENCE`.

-    - :code:`ExpandLevel.FROM_SEQUENCE` means the expandation acts on each
-      sequence of a nested sequence, :code:`SEQUENCE` will be expanded to
+    - :code:`ExpandLevel.FROM_SEQUENCE` means the expansion acts on
+      :code:`SEQUENCE`, which will be expanded to
       :code:`SUB_SEQUENCE`.
     """
-    FROM_TIMESTEP = AggregateLevel.EACH_TIMESTEP
-    FROM_SEQUENCE = AggregateLevel.EACH_SEQUENCE
+    FROM_NO_SEQUENCE = AggregateLevel.TO_NO_SEQUENCE
+    FROM_SEQUENCE = AggregateLevel.TO_SEQUENCE
+    # compatible with previous configuration
+    FROM_TIMESTEP = FROM_NO_SEQUENCE


 @wrap_name_default()
@@ -1498,7 +1505,7 @@ def expand_layer(input,
                  expand_as,
                  name=None,
                  bias_attr=False,
-                 expand_level=ExpandLevel.FROM_TIMESTEP,
+                 expand_level=ExpandLevel.FROM_NO_SEQUENCE,
                  layer_attr=None):
     """
     A layer for "Expand Dense data or (sequence data where the length of each
@@ -1510,7 +1517,7 @@ def expand_layer(input,
        expand = expand_layer(input=layer1,
                              expand_as=layer2,
-                             expand_level=ExpandLevel.FROM_TIMESTEP)
+                             expand_level=ExpandLevel.FROM_NO_SEQUENCE)

     :param input: Input layer
     :type input: LayerOutput
@@ -2797,7 +2804,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
     if layer_type == LayerType.CONCAT_LAYER:
         assert not bias_attr

-    Layer(
+    layer = Layer(
         name=name,
         type=layer_type,
         inputs=[x.name for x in input] if is_concat_layer else input,
@@ -2805,13 +2812,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
         bias=ParamAttr.to_bias(bias_attr),
         **ExtraLayerAttribute.to_kwargs(layer_attr))

-    sz = 0
-    for each_input in input:
-        if each_input.size is not None:
-            sz += each_input.size
-        else:
-            sz = None
-            break
+    sz = layer.config.size

     return LayerOutput(
         name,
@@ -2979,7 +2980,7 @@ def memory(name,
 @layer_support()
 def lstm_step_layer(input,
                     state,
-                    size,
+                    size=None,
                     act=None,
                     name=None,
                     gate_act=None,
@@ -3045,6 +3046,9 @@ def lstm_step_layer(input,
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
+    assert size is None or state.size == size
+    size = state.size
+
     Layer(
         name=name,
         type=LayerType.LSTM_STEP_LAYER,
@@ -3052,7 +3056,7 @@ def lstm_step_layer(input,
         active_gate_type=gate_act.name,
         active_state_type=state_act.name,
         bias=ParamAttr.to_bias(bias_attr),
-        size=size,
+        size=state.size,
         inputs=[input.name, state.name],
         **ExtraLayerAttribute.to_kwargs(layer_attr))
......
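With `size` now optional, a step function only has to size its memory. A hypothetical sketch (`mixed_in` stands for the usual mixed projection feeding the step; the names are illustrative):

def lstm_step(mixed_in):
    cell = memory(name='lstm_state', size=128)
    # size= can be omitted: it is inferred from cell.size (128 here);
    # passing a conflicting value now trips the assertion instead.
    return lstm_step_layer(input=mixed_in, state=cell)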
@@ -6,7 +6,7 @@ din = data_layer(name='data', size=30)

 seq_op = [first_seq, last_seq]

-agg_level = [AggregateLevel.EACH_SEQUENCE, AggregateLevel.EACH_TIMESTEP]
+agg_level = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]

 opts = []
@@ -15,6 +15,7 @@ for op in seq_op:
         opts.append(op(input=din, agg_level=al))

 for op in seq_op:
-    opts.append(op(input=din, agg_level=AggregateLevel.EACH_TIMESTEP, stride=5))
+    opts.append(
+        op(input=din, agg_level=AggregateLevel.TO_NO_SEQUENCE, stride=5))

 outputs(opts)
@@ -9,4 +9,6 @@ outputs(
     expand_layer(
         input=din, expand_as=data_seq, expand_level=ExpandLevel.FROM_SEQUENCE),
     expand_layer(
-        input=din, expand_as=data_seq, expand_level=ExpandLevel.FROM_TIMESTEP))
+        input=din,
+        expand_as=data_seq,
+        expand_level=ExpandLevel.FROM_NO_SEQUENCE))
@@ -6,7 +6,7 @@ din = data_layer(name='dat_in', size=100)

 POOL_TYPE = [MaxPooling, AvgPooling, SumPooling]

-AGG_LEVEL = [AggregateLevel.EACH_SEQUENCE, AggregateLevel.EACH_TIMESTEP]
+AGG_LEVEL = [AggregateLevel.TO_SEQUENCE, AggregateLevel.TO_NO_SEQUENCE]

 opts = []
......
@@ -39,6 +39,10 @@ def make_layer_label(layer_config):

 def make_diagram(config_file, dot_file, config_arg_str):
     config = parse_config(config_file, config_arg_str)
+    make_diagram_from_proto(config.model_config, dot_file)
+
+
+def make_diagram_from_proto(model_config, dot_file):
     # print >> sys.stderr, config
     name2id = {}
     f = open(dot_file, 'w')
@@ -59,12 +63,12 @@ def make_diagram(config_file, dot_file, config_arg_str):
     print >> f, 'digraph graphname {'
     print >> f, 'node [width=0.375,height=0.25];'
-    for i in xrange(len(config.model_config.layers)):
-        l = config.model_config.layers[i]
+    for i in xrange(len(model_config.layers)):
+        l = model_config.layers[i]
         name2id[l.name] = i

     i = 0
-    for sub_model in config.model_config.sub_models:
+    for sub_model in model_config.sub_models:
         if sub_model.name == 'root':
             continue
         print >> f, 'subgraph cluster_%s {' % i
@@ -78,18 +82,18 @@ def make_diagram(config_file, dot_file, config_arg_str):
         for layer_name in sub_model.layer_names:
             submodel_layers.add(layer_name)
             lid = name2id[layer_name]
-            layer_config = config.model_config.layers[lid]
+            layer_config = model_config.layers[lid]
             label = make_layer_label(layer_config)
             print >> f, 'l%s [label="%s", shape=box];' % (lid, label)
         print >> f, '}'

-    for i in xrange(len(config.model_config.layers)):
-        l = config.model_config.layers[i]
+    for i in xrange(len(model_config.layers)):
+        l = model_config.layers[i]
         if l.name not in submodel_layers:
             label = make_layer_label(l)
             print >> f, 'l%s [label="%s", shape=box];' % (i, label)

-    for sub_model in config.model_config.sub_models:
+    for sub_model in model_config.sub_models:
         if sub_model.name == 'root':
             continue
         for link in sub_model.in_links:
@@ -99,8 +103,8 @@ def make_diagram(config_file, dot_file, config_arg_str):
         for mem in sub_model.memories:
             print >> f, make_mem(mem)

-    for i in xrange(len(config.model_config.layers)):
-        for l in config.model_config.layers[i].inputs:
+    for i in xrange(len(model_config.layers)):
+        for l in model_config.layers[i].inputs:
             print >> f, 'l%s -> l%s [label="%s"];' % (
                 name2id[l.input_layer_name], i, l.input_parameter_name)
......
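Factoring the body into `make_diagram_from_proto` lets callers that already hold a ModelConfig proto skip the config-file parsing. A minimal sketch (file names are illustrative):

# Equivalent to make_diagram('trainer_config.py', 'model.dot', ''),
# but the parsed proto can now be reused or produced elsewhere.
config = parse_config('trainer_config.py', '')
make_diagram_from_proto(config.model_config, 'model.dot')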
@@ -19,8 +19,10 @@ import shutil
 import sys
 import importlib
 import paddle.v2.dataset
+import cPickle
+import glob

-__all__ = ['DATA_HOME', 'download', 'md5file']
+__all__ = ['DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader']

 DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
@@ -74,3 +76,76 @@ def fetch_all():
         getattr(
             importlib.import_module("paddle.v2.dataset.%s" % module_name),
             "fetch")()

def split(reader, line_count, suffix="%05d.pickle", dumper=cPickle.dump):
    """
    You can call the function as:

    split(paddle.v2.dataset.cifar.train10(), line_count=1000,
        suffix="imikolov-train-%05d.pickle")

    The output files will be:

    |-imikolov-train-00000.pickle
    |-imikolov-train-00001.pickle
    |- ...
    |-imikolov-train-00480.pickle

    :param reader: is a reader creator
    :param line_count: line count for each file
    :param suffix: the suffix for the output files, should contain "%d",
        which is replaced by the index of each file. Default is
        "%05d.pickle".
    :param dumper: is a callable function that dumps an object to a file.
        It will be called as dumper(obj, f), where obj is the object to be
        dumped and f is a file object. Default is cPickle.dump.
    """
    if not callable(dumper):
        raise TypeError("dumper should be callable.")
    lines = []
    indx_f = 0
    for i, d in enumerate(reader()):
        lines.append(d)
        # dump a full shard every line_count records
        if (i + 1) % line_count == 0:
            with open(suffix % indx_f, "w") as f:
                dumper(lines, f)
            lines = []
            indx_f += 1
    if lines:
        with open(suffix % indx_f, "w") as f:
            dumper(lines, f)


def cluster_files_reader(files_pattern,
                         trainer_count,
                         trainer_id,
                         loader=cPickle.load):
    """
    Create a reader that yields elements from the given files, selecting
    a file subset according to the trainer count and trainer_id.

    :param files_pattern: the files which were generated by split(...)
    :param trainer_count: total trainer count
    :param trainer_id: the trainer rank id
    :param loader: is a callable function that loads an object from a file.
        It will be called as loader(f), where f is a file object.
        Default is cPickle.load.
    """

    def reader():
        if not callable(loader):
            raise TypeError("loader should be callable.")
        file_list = glob.glob(files_pattern)
        file_list.sort()
        my_file_list = []
        for idx, fn in enumerate(file_list):
            # round-robin assignment of shard files to trainers
            if idx % trainer_count == trainer_id:
                print "append file: %s" % fn
                my_file_list.append(fn)
        for fn in my_file_list:
            with open(fn, "r") as f:
                lines = loader(f)
                for line in lines:
                    yield line

    return reader
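Together the two helpers form a simple sharding workflow: split once, then let every trainer read only its own files. A minimal sketch, assuming two trainers and a writable working directory (dataset and file names are illustrative):

import paddle.v2.dataset.common as common
import paddle.v2.dataset.uci_housing as uci_housing

# One-off preprocessing: write pickled shards of 100 samples each.
common.split(uci_housing.train(), line_count=100,
             suffix="housing-%05d.pickle")

# On trainer 0 of 2: iterate over shards 0, 2, 4, ...
train_reader = common.cluster_files_reader(
    "housing-*.pickle", trainer_count=2, trainer_id=0)
for sample in train_reader():
    pass  # feed the sample into the local trainer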
@@ -15,6 +15,7 @@
 import paddle.v2.dataset.common
 import unittest
 import tempfile
+import glob


 class TestCommon(unittest.TestCase):
@@ -32,6 +33,30 @@ class TestCommon(unittest.TestCase):
             paddle.v2.dataset.common.download(
                 yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))

+    def test_split(self):
+        def test_reader():
+            def reader():
+                for x in xrange(10):
+                    yield x
+
+            return reader
+
+        # mkdtemp (not mkstemp): the shards are written inside a directory.
+        temp_path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.split(
+            test_reader(), 4, suffix=temp_path + '/test-%05d.pickle')
+        files = glob.glob(temp_path + '/test-*.pickle')
+        self.assertEqual(len(files), 3)
+
+    def test_cluster_file_reader(self):
+        temp_path = tempfile.mkdtemp()
+        for x in xrange(5):
+            with open(temp_path + '/%05d.test' % x, 'w') as f:
+                f.write('%d\n' % x)
+        reader = paddle.v2.dataset.common.cluster_files_reader(
+            temp_path + '/*.test', 5, 0)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str("0"))

 if __name__ == '__main__':
     unittest.main()
@@ -152,7 +152,7 @@ def __get_used_layers__(output_layers, extra_layers=None):
     return layer_names


-def __get_used_parameters__(layer_names):
+def __get_used_parameters__(layer_names, sub_models):
     parameter_names = set()
     for name in layer_names:
         l = cp.g_layer_map[name]
@@ -161,6 +161,12 @@ def __get_used_parameters__(layer_names):
             parameter_names.add(inp.input_parameter_name)
         if l.bias_parameter_name:
             parameter_names.add(l.bias_parameter_name)
+
+    for sub_model in sub_models:
+        for mem in sub_model.memories:
+            if mem.HasField("boot_bias_parameter_name"):
+                parameter_names.add(mem.boot_bias_parameter_name)
+
     return parameter_names
@@ -236,7 +242,6 @@ def parse_network(output_layers, extra_layers=None):
     layer_names = __get_used_layers__(output_layers + extra_layers)
     submodel_names = __get_used_submodels__(layer_names)
     submodel_names.add('root')
-    parameter_names = __get_used_parameters__(layer_names)
     evaluator_names = __get_used_evaluators__(layer_names)
     input_layer_names = set()
     output_layer_names = set()
@@ -251,10 +256,6 @@ def parse_network(output_layers, extra_layers=None):
         model_config.input_layer_names.append(l.name)
         input_layer_names.add(l.name)

-    for p in cp.g_config.model_config.parameters:
-        if p.name in parameter_names:
-            model_config.parameters.extend([p])
-
     for layer in output_layers:
         model_config.output_layer_names.append(layer.full_name)
         output_layer_names.add(layer.full_name)
@@ -269,6 +270,13 @@ def parse_network(output_layers, extra_layers=None):
             output_layer_names, evaluator_names)
         model_config.sub_models.extend([s])

+    parameter_names = __get_used_parameters__(layer_names,
+                                              model_config.sub_models)
+
+    for p in cp.g_config.model_config.parameters:
+        if p.name in parameter_names:
+            model_config.parameters.extend([p])
+
     return model_config
......
@@ -73,7 +73,7 @@ class AggregateLayerTest(unittest.TestCase):
         pool = layer.pooling(
             input=pixel,
             pooling_type=pooling.Avg(),
-            agg_level=layer.AggregateLevel.EACH_SEQUENCE)
+            agg_level=layer.AggregateLevel.TO_SEQUENCE)
         last_seq = layer.last_seq(input=pixel)
         first_seq = layer.first_seq(input=pixel)
         concat = layer.concat(input=[last_seq, first_seq])
@@ -109,7 +109,7 @@ class ReshapeLayerTest(unittest.TestCase):
         expand = layer.expand(
             input=weight,
             expand_as=pixel,
-            expand_level=layer.ExpandLevel.FROM_TIMESTEP)
+            expand_level=layer.ExpandLevel.FROM_NO_SEQUENCE)
         repeat = layer.repeat(input=pixel, num_repeats=4)
         reshape = layer.seq_reshape(input=pixel, reshape_size=4)
         rotate = layer.rotate(input=pixel, height=16, width=49)
......