diff --git a/Dockerfile b/Dockerfile
index ccd43be668e7acb1a82bb88f5938755a5d3974d1..c4502e863f2d9fb771f88218a795a44283818186 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 # A image for building paddle binaries
 # Use cuda devel base image for both cpu and gpu environment
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors
 
 ARG UBUNTU_MIRROR
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index c8267b08705fd0a366eb22b8aa50517383222f9f..af9be86961833dcd62371227165d411a3b61d79e 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -15,6 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
+    ${CUDNN_ROOT}/lib/x86_64-linux-gnu
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index b36e217999e7d54217d5d6ca02d28c8050b648fa..0afb3ab9af48046af01f03838eefa0bd2fcb2821 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -38,6 +38,10 @@ ExternalProject_Add(
     CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
     CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS -DBUILD_TESTING=OFF
+    CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
 LIST(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index 11be58e27ed6b86164f07ac0a4d0dc5f9422d2d7..4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -42,6 +42,10 @@ ExternalProject_Add(
     CMAKE_ARGS -DWITH_GFLAGS=ON
     CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
     CMAKE_ARGS -DBUILD_TESTING=OFF
+    CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
 LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 25fa98ee5f3d7d2adc99852b4226f291c1825856..49c7d71443cda700a14af6be65ff6658eec7229f 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -45,11 +45,15 @@ IF(WITH_TESTING)
         CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
         CMAKE_ARGS -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
         CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-        CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
         CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
         CMAKE_ARGS -DBUILD_GMOCK=ON
         CMAKE_ARGS -Dgtest_disable_pthreads=ON
         CMAKE_ARGS -Dgtest_force_shared_crt=ON
+        CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
+        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                         -DCMAKE_BUILD_TYPE:STRING=Release
     )
     LIST(APPEND external_project_dependencies gtest)
 ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index c99b37b62d7f3c2caa1113db5485491c83d13e28..92ea23c7633e974fd09251f967965364b1928307 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -29,7 +29,24 @@ IF(NOT ${CBLAS_FOUND})
 
     IF(CMAKE_COMPILER_IS_GNUCC)
         ENABLE_LANGUAGE(Fortran)
-        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+        if (NOT CMAKE_Fortran_COMPILER_VERSION)
+          # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly.
+          execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion
+                          OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION)
+        endif()
+        string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION})
+        list(GET Fortran_VERSION 0 Fortran_MAJOR)
+        list(GET Fortran_VERSION 1 Fortran_MINOR)
+        find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS
+          /lib
+          /usr/lib
+          /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/
+          /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/)
+        if (NOT GFORTRAN_LIBRARY)
+          message(FATAL_ERROR "Cannot find the gfortran library, which is required by openblas")
+        endif()
+        find_package(Threads REQUIRED)
+        LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT})
     ENDIF(CMAKE_COMPILER_IS_GNUCC)
 
     IF(NOT CMAKE_Fortran_COMPILER)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index ad1426fd940c7b163668c33d41731fe75d89dd89..2df042d226af8308d00f7870e7d2de0eacfdf07e 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -58,12 +58,20 @@ IF(NOT PROTOBUF_FOUND)
         GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
         CONFIGURE_COMMAND
         ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-        -Dprotobuf_BUILD_TESTS=OFF
-        -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-        -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-        -DCMAKE_BUILD_TYPE=Release
-        -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-        -DCMAKE_INSTALL_LIBDIR=lib
+            -Dprotobuf_BUILD_TESTS=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_INSTALL_LIBDIR=lib
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+            -DZLIB_ROOT:STRING=${ZLIB_ROOT}
     )
 
     LIST(APPEND external_project_dependencies protobuf)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index e9a1eac01a7702197560e21282b9d75b0375e1d3..293070c3cfcc1196001f64469f3254289b0de792 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -56,8 +56,13 @@ ExternalProject_Add(
     CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
     CMAKE_ARGS -DWITH_OMP=${USE_OMP}
     CMAKE_ARGS -DWITH_TORCH=OFF
-    CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
+    CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
     CMAKE_ARGS -DBUILD_SHARED=ON
+    CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
 
 LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 4331e1f31a145a66121bf2c55bbc88ac80d3997f..45ca5542b7dc30216b45487782f849b93c5f8fca 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -42,6 +42,10 @@ ExternalProject_Add(
     CMAKE_ARGS -DBUILD_SHARED_LIBS=OFF
     CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS -DCMAKE_MACOSX_RPATH=ON
+    CMAKE_ARGS -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
 LIST(APPEND external_project_dependencies zlib)
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index af889ec9d1b4f43f8e4a266b21822f773ab62ec2..22db1ef658ca35f0ab18895c1da1003bd3cd93fa 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -4,119 +4,139 @@ How to use PaddlePaddle with Docker containers
 Docker containers are currently the only officially supported way to run PaddlePaddle, because Docker runs on all major operating systems (including Linux, Mac OS X, and Windows).
 Please note that you need to change the `Docker settings `_ to make full use of the hardware resources on Mac OS X and Windows.
 
-Usage of the CPU-only and GPU docker images
+Usage of the docker images released by PaddlePaddle
 ------------------------------
 
-For each PaddlePaddle version, we release two Docker images: a CPU-only one and a GPU one.
-We configure `dockerhub.com `_ to automatically generate the latest docker images:
-`paddledev/paddle:0.10.0rc1-cpu` and `paddledev/paddle:0.10.0rc1-gpu`.
+For each PaddlePaddle version, we release two kinds of Docker images: a development image and production images. The production images include a CPU-only version, a GPU version, and their corresponding no-AVX versions.
+We provide the latest docker images on `dockerhub.com `_; the latest Paddle image versions can be found under the "tags" tab.
 
+1. Development image: :code:`paddlepaddle/paddle:<version>-dev`
 
-To run the CPU-only image as an interactive container:
+   This image contains Paddle's development tools together with the build and runtime environment. Instead of configuring a local environment, users can rely on the development image for development, building, releasing,
+   documentation writing, and similar work. Different Paddle versions may need different dependencies and tools, so take the version into account if you configure a development environment yourself.
+   The development image contains the following tools:
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+   Many developers work on remote servers with GPUs. They can SSH into such a server and run :code:`docker exec` to enter the development image and start working,
+   or they can start an SSHD service inside the development image so that they can log into the image directly:
 
-.. code-block:: bash
+   To run the development image as an interactive container:
 
-   docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+   .. code-block:: bash
 
-Alternatively, the container can run as a background process:
+      docker run -it --rm paddledev/paddle:<version>-dev /bin/bash
 
-.. code-block:: bash
+   Alternatively, the container can run as a background process:
 
-   docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+   .. code-block:: bash
 
-Then SSH into the container with password :code:`root`:
+      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
 
-.. code-block:: bash
+   Then SSH into the container with password :code:`root`:
 
-   ssh -p 2202 root@localhost
+   .. code-block:: bash
 
-One advantage of SSH is that we can enter the container from several terminals. For example, one terminal can run vi while another runs Python. Another advantage is that we can run the PaddlePaddle container on a remote server and connect to it from a laptop over SSH.
+      ssh -p 2202 root@localhost
 
+   One advantage of SSH is that we can enter the container from several terminals. For example, one terminal can run vi while another runs Python. Another advantage is that we can run the PaddlePaddle container on a remote server and connect to it from a laptop over SSH.
 
-The above also works with the GPU image -- just don't forget to install the CUDA driver and tell Docker about it:
+2. Production images: there are four images, distinguished by CPU, GPU, and no-AVX:
 
+   - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
 
-.. code-block:: bash
+   Both the CPU-only and the GPU images use the AVX instruction set, but old computers produced before 2008 do not support AVX. The following command checks whether a Linux computer supports AVX:
 
-   export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-   export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-   docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+   .. code-block:: bash
 
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-Running the PaddlePaddle Book
----------------------
+   If the output is No, you need to use the no-AVX images.
 
-The Jupyter Notebook is an open-source web application for creating and sharing interactive documents with code, formulas, charts, and text, which can be viewed in a browser.
+   The above also applies to the GPU images; just don't forget to install the latest GPU driver on the physical machine beforehand.
+   To make sure the GPU driver works inside the image, we recommend running the images with `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_:
 
-The PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-If you want a deeper understanding of deep learning, the PaddlePaddle Book is definitely your best choice.
+   .. code-block:: bash
 
-Once inside the container, just run:
+      nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
 
-.. code-block:: bash
-
-   jupyter notebook
+   Note: if you run into problems with nvidia-docker, you can try the older method below, although we do not recommend it:
 
-Then enter the following address in your browser:
-
-.. code-block:: text
+   .. code-block:: bash
 
-   http://localhost:8888/
+      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
 
-That's all. Enjoy your journey!
+3. Use the production image to ship your AI program
 
+   Suppose you have finished an AI training python program :code:`a.py`, developed on your development machine using the development image. You can then run this command on the development machine for a test run:
 
-Non-AVX images
-----------
+   .. code-block:: bash
 
-Both the CPU-only and the GPU images use the AVX instruction set, but old computers produced before 2008 do not support AVX. The following command checks whether a Linux computer supports AVX:
+      docker run -it -v $PWD:/work paddle /work/a.py
 
-.. code-block:: bash
+   This assumes that all dependencies of `a.py` are available in Paddle's production container. If you need more dependencies, or want to publish an image of your application, you can write a `Dockerfile` that uses `FROM paddledev/paddle:<version>`
+   to create and publish your own AI program image.
 
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+Running the PaddlePaddle Book
+---------------------
 
-If the output is No, we need to manually build a no-AVX version of the image:
+The Jupyter Notebook is an open-source web application for creating and sharing interactive documents with code, formulas, charts, and text, which can be viewed in a browser.
+
+The PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
+If you want a deeper understanding of deep learning, the PaddlePaddle Book is definitely your best choice.
+
+We provide a docker image that runs the PaddlePaddle Book directly; just run:
 
 .. code-block:: bash
 
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+   docker run -p 8888:8888 paddlepaddle/book
 
+Then enter the following address in your browser:
+
+.. code-block:: text
+
+   http://localhost:8888/
+
+That's all. Enjoy your journey!
 
 Developing PaddlePaddle with Docker containers
 ------------------------------
 
-Developers can develop PaddlePaddle inside Docker. This lets developers work in a consistent way across different platforms - Linux, Mac OS X, and Windows.
+Developers can develop PaddlePaddle inside the Docker development image. This lets developers work in a consistent way across different platforms - Linux, Mac OS X, and Windows.
+
+1. Build the development image
 
-1. Build the development environment as a Docker image
-
    .. code-block:: bash
 
       git clone --recursive https://github.com/PaddlePaddle/Paddle
      cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+      docker build -t paddle:dev .
 
-   Note that by default, :code:`docker build` does not import the source code into the image and build it. If we want that, we need to set a build argument:
+   Note that by default, :code:`docker build` does not import the source code into the image and build it. If we want that, we need to build the development image first and then run:
 
   .. code-block:: bash
 
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+      docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev
 
 2. Run the development environment
 
    Once :code:`paddle:dev` is built, we can develop inside a docker container, with the source code loaded into the Docker development environment by mounting local files:
 
   .. code-block:: bash
 
-      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev sshd
 
   The command above starts a docker container with the PaddlePaddle development environment; the source code is mounted at :code:`/paddle`.
 
-   Note that the default entry point of :code:`paddle:dev` is :code:`sshd`. The :code:`docker run` command above actually starts an SSHD server listening on port 2202. This way, we can SSH into our development container:
+   The :code:`docker run` command above actually starts an SSHD server listening on port 2202. This way, we can SSH into our development container:
 
   .. code-block:: bash
 
      ssh root@localhost -p 2202
 
@@ -124,13 +144,13 @@ The PaddlePaddle Book is an interactive Jupyter Notebook for users and developers
 3. Build and install PaddlePaddle in the Docker development environment
 
    Once inside the container, the script :code:`paddle/scripts/docker/build.sh` can be used to build, install, and test PaddlePaddle:
-
+
    .. code-block:: bash
-
+
       /paddle/paddle/scripts/docker/build.sh
 
    The command above builds PaddlePaddle in :code:`/paddle/build`. The unit tests can be run with:
-
+
    .. code-block:: bash
 
       cd /paddle/build
@@ -140,14 +160,14 @@ The PaddlePaddle Book is an interactive Jupyter Notebook for users and developers
 
 Documentation
 ----
 
-Paddle's Docker image carries an HTML version of the C++ source code, generated using the `woboq code browser
+Paddle's Docker development image carries an HTML version of the C++ source code, generated using the `woboq code browser
 `_, which makes it easy to browse the C++ sources.
 
 As long as we give the PaddlePaddle container a name when starting it in Docker, we can run another Nginx Docker image to serve the HTML code:
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+   docker run -d --name paddle-cpu-doc paddle:<version>-dev
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 Then we can open a browser and visit http://localhost:8088/paddle/ to browse the code.
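
Note on the "ship your AI program" workflow above: it boils down to a single `docker run` invocation, which can also be scripted. A minimal Python sketch using only the standard library; the image name `paddlepaddle/paddle` and the script `a.py` are placeholders taken from the example in the text, not fixed names:

    import os
    import subprocess

    # Mirrors `docker run -it -v $PWD:/work paddle /work/a.py` from the docs:
    # mount the current directory at /work and run the program inside the
    # production image, removing the container when it exits.
    subprocess.check_call([
        "docker", "run", "--rm",
        "-v", "{}:/work".format(os.getcwd()),
        "paddlepaddle/paddle",
        "python", "/work/a.py",
    ])
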
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 606746597acc0da00588b7eb05935f6c05c169f2..8fb9369e0e8e31e620169fa2856094c414efe23e 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -8,173 +8,255 @@
 Please be aware that you will need to change `Docker settings
 `_ to make full use of your hardware resources on Mac OS X and Windows.
 
+Working With Docker
+-------------------
+
+Docker is simple as long as we understand a few basic concepts:
+
+- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle Docker image includes pre-built PaddlePaddle, Python, and many Python packages. We can run a Docker image directly, rather than installing all this software ourselves. We can type
+
+  .. code-block:: bash
+
+     docker images
+
+  to list all images in the system. We can also run
+
+  .. code-block:: bash
+
+     docker pull paddlepaddle/paddle:0.10.0rc2
+
+  to download a Docker image, paddlepaddle/paddle in this example,
+  from Dockerhub.com.
+
+- *container*: if we consider a Docker image a program, a container is a
+  "process" that runs the image. Indeed, a container is exactly an
+  operating system process, but with a virtualized filesystem, network
+  port space, and other virtualized environment. We can type
+
+  .. code-block:: bash
+
+     docker run paddlepaddle/paddle:0.10.0rc2
+
+  to start a container to run a Docker image, paddlepaddle/paddle in this example.
+
+- By default a docker container has an isolated file system namespace,
+  so we cannot see the files in the host file system. By using a *volume*,
+  files mounted from the host become visible inside the docker container.
+  The following command mounts the current directory into /data inside a
+  docker container started from the debian image, and runs the command
+  :code:`ls /data` in it.
+
+  .. code-block:: bash
+
+     docker run --rm -v $(pwd):/data debian ls /data
 
 Usage of CPU-only and GPU Images
 ----------------------------------
 
-For each version of PaddlePaddle, we release 2 Docker images, a
-CPU-only one and a CUDA GPU one. We do so by configuring
-`dockerhub.com `_
-automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu`
-and `paddledev/paddle:0.10.0rc1-gpu`.
+For each version of PaddlePaddle, we release two types of Docker images:
+a development image and production images. The production images include
+a CPU-only version, a CUDA GPU version, and their no-AVX variants. We
+put the docker images on `dockerhub.com
+`_. You can find the
+latest versions under the "tags" tab at dockerhub.com.
 
-To run the CPU-only image as an interactive container:
+1. Production images; these come in multiple variants:
 
-.. code-block:: bash
+   - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
 
-   docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+   Please be aware that the CPU-only and the GPU images both use the
+   AVX instruction set, but old computers produced before 2008 do not
+   support AVX. The following command checks if your Linux computer
+   supports AVX:
 
-or, we can run it as a daemon container
+   .. code-block:: bash
 
-.. code-block:: bash
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-   docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+   To run the CPU-only image as an interactive container:
 
-and SSH to this container using password :code:`root`:
+   .. code-block:: bash
 
-.. code-block:: bash
+      docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
 
-   ssh -p 2202 root@localhost
+   The above method works with the GPU image too -- the recommended way
+   is to use `nvidia-docker `_.
 
-An advantage of using SSH is that we can connect to PaddlePaddle from
-more than one terminals. For example, one terminal running vi and
-another one running Python interpreter. Another advantage is that we
-can run the PaddlePaddle container on a remote server and SSH to it
-from a laptop.
+   Please install nvidia-docker first following this `tutorial
+   `_.
 
-Above methods work with the GPU image too -- just please don't forget
-to install CUDA driver and let Docker knows about it:
+   Now you can run a GPU image:
 
-.. code-block:: bash
+   .. code-block:: bash
 
-   export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-   export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-   docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
 
+2. Development image: :code:`paddlepaddle/paddle:<version>-dev`
 
-PaddlePaddle Book
-------------------
+   This image packs the development tools and the runtime
+   environment. Users and developers can use this image instead of
+   their own local computer for development, building, releasing,
+   documentation writing, and so on. Different versions of Paddle
+   may depend on different versions of libraries and tools; if you
+   want to set up a local environment, you must pay attention to the
+   versions. The development image contains:
+
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+
+   Many developers use servers with GPUs; they can SSH into such a
+   server and run :code:`docker exec` to enter the docker container
+   and start their work. They can also start a development docker
+   image with an SSHD service, so they can log into the container
+   and start working.
 
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
-dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+Train Model Using Python API
+----------------------------
 
-Once you are inside the container, simply issue the command:
+Our official docker image provides a runtime for PaddlePaddle
+programs. The typical workflow is as follows:
+
+Create a directory as the workspace:
 
 .. code-block:: bash
-
-   jupyter notebook
+
+   mkdir ~/workspace
 
-Then, you would back and paste the address into the local browser:
-
-.. code-block:: text
+Edit a PaddlePaddle python program using your favourite editor:
 
-   http://localhost:8888/
+.. code-block:: bash
 
-That's all. Enjoy your journey!
+   emacs ~/workspace/example.py
 
-Non-AVX Images
---------------
+Run the program using docker:
 
-Please be aware that the CPU-only and the GPU images both use the AVX
-instruction set, but old computers produced before 2008 do not support
-AVX. The following command checks if your Linux computer supports
-AVX:
+.. code-block:: bash
+
+   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
+
+Or, if you are using a GPU for training:
 
 .. code-block:: bash
 
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
 
+The above commands start a docker container by running :code:`python
+/workspace/example.py`. The container stops once :code:`python
+/workspace/example.py` finishes.
 
-If it doesn't, we will need to build non-AVX images manually from
-source code:
+Another way is to tell docker to start a :code:`/bin/bash` session and
+run the PaddlePaddle program interactively:
 
 .. code-block:: bash
 
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
+   # now we are inside docker container
+   cd /workspace
+   python example.py
 
+Running with a GPU is identical:
 
-Development Using Docker
-------------------------
+.. code-block:: bash
 
-Developers can work on PaddlePaddle using Docker. This allows
-developers to work on different platforms -- Linux, Mac OS X, and
-Windows -- in a consistent way.
+   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+   # now we are inside docker container
+   cd /workspace
+   python example.py
 
-1. Build the Development Environment as a Docker Image
 
+Develop PaddlePaddle or Train Model Using C++ API
+---------------------------------------------------
 
-   .. code-block:: bash
+We will be using the PaddlePaddle development image, since it contains
+all the compilation tools and dependencies.
 
-      git clone --recursive https://github.com/PaddlePaddle/Paddle
-      cd Paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+Let's clone the PaddlePaddle repo first:
 
+.. code-block:: bash
 
-   Note that by default :code:`docker build` wouldn't import source
-   tree into the image and build it. If we want to do that, we need
-   to set a build arg:
+   git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
 
-   .. code-block:: bash
+Mount both the workspace folder and the paddle code folder into the
+docker container, so we can access them inside the container. There are
+two ways of using the PaddlePaddle development docker image:
 
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+- Run an interactive bash directly:
 
+  .. code-block:: bash
 
-2. Run the Development Environment
+     # use nvidia-docker instead of docker if you need to use GPU
+     docker run -it -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /bin/bash
+     # now we are inside docker container
 
-   Once we got the image :code:`paddle:dev`, we can use it to develop
-   Paddle by mounting the local source code tree into a container that
-   runs the image:
+- Or, run it as a daemon container:
 
-   .. code-block:: bash
+  .. code-block:: bash
 
-      docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev
+     # use nvidia-docker instead of docker if you need to use GPU
+     docker run -d -p 2202:22 -p 8888:8888 -v ~/workspace:/workspace -v $(pwd):/paddle paddlepaddle/paddle:0.10.0rc2-dev /usr/sbin/sshd -D
 
-   This runs a container of the development environment Docker image
-   with the local source tree mounted to :code:`/paddle` of the
-   container.
+  and SSH into this container using password :code:`root`:
 
-   Note that the default entry-point of :code:`paddle:dev` is
-   :code:`sshd`, and above :code:`docker run` commands actually starts
-   an SSHD server listening on port 2202. This allows us to log into
-   this container with:
+  .. code-block:: bash
 
-   .. code-block:: bash
+     ssh -p 2202 root@localhost
 
-      ssh root@localhost -p 2202
+  An advantage is that we can run the PaddlePaddle container on a
+  remote server and SSH to it from a laptop.
 
-   Usually, I run above commands on my Mac. I can also run them on a
-   GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
+When developing PaddlePaddle, you can edit the PaddlePaddle source code
+from outside of the docker container using your favorite editor. To
+compile PaddlePaddle, run inside the container:
 
-   .. code-block:: bash
+.. code-block:: bash
 
-      my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
+   WITH_GPU=OFF WITH_AVX=ON WITH_TEST=ON bash /paddle/paddle/scripts/docker/build.sh
 
-3. Build and Install Using the Development Environment
+This builds everything about Paddle in :code:`/paddle/build`, and we
+can run the unit tests there:
 
-   Once I am in the container, I can use
-   :code:`paddle/scripts/docker/build.sh` to build, install, and test
-   Paddle:
+.. code-block:: bash
 
-   .. code-block:: bash
+   cd /paddle/build
+   ctest
 
-      /paddle/paddle/scripts/docker/build.sh
+When training a model using the C++ API, we can edit the paddle program
+in ~/workspace outside of docker, and build it from /workspace inside
+of docker.
 
-   This builds everything about Paddle in :code:`/paddle/build`. And
-   we can run unit tests there:
+PaddlePaddle Book
+------------------
+
+The Jupyter Notebook is an open-source web application that allows
+you to create and share documents that contain live code, equations,
+visualizations and explanatory text in a single browser.
+
+PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
+We already exposed port 8888 for this book. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+
+We provide a packaged book image; simply issue the command:
+
+.. code-block:: bash
 
-   .. code-block:: bash
+   docker run -p 8888:8888 paddlepaddle/book
 
-      cd /paddle/build
-      ctest
+Then copy and paste the address into your local browser:
+
+.. code-block:: text
+
+   http://localhost:8888/
+
+That's all. Enjoy your journey!
 
 
 Documentation
@@ -191,7 +273,7 @@ container:
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+   docker run -d --name paddle-cpu-doc paddle:<version>
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
index d02d9c63bbfb50954d7b75f2c685ce167a3b7146..9e39ccb00f5d5655c30148900a3d76a22aacfc01 100644
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -46,7 +46,6 @@ PaddlePaddle provides deb packages for ubuntu 14.04.
     with_double: OFF
     with_python: ON
     with_rdma: OFF
-    with_metric_learning:
     with_timer: OFF
     with_predict_sdk:
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 503024cff338dac42a6a8a32463472dc6b6451d9..9d6d67e62c106b2298ce1ebae5633d03bba1e684 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,13 +9,8 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-    ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
-
-if(WITH_PREDICT_SDK)
-  add_subdirectory(predict)
-endif()
-
 if(WITH_SWIG_PY)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
+    ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
   add_subdirectory(api)
 endif()
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 3760c6727c21cfb32ca4d2efc30351352c9b182b..4d0dacae9058f94e584f313c9d0e31b5af09e82d 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -76,8 +76,6 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${CMAKE_DL_LIBS}
     ${EXTERNAL_LIBS}
     ${CMAKE_THREAD_LIBS_INIT}
-    ${RDMA_LD_FLAGS}
-    ${RDMA_LIBS}
     ${START_END}
 )
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9f9d8f972e3a4c62e5caedcf85054be5681b96c1..973ddcceed99ba4177b3db277e664611d42ac51b 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst,
                                     int width,
                                     const int mode);
 
+extern void hl_sequence_avg_backward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode);
 #endif /* HL_SEQUENCE_H_ */
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 05e51bce9e1df6fc6ef1cad891b44a9172da185d..920b417b1c717efaff75f70f1b9d2b574469e425 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst,
                                     int width,
                                     const int mode) {}
 
+inline void hl_sequence_avg_backward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode) {}
 #endif  // HL_SEQUENCE_STUB_H_
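
For reference, the kernels added below implement the following pooling and its gradient. For a sequence occupying input rows $s, \dots, e-1$ (length $n = e - s$):

    y = c(n) \sum_{i=s}^{e-1} x_i,
    \qquad c(n) = \begin{cases} 1/n & \text{mode } 0 \text{ (average)} \\ 1 & \text{mode } 1 \text{ (sum)} \\ 1/\sqrt{n} & \text{mode } 2 \end{cases}

and the backward kernel accumulates, for every row $i$ of the sequence,

    \frac{\partial L}{\partial x_i} \mathrel{+}= c(n)\,\frac{\partial L}{\partial y}.
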
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index ba823de2720336851bf9c49d8162360af93e8601..0fe2877f89f8d0fbc4db40c400037be30bb87ff7 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst,
     int seqLength = end - start;
     if (seqLength == 0) return;
     real sum = 0.0;
-    for (int i = 0; i < seqLength; i++) {
-      sum += src[(start + i) * width + col];
+    for (int i = start; i < end; i++) {
+      sum += src[i * width + col];
     }
     sum = mode == 1 ? sum :
         (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
-    dst[row * width + col] = sum;
+    dst[gid] = sum;
   }
 }
 
@@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst,
       (dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_forward failed");
 }
+
+__global__ void KeSequenceAvgBackward(real* dst,
+                                      real* src,
+                                      const int* starts,
+                                      int height,
+                                      int width,
+                                      const int mode) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int row = gid / width;
+  int col = gid % width;
+
+  if (gid < height * width) {
+    int start = starts[row];
+    int end = starts[row + 1];
+    int seqLength = end - start;
+    if (seqLength == 0) return;
+    real grad = src[gid];
+    grad = mode == 1 ? grad :
+        (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
+    for (int i = start; i < end; i++) {
+      dst[i * width + col] += grad;
+    }
+  }
+}
+
+void hl_sequence_avg_backward(real* dst,
+                              real* src,
+                              const int* starts,
+                              int height,
+                              int width,
+                              const int mode) {
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(starts);
+
+  int block = 512;
+  int grid = DIVUP(width * height, 512);
+
+  CHECK(mode == 0 || mode == 1 || mode == 2)
+    << "mode error in hl_sequence_avg_backward!";
+
+  KeSequenceAvgBackward<<< grid, block, 0, STREAM_DEFAULT >>>
+      (dst, src, starts, height, width, mode);
+  CHECK_SYNC("hl_sequence_avg_backward failed");
+}
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp
index b8955ab04f209629c855ed66f8e8e9701b7224a3..96cc4288c6faad4b80c790ed2ce6f5128ea83b6d 100644
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap,
                         const ParameterMap& parameterMap) {
   SequencePoolLayer::init(layerMap, parameterMap);
 
-  dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
-  outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
   // average strategy
   if (config_.average_strategy() == "average") {
     mode_ = kAverage;
@@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) {
 
 void AverageLayer::backward(const UpdateCallback& callback) {
   SequencePoolLayer::backward(callback);
 
-  const int* starts = startPositions_->getData(false);
-  MatrixPtr grad = getInputGrad(0);
-
-  if (grad) {
-    size_t dim = getSize();
-    real* gradientData = getInputGrad(0)->getData();
-    real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions_->getSize() - 1;
-    for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-      // TODO(Dangqingqing) optimization for GPU
-      int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-      if (0 == sequenceLength) {
-        // empty sequence
-        continue;
-      }
-      dataMtx_->setData(
-          gradientData + starts[sequenceId] * dim, sequenceLength, dim);
-      outMtx_->setData(gradient + sequenceId * dim);
-      switch (mode_) {
-        case kAverage: {
-          // plain average
-          dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength);
-          break;
-        }
-        case kSum: {
-          // sum instead of average
-          dataMtx_->addBias(*outMtx_, 1.0f);
-          break;
-        }
-        case kAverageSquareRootN: {
-          // divide by square root of sequenceLength
-          dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength));
-          break;
-        }
-        default: { LOG(FATAL) << "should not reach here"; }
-      }
-    }
+  if (getInputGrad(0)) {
+    getInputGrad(0)->sequenceAvgBackward(
+        *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
   }
 }
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index 621e1d7bb12ec5b8c7a6173bd601835d9406e814..332552a30479a368c24db10e5ef3a9d59408c8ef 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -45,8 +45,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  MatrixPtr outMtx_;
-  MatrixPtr dataMtx_;
   int mode_;
 };
 }  // namespace paddle
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index b408abbf321f7b8ddc3a30bf11434e3c3211966d..55a7344495f8e57dc95095ab1b81b45008fa9acc 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -483,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a,
   hl_sequence_avg_forward(dst, src, starts, height, width, mode);
 }
 
+void GpuMatrix::sequenceAvgBackward(Matrix& a,
+                                    const IVector& startsPos,
+                                    int mode) {
+  size_t height = a.getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);
+  CHECK_EQ(width, a.getWidth());
+  real* dst = getData();
+  real* src = a.getData();
+  const int* starts = startsPos.getData();
+
+  hl_sequence_avg_backward(dst, src, starts, height, width, mode);
+}
+
 /* this = scaleAB*(a*b) + scaleT*this */
 void GpuMatrix::mul(const GpuMatrix& a,
                     const GpuMatrix& b,
@@ -2304,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
   }
 }
 
+void CpuMatrix::sequenceAvgBackward(Matrix& a,
+                                    const IVector& startsPos,
+                                    int mode) {
+  size_t height = a.getHeight();
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);
+  CHECK_EQ(width, a.getWidth());
+  real* dst = getData();
+  real* src = a.getData();
+  const int* starts = startsPos.getData();
+  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
+  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
+  for (size_t i = 0; i < height; ++i) {
+    int sequenceLength = starts[i + 1] - starts[i];
+    if (0 == sequenceLength) {
+      // empty sequence
+      continue;
+    }
+    outMtx->setData(dst + starts[i] * width, sequenceLength, width);
+    dataMtx->setData(src + i * width);
+    if (mode == 0) {
+      // plain average
+      outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
+    } else if (mode == 1) {
+      // sum instead of average
+      outMtx->addBias(*dataMtx, 1.0f);
+    } else if (mode == 2) {
+      // divide by square root of sequenceLength
+      outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
+    } else {
+      LOG(FATAL) << "should not reach here";
+    }
+  }
+}
+
 /* this = scaleAB*(a*b) + scaleT*this*/
 void CpuMatrix::mul(const Matrix& a,
                     const Matrix& b,
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index dbdb629614546b7c7b569d7473d96a06d0c5a9c7..3252adb19e4c2e48f86c3c811bfc7d75fd06a8f7 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -461,6 +461,12 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
+  virtual void sequenceAvgBackward(Matrix& a,
+                                   const IVector& startsPos,
+                                   int mode) {
+    LOG(FATAL) << "Not implemented";
+  }
+
   /**
    * @code
    * this = scaleAB*(a*b) + scaleT*this
@@ -1203,6 +1209,7 @@ public:
   void collectSharedBias(Matrix& a, real scale);
 
   void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
 
   /**
    * @code
@@ -1619,6 +1626,7 @@ public:
   void collectSharedBias(Matrix& a, real scale);
 
   void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
 
   /**
    * @code
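
To make the new `sequenceAvgBackward` semantics concrete, here is a minimal NumPy sketch of the forward and backward passes that the CPU and GPU implementations above compute; the `starts` layout mirrors the C++ code, but the function and variable names are illustrative and not part of Paddle's API:

    import numpy as np

    def sequence_avg_forward(src, starts, mode):
        # src: (height, width) rows of all sequences, concatenated.
        # starts: offsets; sequence i covers rows starts[i]:starts[i+1].
        # mode: 0 = average, 1 = sum, 2 = sum / sqrt(length).
        num_seqs = len(starts) - 1
        dst = np.zeros((num_seqs, src.shape[1]))
        for i in range(num_seqs):
            seq = src[starts[i]:starts[i + 1]]
            n = len(seq)
            if n == 0:
                continue  # empty sequences are skipped, as in the kernels
            scale = {0: 1.0 / n, 1: 1.0, 2: 1.0 / np.sqrt(n)}[mode]
            dst[i] = scale * seq.sum(axis=0)
        return dst

    def sequence_avg_backward(out_grad, starts, mode, height):
        # Scatter each pooled gradient row back onto every row of its
        # sequence, scaled by the same factor used in the forward pass.
        in_grad = np.zeros((height, out_grad.shape[1]))
        for i in range(len(starts) - 1):
            n = starts[i + 1] - starts[i]
            if n == 0:
                continue
            scale = {0: 1.0 / n, 1: 1.0, 2: 1.0 / np.sqrt(n)}[mode]
            in_grad[starts[i]:starts[i + 1]] += scale * out_grad[i]
        return in_grad

For example, with starts = [0, 2, 5] the input holds two sequences of lengths 2 and 3, and mode 0 averages the rows of each; note that the backward pass accumulates into in_grad, matching the `+=` in the CUDA kernel.
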
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 08b64c1bb6f5d359a2d2164e723a76c5360168ee..dd19fe516fbf724a86479e6f27032614ab4c6106 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -685,7 +685,7 @@ TEST(SMatrix, topK) {
   }
 }
 
-void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
+void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
   MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
   cpuInput->randomizeUniform();
@@ -706,15 +706,25 @@ void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
   gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
 
   TensorCheckErr(*cpuOutput, *gpuOutput);
+
+  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInGrad->randomizeUniform();
+  gpuInGrad->copyFrom(*cpuInGrad);
+
+  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
+  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
+
+  TensorCheckErr(*cpuInGrad, *gpuInGrad);
 }
 
-TEST(Matrix, sequenceAvgForward) {
+TEST(Matrix, sequenceAvg) {
   for (auto batchSize : {10, 128, 6000}) {
     for (auto inputDim : {32, 100, 512}) {
       for (auto mode : {0, 1, 2}) {
         VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
                 << " mode=" << mode;
-        testMatrixSequenceAvgForward(batchSize, inputDim, mode);
+        testMatrixSequenceAvg(batchSize, inputDim, mode);
       }
     }
   }
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index a0da561dfe962b7a0a0515d4104940175ebdecad..e6ed01428a63b9c55bf6ec299ea1c8bff71f3b65 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -4,7 +4,7 @@ set -e
 
 # Set BASE_IMAGE according to env variables
 if [ ${WITH_GPU} == "ON" ]; then
-  BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04"
+  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu14.04"
   # additional packages to install when building gpu images
   GPU_DOCKER_PKG="python-pip python-dev"
 else
@@ -12,11 +12,10 @@ else
 fi
 
 DOCKERFILE_GPU_ENV=""
+DOCKERFILE_CUDNN_DSO=""
 if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
     DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-
-    # for cmake to find cudnn
-    ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
+    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
 fi
 
 mkdir -p /paddle/build
@@ -95,7 +94,10 @@ RUN ${MIRROR_UPDATE}
 # Use different deb file when building different type of images
 ADD build/*.deb /usr/local/opt/paddle/deb/
 # run paddle version to install python packages first
-RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb && paddle version
+RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && \
+    rm -f /usr/local/opt/paddle/deb/*.deb && \
+    paddle version
+${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 5a45df4072b9197a713bd19ee766296279bfcbc8..8fba4a19ba2cecd551aa4bbc764acc94615dd115 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -21,9 +21,7 @@ function version(){
   echo "    with_double: @WITH_DOUBLE@"
   echo "    with_python: @WITH_PYTHON@"
   echo "    with_rdma: @WITH_RDMA@"
-  echo "    with_metric_learning: @WITH_METRIC@"
   echo "    with_timer: @WITH_TIMER@"
-  echo "    with_predict_sdk: @WITH_PREDICT_SDK@"
 }
 
 function ver2num() {
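
The widened unit test above checks the CPU path against the GPU path. Continuing the NumPy sketch from the Matrix.cpp notes (it reuses `sequence_avg_forward` and `sequence_avg_backward` defined there), one can also sanity-check the backward rule against a finite difference of the forward pass, since the pooling is linear in its input; this is an illustrative check, not Paddle test code:

    import numpy as np

    rng = np.random.RandomState(0)
    src = rng.rand(5, 3)   # two sequences of lengths 2 and 3
    starts = [0, 2, 5]
    for mode in (0, 1, 2):
        out_grad = rng.rand(2, 3)
        analytic = sequence_avg_backward(out_grad, starts, mode, height=5)

        # Directional finite difference of f(x) = <forward(x), out_grad>.
        direction = rng.rand(5, 3)
        eps = 1e-6
        f = lambda x: (sequence_avg_forward(x, starts, mode) * out_grad).sum()
        numeric = (f(src + eps * direction) - f(src - eps * direction)) / (2 * eps)
        assert abs(numeric - (analytic * direction).sum()) < 1e-4
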
diff --git a/python/paddle/v2/plot/plot_curve.py b/python/paddle/v2/plot/plot_curve.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f62674cb2baad9e4ecd9f6655f7e2dc00173dc6
--- /dev/null
+++ b/python/paddle/v2/plot/plot_curve.py
@@ -0,0 +1,48 @@
+from IPython import display
+import os
+
+
+class PlotCost(object):
+    """
+    Append train and test costs from an event handler, then call plot().
+    """
+
+    def __init__(self):
+        self.train_costs = ([], [])
+        self.test_costs = ([], [])
+
+        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
+        if not self.__plot_is_disabled__():
+            import matplotlib.pyplot as plt
+            self.plt = plt
+
+    def __plot_is_disabled__(self):
+        return self.__disable_plot__ == "True"
+
+    def plot(self):
+        if self.__plot_is_disabled__():
+            return
+
+        self.plt.plot(*self.train_costs)
+        self.plt.plot(*self.test_costs)
+        title = []
+        if len(self.train_costs[0]) > 0:
+            title.append('Train Cost')
+        if len(self.test_costs[0]) > 0:
+            title.append('Test Cost')
+        self.plt.legend(title, loc='upper left')
+        display.clear_output(wait=True)
+        display.display(self.plt.gcf())
+        self.plt.gcf().clear()
+
+    def append_train_cost(self, step, cost):
+        self.train_costs[0].append(step)
+        self.train_costs[1].append(cost)
+
+    def append_test_cost(self, step, cost):
+        self.test_costs[0].append(step)
+        self.test_costs[1].append(cost)
+
+    def reset(self):
+        self.train_costs = ([], [])
+        self.test_costs = ([], [])
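
A hypothetical usage sketch for the `PlotCost` helper introduced above. It assumes a Jupyter notebook session (plot() draws through IPython.display and matplotlib) and that the package exposes the module at the path this file is added under; the decaying cost values are synthetic stand-ins for real trainer output:

    import math
    import random

    from paddle.v2.plot.plot_curve import PlotCost

    plotter = PlotCost()
    for step in range(100):
        # Record a synthetic, noisily decaying training cost every step.
        plotter.append_train_cost(step, math.exp(-step / 30.0) + 0.05 * random.random())
        if step % 10 == 0:
            # Record a test cost and redraw both curves in the notebook cell.
            plotter.append_test_cost(step, math.exp(-step / 30.0) + 0.02)
            plotter.plot()

Setting the environment variable DISABLE_PLOT to "True" before constructing PlotCost turns plot() into a no-op, which the class checks via os.environ; this is useful for running the same script headlessly.
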