Merge branch 'develop' of github.com:baidu/Paddle into feature/c_api

88c38623 · Yu Yang · 64022143 · 7000cb61 · 88c38623 · 88c38623
80 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,9 +25,9 @@ addons:
    packages:
      - gcc-4.8
      - g++-4.8
+      - gfortran-4.8
      - git
      - build-essential
-      - libatlas-base-dev
      - python
      - python-pip
      - python2.7-dev

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" OFF)
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -16,7 +16,7 @@
 set(CBLAS_FOUND OFF)
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
+set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
  ${MKL_ROOT}/include)

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -15,7 +15,6 @@
 INCLUDE(cblas)
 IF(NOT ${CBLAS_FOUND})
-    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
    INCLUDE(ExternalProject)
    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@@ -28,20 +27,40 @@ IF(NOT ${CBLAS_FOUND})
        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
    ENDIF(WIN32)
+    IF(CMAKE_COMPILER_IS_GNUCC)
+        ENABLE_LANGUAGE(Fortran)
+        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+    ENDIF(CMAKE_COMPILER_IS_GNUCC)
+    IF(NOT CMAKE_Fortran_COMPILER)
+        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
+                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
+    ENDIF(NOT CMAKE_Fortran_COMPILER)
    ExternalProject_Add(
        openblas
        ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG             v0.2.19
        PREFIX              ${CBLAS_SOURCES_DIR}
        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
        BUILD_IN_SOURCE     1
-        CONFIGURE_COMMAND   ""
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
-        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
-        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+    )
+    ExternalProject_Add_Step(
+        openblas lapacke_install
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
+        DEPENDEES install
    )
    LIST(APPEND external_project_dependencies openblas)
-ENDIF()
+ENDIF(NOT ${CBLAS_FOUND})
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -54,6 +54,7 @@ ExternalProject_Add(
  CONFIGURE_COMMAND
    ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
    -Dprotobuf_BUILD_TESTS=OFF
+    -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    -DCMAKE_BUILD_TYPE=Release
    -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -31,6 +31,7 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
        "please use pip to upgrade protobuf.")
    ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
    ##################################### PYTHON ########################################
    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)

--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -38,14 +38,6 @@ IF(NOT SWIG_FOUND)
        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
    ELSE(WIN32)
-        # From PCRE configure
-        ExternalProject_Add(pcre
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY https://github.com/svn2github/pcre.git
-            PREFIX ${SWIG_SOURCES_DIR}/pcre
-            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
-        )
        # swig uses bison find it by cmake and pass it down
        FIND_PACKAGE(BISON)
@@ -54,16 +46,11 @@ IF(NOT SWIG_FOUND)
            GIT_REPOSITORY      https://github.com/swig/swig.git
            GIT_TAG             rel-3.0.10
            PREFIX              ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
+            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig &&
+                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
-            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
+            BUILD_COMMAND       cd <SOURCE_DIR> && make
-            ./configure
+            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
-                --prefix=${SWIG_INSTALL_DIR}
+            UPDATE_COMMAND      ""
-                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
-            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
-            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
-            UPDATE_COMMAND  ""
-            DEPENDS pcre
        )
        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -96,6 +96,7 @@ set(COMMON_FLAGS
    -Wno-unused-parameter
    -Wno-unused-function
    -Wno-error=literal-suffix
+    -Wno-error=sign-compare
    -Wno-error=unused-local-typedefs)
 set(GPU_COMMON_FLAGS
@@ -105,6 +106,7 @@ set(GPU_COMMON_FLAGS
    -Wdelete-non-virtual-dtor
    -Wno-unused-parameter
    -Wno-unused-function
+    -Wno-error=sign-compare
    -Wno-error=literal-suffix
    -Wno-error=unused-local-typedefs
    -Wno-error=unused-function  # Warnings in Numpy Header.

--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -21,6 +21,7 @@ ELSE(WIN32)
        SET(MACOS_VERSION ${VERSION})
        SET(HOST_SYSTEM "macosx")
    ELSE(APPLE)
        IF(EXISTS "/etc/issue")
            FILE(READ "/etc/issue" LINUX_ISSUE)
            IF(LINUX_ISSUE MATCHES "CentOS")
@@ -31,6 +32,14 @@ ELSE(WIN32)
                SET(HOST_SYSTEM "ubuntu")
            ENDIF()
        ENDIF(EXISTS "/etc/issue")
+        IF(EXISTS "/etc/redhat-release")
+            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ENDIF()
+        ENDIF(EXISTS "/etc/redhat-release")
    ENDIF(APPLE)
 ENDIF(WIN32)
@@ -47,7 +56,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS
    LOG_DOWNLOAD    0     # Wrap download in script to log output
    LOG_UPDATE      1     # Wrap update in script to log output
    LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
    LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     1     # Wrap install in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
 )
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME)
    endif()
    if(WITH_GPU)
+        target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
        if(NOT WITH_DSO OR WITH_METRIC)
            target_link_libraries(${TARGET_NAME}
                ${CUDNN_LIBRARY}

--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
 python -m paddle.utils.plotcurve -i $log > plot.png
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -19,3 +19,4 @@ paddle train \
    --save_dir=./output \
    --num_passes=30 \
    2>&1 |tee 'train.log'
+paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=100 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
 python -m paddle.utils.plotcurve -i $log > plot.png
--- a/demo/quick_start/cluster/cluster_train.sh
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log"
 pushd "$home_dir"
 cfg=trainer_config.lr.py
 paddle train \
+  --start_pserver=false \
  --config=$cfg \
  --save_dir=${model_dir} \
  --trainer_count=4 \

--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -26,5 +26,7 @@ paddle train \
    --init_model_path=$model \
    --config_args=is_predict=1 \
    --predict_output_dir=. \
+2>&1 | tee 'predict.log'
+paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
 mv rank-00000 result.txt
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -31,3 +31,4 @@ paddle train \
  --show_parameter_stats_period=100 \
  --test_all_data_in_one_period=1 \
  2>&1 | tee 'train.log'
+paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -22,3 +22,4 @@ paddle train \
    --log_period=100 \
    --dot_period=1 \
    --num_passes=50  2>&1 | tee 'log.txt'
+paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -38,3 +38,4 @@ paddle train \
  --config_args=is_test=1 \
  --test_all_data_in_one_period=1 \
 2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -27,3 +27,4 @@ paddle train \
  --load_missing_parameter_strategy=rand \
  --test_all_data_in_one_period=1 \
  2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
             --trainer_count=4 \
             --config_args=is_test=1 \
             2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -27,3 +27,4 @@ paddle train --config=$config \
             --show_parameter_stats_period=100 \
             --test_all_data_in_one_period=1 \
             2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -27,3 +27,4 @@ paddle train \
    --log_period=10 \
    --dot_period=5 \
    2>&1 | tee 'paraphrase/train.log'
+paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -24,3 +24,4 @@ paddle train \
    --test_pass=12 \
    --trainer_count=1 \
    2>&1 | tee 'translation/gen.log'
+paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -25,3 +25,4 @@ paddle train \
 --log_period=10 \
 --dot_period=5 \
 2>&1 | tee 'translation/train.log'
+paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
@@ -7,4 +7,6 @@ paddle train \
       --dot_period=10 \
       --log_period=1000 \
       --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train.log'
+paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
@@ -7,3 +7,5 @@ paddle train \
       --log_period=10000 \
       --test_period=0 \
       --num_passes=10
+2>&1 | tee 'train_linear.log'
+paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
--- a/demo/traffic_prediction/predict.sh
+++ b/demo/traffic_prediction/predict.sh
@@ -25,6 +25,6 @@ paddle train \
    --config_args=is_predict=1 \
    --predict_output_dir=. 
-python gen_result.py > result.txt
+python gen_result.py > result.csv
 rm -rf rank-00000
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..      code-block:: bash
        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
---------------------------------------------------
-目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
-:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
-..  code-block:: bash
-    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
-来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
-..  code-block:: bash
-    git submodule init
-    git submodule update
-来获得所有第三方模块。
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
+* [4. Build on Centos](#centos)
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -11,32 +13,22 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
-git submodule update --init --recursive
-```
-If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
-If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
 ```
-git submodule update --remote
-```
 ## <span id="requirements">Requirements</span>
 To compile the source code, your computer must be equipped with the following dependencies.
- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
- **CMake**: version >= 2.8
+- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
+- **Python**: only support Python 2.7
- **Python**: only python 2.7 is supported currently
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
 ### Options
-PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. 
+PaddlePaddle supports some build options. 
 <html>
 <table> 
@@ -47,12 +39,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 </tr>
 </thead>
 <tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
+<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
+<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
+<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
+<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
+<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
+<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
+<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
+<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
+<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
+<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
+<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
+<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
 </tbody>
 </table>
 </html>
@@ -64,18 +65,16 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 As a simple example, consider the following:  
-1. **Python Dependencies(optional)**
+1. **BLAS Dependencies(optional)**
-    To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows:
+    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
+    To utilize preinstalled BLAS， you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
    ```bash
-    # install swig on ubuntu
+    # specify MKL
-    sudo apt-get install swig
+    cmake .. -DMKL_ROOT=<mkl_path>
-    # install swig on Mac OS X
+    # or specify OpenBLAS
-    brew install swig
+    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    # active swig in cmake
-    cmake .. -DWITH_SWIG_PY=ON
    ```
 2. **Doc Dependencies(optional)**
@@ -99,24 +98,21 @@ As a simple example, consider the following:
 ### Install Dependencies
- **CPU Dependencies**
+- **Paddle Dependencies**
    ```bash
    # necessary
    sudo apt-get update
-    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
-    # optional
+    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
-    sudo apt-get install libgoogle-glog-dev
+    sudo pip install 'protobuf==3.1.0.post1'
-    sudo apt-get install libgflags-dev
-    sudo apt-get install libgtest-dev
+    # install cmake 3.4
-    sudo pip install wheel
+    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
-    pushd /usr/src/gtest
+        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
-    cmake .
+        cd .. && rm -rf cmake-3.4.1
-    make
-    sudo cp *.a /usr/lib
-    popd
    ```
 - **GPU Dependencies (optional)**
    To build GPU version, you will need the following installed:
@@ -149,51 +145,78 @@ As usual, the best option is to create build folder under paddle project directo
 ```bash
 mkdir build && cd build
-cmake ..
+``` 
+Finally, you can build and install PaddlePaddle:
+```bash
+# you can add build option here, such as:    
+cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
+# install PaddlePaddle Python modules.
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+## <span id="centos">Build on Centos 7</span>
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
+### Install Dependencies
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
-As a simple example, consider the following:
+- **CPU Dependencies**
- **Only CPU with swig**
+    ```bash
+    # necessary
+    sudo yum update
+    sudo yum install -y epel-release
+    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
+    sudo pip install wheel numpy
+    sudo pip install 'protobuf>=3.0.0'
+    ```
+- **GPU Dependencies (optional)**
-  ```bash
+    To build GPU version, you will need the following installed:
-  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
-  ```
- **GPU with swig**
-  ```bash
+        1. a CUDA-capable GPU
-  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
+        2. A supported version of Linux with a gcc compiler and toolchain
-  ```
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (availabel at https://developer.nvidia.com/cudnn)
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+    After downloading cuDNN library, issue the following commands:
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
- **GPU with doc and swig**
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+### Build and Install
+As usual, the best option is to create build folder under paddle project directory.
-  ```bash
+```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
+mkdir build && cd build
-  ``` 
+``` 
-Finally, you can build PaddlePaddle:
+Finally, you can build and install PaddlePaddle:
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
-```
+# install PaddlePaddle Python modules.
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-```bash
-# you can run
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-# or just run 
-sudo paddle version
 ```
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -40,4 +40,4 @@ PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。
    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
 注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,23 +16,13 @@ Developers can work on PaddlePaddle using Docker.  This allows
 developers to work on different platforms -- Linux, Mac OS X, and
 Windows -- in a consistent way.
-The general development workflow with Docker and Bazel is as follows:
+The general development workflow with Docker and CMake is as follows:
 1. Get the source code of Paddle:
   .. code-block:: bash
-      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+      git clone https://github.com/PaddlePaddle/Paddle.git
-   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
-   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
-   empty, please use the following command to get the submodule.
-   .. code-block:: bash
-      git submodule update --init --recursive
 2. Build a development Docker image :code:`paddle:dev` from the source
@@ -162,7 +152,6 @@ source code:
   cd ~
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
-   git submodule update --init --recursive
   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .

--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -32,7 +32,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
  - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
  - 输入：一个双层序列，或一个单层序列
@@ -54,7 +54,7 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
        last = last_seq(input=layer,
                        agg_level=AggregateLevel.EACH_SEQUENCE)
- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
  - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
  - 输入：一个双层序列或一个单层序列

--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -33,7 +33,6 @@ cd Paddle
 git checkout -b develop  # 创建 develop 分支
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
 git pull upstream develop  # 更新 upstream
-git submodule update --init --recursive
 ```
 然后你可以通过做一个本地开发分支开始开发

--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -38,7 +38,6 @@ cd Paddle
 git checkout -b develop  # create develop branch.
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
 git pull upstream develop  # update to upstream
-git submodule update --init --recursive
 ```
 Then you can start to develop by making a local developement branch

--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
--- a/doc/howto/usage/k8s/src/pserver_and_trainer.png
+++ b/doc/howto/usage/k8s/src/pserver_and_trainer.png
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
-CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@"
+CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
 WITH_COVERALLS="@ON_COVERALLS@"
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
 endif()
 set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cudart_wrap.cc
    src/hl_cuda_cublas.cc
    src/hl_cuda_cudnn.cc
    src/hl_cuda_device.cc)

--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
 */
 void GetCudnnDsoHandle(void** dso_handle);
-/**
- * @brief    load the DSO of CUDA Run Time
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudartDsoHandle(void** dso_handle);
 /**
 * @brief    load the DSO of CURAND
 *

--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -22,10 +22,9 @@ limitations under the License. */
 #include <sys/time.h>
 #include <unistd.h>
 #include <mutex>
-#include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
 // clang-format on
@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef CURAND_RAND_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CURAND_WRAP
-std::once_flag cudart_dso_flag;
-void *cudart_dso_handle = nullptr;
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-#endif
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)        \
-  __macro(cudaMalloc)                     \
-  __macro(cudaHostAlloc)                  \
-  __macro(cudaFree)                       \
-  __macro(cudaFreeHost)                   \
-  __macro(cudaMemcpy)                     \
-  __macro(cudaMemset)                     \
-  __macro(cudaMemcpyAsync)                \
-  __macro(cudaSetDevice)                  \
-  __macro(cudaGetDevice)                  \
-  __macro(cudaGetDeviceCount)             \
-  __macro(cudaGetDeviceProperties)        \
-  __macro(cudaDeviceSynchronize)          \
-  __macro(cudaDeviceCanAccessPeer)        \
-  __macro(cudaDeviceEnablePeerAccess)     \
-  __macro(cudaStreamCreate)               \
-  __macro(cudaStreamDestroy)              \
-  __macro(cudaStreamSynchronize)          \
-  __macro(cudaStreamWaitEvent)            \
-  __macro(cudaEventCreate)                \
-  __macro(cudaEventRecord)                \
-  __macro(cudaEventQuery)                 \
-  __macro(cudaEventDestroy)               \
-  __macro(cudaEventSynchronize)           \
-  __macro(cudaEventElapsedTime)           \
-  __macro(cudaSetDeviceFlags)             \
-  __macro(cudaGetLastError)               \
-  __macro(cudaFuncSetCacheConfig)         \
-  __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)             \
-  __macro(cudaProfilerStart)              \
-  __macro(cudaProfilerStop)
-// clang-format on
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-#undef CUDA_ROUNTINE_EACH
-#undef DYNAMIC_LOAD_CUDART_WRAP
 } /* namespace dynload */
 /**
@@ -171,11 +98,11 @@ int g_cuda_lib_version = 0;
 * Check build-in cuda function using glog and it **does not**
 * support << operator for more details error info.
 */
-#define CHECK_CUDA(cudaFunc)                                                  \
+#define CHECK_CUDA(cudaFunc)                                         \
-  do {                                                                        \
+  do {                                                               \
-    cudaError_t cudaStat = cudaFunc;                                          \
+    cudaError_t cudaStat = cudaFunc;                                 \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                         \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
-                                    << dynload::cudaGetErrorString(cudaStat); \
+                                    << cudaGetErrorString(cudaStat); \
  } while (0)
 /**
@@ -284,13 +211,13 @@ void hl_fini() {
      tmp_stream = (char *)t_device[dev]->stream;
    }
    for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
+      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
    }
    /* free device memory */
    hl_free_mem_device(t_device[dev]->gpu_mem);
    hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event));
+    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
  }
  free(tmp);
@@ -308,7 +235,7 @@ void hl_set_device(int device) {
  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
      << "Device: " << device << " is not specified in startup.";
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
  /* switch thread stream */
  for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
@@ -336,7 +263,7 @@ void hl_set_device(int device) {
 int hl_get_device() {
  int device;
-  CHECK_CUDA(dynload::cudaGetDevice(&device));
+  CHECK_CUDA(cudaGetDevice(&device));
  return device;
 }
@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
  void *dest_d;
  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
+  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
  return dest_d;
 }
@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
 void hl_free_mem_device(void *dest_d) {
  CHECK_NOTNULL(dest_d);
-  cudaError_t err = dynload::cudaFree(dest_d);
+  cudaError_t err = cudaFree(dest_d);
  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
      << hl_get_device_error_string();
 }
@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
  void *dest_h;
  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(
+  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
-      dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
  return dest_h;
 }
@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
 void hl_free_mem_host(void *dest_h) {
  CHECK_NOTNULL(dest_h);
-  cudaError_t err = dynload::cudaFreeHost(dest_h);
+  cudaError_t err = cudaFreeHost(dest_h);
  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
      << hl_get_device_error_string();
 }
@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
  }
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
-  CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 }
 void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(dynload::cudaMemset(dest_d, value, size));
+  CHECK_CUDA(cudaMemset(dest_d, value, size));
 }
 void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
  }
  CHECK_NOTNULL(src_h);
  CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
 }
 void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
  }
  CHECK_NOTNULL(dest_h);
  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
 }
 void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
  }
  CHECK_NOTNULL(dest_d);
  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(
+  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
-      dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
 }
 void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
  CHECK_LT(stream, HPPL_STREAM_END);
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(
+  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
-      dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
 }
 void hl_start() {
@@ -439,8 +363,7 @@ void hl_start() {
 bool hl_device_can_access_peer(int device, int peerDevice) {
  int canAccessPeer;
-  CHECK_CUDA(
+  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
-      dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
  if (canAccessPeer == 1) {
    return true;
@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
 }
 void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0);
+  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
  if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    dynload::cudaGetLastError();
+    cudaGetLastError();
  } else {
    CHECK_CUDA(err);
  }
@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
  int device = device_prop->device;
  global_device_resources device_res = device_prop->device_resources;
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
  /* device properties */
-  CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device));
+  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
  device_prop->major = cu_prop.major;
  device_prop->minor = cu_prop.minor;
@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
  /* create device stream */
  for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
  }
  /* cublas init */
@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
  pthread_mutex_init(device_res->gen_mutex, NULL);
-  CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
+  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
 }
 int hl_get_cuda_version() { return g_cuda_lib_version; }
 void hl_create_thread_resources(int device,
                                thread_device_resources device_res) {
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
  /* create thread stream */
  for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
  }
  /* allocation device memory */
@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
  /* allocation host memory */
  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
-  CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
+  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
 }
 void hl_specify_devices_start(int *device, int number) {
  if (hl_start_flag) return;
  /* 1. get the number of devices */
-  CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num));
+  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
  CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
  if (device == NULL) {
    number = g_system_device_num;
@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
                                    << ": the parameter stream is error.";
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
+  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
 }
 void hl_create_event(hl_event_t *event) {
@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
  struct _hl_event_st *st_event =
      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
-  CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
+  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
  *event = st_event;
 }
@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
  CHECK_NOTNULL(start);
  CHECK_NOTNULL(end);
-  CHECK_CUDA(
+  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
-      dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
  return time;
 }
@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
                                    << ": the parameter stream is error.";
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
+  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
 }
 void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
                                    << ": the parameter stream is error.";
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
 }
 void hl_destroy_event(hl_event_t event) {
  CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event));
+  CHECK_CUDA(cudaEventDestroy(event->cu_event));
  free(event);
  event = NULL;
@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
 void hl_event_synchronize(hl_event_t event) {
  CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event));
+  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
 }
 void hl_get_device_name(char *name, int len, int device) {
@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
  *minor = g_device[device]->minor;
 }
-int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
 const char *hl_get_device_error_string() {
-  cudaError_t err = dynload::cudaGetLastError();
+  cudaError_t err = cudaGetLastError();
-  return dynload::cudaGetErrorString(err);
+  return cudaGetErrorString(err);
 }
 const char *hl_get_device_error_string(size_t err) {
-  return dynload::cudaGetErrorString((cudaError_t)err);
+  return cudaGetErrorString((cudaError_t)err);
 }
-void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
 void hl_set_device_flags_block() {
-  CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 }
 bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
+  cudaError_t err = cudaEventQuery(event->cu_event);
  CHECK(cudaSuccess == err || cudaErrorNotReady == err);
  if (cudaErrorNotReady == err) {
@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
  return true;
 }
-void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
-void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PADDLE_USE_DSO
-#include <cuda_runtime.h>
-#include <mutex>
-#include "hl_dso_loader.h"
-/**
- * cudart wrapper: for dynamic load libcudart.so.
- * When nvcc compile cuda kernels, it will insert
- * some build-in runtime routines, which must be
- * provided by us if PADDLE_USE_DSO is true. If
- * PADDLE_USE_DSO is false, all of them must be
- * ignored to avoid multiple definitions.
- */
-namespace dynload {
-extern std::once_flag cudart_dso_flag;
-extern void *cudart_dso_handle;
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    __type operator()(Args... args) {                                          \
-      typedef __type (*cudartFunc)(Args...);                                   \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)          \
-  __macro(cudaLaunch, cudaError_t)          \
-  __macro(cudaSetupArgument, cudaError_t)   \
-  __macro(cudaConfigureCall, cudaError_t)   \
-  __macro(__cudaRegisterFatBinary, void**)  \
-  __macro(__cudaUnregisterFatBinary, void)  \
-  __macro(__cudaRegisterFunction, void)     \
-  __macro(__cudaRegisterVar, void)          \
-  __macro(__cudaRegisterManagedVar, void)   \
-  __macro(__cudaInitModule, char)           \
-  __macro(__cudaRegisterTexture, void)      \
-  __macro(__cudaRegisterSurface, void)
-// clang-format on
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-#if CUDART_VERSION >= 7000
-DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
-#endif
-#undef CUDA_ROUNTINE_EACH
-} /* namespace dynload */
-#if CUDART_VERSION >= 7000
-__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
-                                                dim3 gridDim,
-                                                dim3 blockDim,
-                                                void **args,
-                                                size_t sharedMem,
-                                                cudaStream_t stream) {
-  return dynload::cudaLaunchKernel(
-      func, gridDim, blockDim, args, sharedMem, stream);
-}
-#endif /* CUDART_VERSION >= 7000 */
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  return dynload::cudaLaunch(func);
-}
-__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                 size_t size,
-                                                 size_t offset) {
-  return dynload::cudaSetupArgument(arg, size, offset);
-}
-__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
-                                                 dim3 blockDim,
-                                                 size_t sharedMem,
-                                                 cudaStream_t stream) {
-  return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
-}
-extern "C" {
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
-  return dynload::__cudaRegisterFatBinary(fatCubin);
-}
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
-  return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
-}
-void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
-                                      const char *hostFun,
-                                      char *deviceFun,
-                                      const char *deviceName,
-                                      int thread_limit,
-                                      uint3 *tid,
-                                      uint3 *bid,
-                                      dim3 *bDim,
-                                      dim3 *gDim,
-                                      int *wSize) {
-  return dynload::__cudaRegisterFunction(fatCubinHandle,
-                                         hostFun,
-                                         deviceFun,
-                                         deviceName,
-                                         thread_limit,
-                                         tid,
-                                         bid,
-                                         bDim,
-                                         gDim,
-                                         wSize);
-}
-void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
-                                 char *hostVar,
-                                 char *deviceAddress,
-                                 const char *deviceName,
-                                 int ext,
-                                 int size,
-                                 int constant,
-                                 int global) {
-  return dynload::__cudaRegisterVar(fatCubinHandle,
-                                    hostVar,
-                                    deviceAddress,
-                                    deviceName,
-                                    ext,
-                                    size,
-                                    constant,
-                                    global);
-}
-extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
-                                               void **hostVarPtrAddress,
-                                               char *deviceAddress,
-                                               const char *deviceName,
-                                               int ext,
-                                               int size,
-                                               int constant,
-                                               int global) {
-  return dynload::__cudaRegisterManagedVar(fatCubinHandle,
-                                           hostVarPtrAddress,
-                                           deviceAddress,
-                                           deviceName,
-                                           ext,
-                                           size,
-                                           constant,
-                                           global);
-}
-char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
-  return dynload::__cudaInitModule(fatCubinHandle);
-}
-void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
-                                     const struct textureReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int norm,
-                                     int ext) {
-  return dynload::__cudaRegisterTexture(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
-}
-void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
-                                     const struct surfaceReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int ext) {
-  return dynload::__cudaRegisterSurface(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
-}
-} /* extern "C" */
-#endif
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
 DEFINE_string(cuda_dir,
              "",
              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "libcudart can not be specified by cuda_dir, since some "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
-              "build-in function in cudart already ran before main entry). "
-              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
 #endif
 }
-void GetCudartDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
-#endif
-}
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);

--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -15,17 +15,32 @@ limitations under the License. */
 #include <glog/logging.h>
 #include "BufferArg.h"
+#include "paddle/math/SparseMatrix.h"
 namespace paddle {
 const SequenceArg& BufferArg::sequence() const {
-  // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
  return dynamic_cast<const SequenceArg&>(*this);
 }
 const SparseMatrixArg& BufferArg::sparse() const {
-  // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);
  return dynamic_cast<const SparseMatrixArg&>(*this);
 }
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+  bufferType_ = TENSOR_SPARSE;
+}
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+  bufferType_ = TENSOR_SPARSE;
+}
 }  // namespace paddle
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -18,17 +18,16 @@ limitations under the License. */
 #include "TensorShape.h"
 #include "TensorType.h"
-#include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 namespace paddle {
 enum BufferType {
-  TENSOR_NORMAL = 0,
+  TENSOR_UNKNOWN = 0,
-  TENSOR_SEQUENCE_ID = 1,
+  TENSOR_NORMAL = 1,
-  TENSOR_SEQUENCE_DATA = 2,
+  TENSOR_SEQUENCE_ID = 2,
-  TENSOR_SPARSE = 3
+  TENSOR_SEQUENCE_DATA = 3,
+  TENSOR_SPARSE = 4
 };
 enum SparseDataType {
@@ -41,7 +40,6 @@ enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
 /**
 * \brief BufferArg used as the argument type of Function.
@@ -52,6 +50,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
 * 3. SequenceArg for a Buffer of sequence data.
 * 4. SparseMatrixArg for a Buffer of sparse matrix.
 *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
 * There is an ArgType property for the BufferArg used as Function Output.
 * Whether the result of the Function calculation is assigned to the
 * output Buffer or added to the output Buffer is determined by the
@@ -73,6 +76,14 @@ public:
  ArgType getArgType() const { return argType_; }
 public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr),
+        valueType_(valueType),
+        shape_(shape),
+        argType_(argType) {}
  BufferArg(void* buf,
            ValueType valueType,
            const TensorShape& shape,
@@ -88,6 +99,7 @@ public:
        valueType_(DataType<real>::value),
        shape_(2),
        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
    shape_.setDim(0, matrix.getHeight());
    shape_.setDim(1, matrix.getWidth());
  }
@@ -100,6 +112,7 @@ public:
        valueType_(DataType<real>::value),
        shape_(shape),
        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
  }
@@ -109,6 +122,7 @@ public:
        valueType_(DataType<real>::value),
        shape_(1),
        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
    shape_.setDim(0, vector.getSize());
  }
@@ -118,6 +132,7 @@ public:
        valueType_(VALUE_TYPE_INT32),
        shape_(1),
        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
    shape_.setDim(0, vector.getSize());
  }
@@ -152,6 +167,8 @@ public:
  ValueType valueType() const { return valueType_; }
  BufferType bufferType() const { return bufferType_; }
  const TensorShape& shape() const { return shape_; }
+  bool isSparse() const { return (TENSOR_SPARSE == bufferType_); }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
  const SequenceArg& sequence() const;
  const SparseMatrixArg& sparse() const;
@@ -160,8 +177,8 @@ protected:
  void* buf_;
  ValueType valueType_;
  TensorShape shape_;
-  BufferType bufferType_;
+  BufferType bufferType_{TENSOR_UNKNOWN};
-  ArgType argType_ = UNSPECIFIED;
+  ArgType argType_{UNSPECIFIED};
  // leading dimensions. The size is dims_.size()
  // Dims lds_;
 };
@@ -172,15 +189,24 @@ protected:
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    CHECK_EQ(shape_.ndims(), (size_t)1);
+    CHECK_GT(shape_[0], 1);
+    numSeqs_ = shape_[0] - 1;
+  }
  SequenceIdArg(void* buf,
                const TensorShape& shape,
                ArgType argType = UNSPECIFIED)
      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
    CHECK_EQ(shape_.ndims(), (size_t)1);
    numSeqs_ = shape_[0] - 1;
  }
  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
    numSeqs_ = shape_[0] - 1;
  }
@@ -192,26 +218,41 @@ private:
  size_t numSeqs_;
 };
-// sequence data
+// sequences data
+// For mini-batch calculate,
+// one batch can contain more than one sequence of data.
+// SequenceArg can be used to represent sequences that contain multiple
+// unequal lengths.
 class SequenceArg : public BufferArg {
 public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType), startPositions_(TensorShape()) {}
  SequenceArg(void* buf,
              ValueType valueType,
              const TensorShape& shape,
              const SequenceIdArg& startPositions,
              ArgType argType = UNSPECIFIED)
      : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {}
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
  SequenceArg(const Matrix& matrix,
              const IVector& vector,
              ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {}
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
  ~SequenceArg() {}
  void* getIdBuf() const { return startPositions_.data(); }
  size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
 private:
  SequenceIdArg startPositions_;
@@ -237,6 +278,7 @@ public:
        nnz_(nnz),
        format_(format),
        type_(type) {
+    bufferType_ = TENSOR_SPARSE;
    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
    CHECK_EQ(shape_.ndims(), (size_t)2);
    CHECK_EQ(row_.shape().ndims(), (size_t)1);
@@ -248,15 +290,9 @@ public:
    }
  }
-  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED)
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-      : BufferArg(sparse, argType),
-        row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-        col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
-  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED)
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
-      : BufferArg(sparse, argType),
-        row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-        col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
  ~SparseMatrixArg() {}

--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 #include "BufferArg.h"
 #include <gtest/gtest.h>
-#include "Function.h"
 #include "paddle/math/MemoryHandle.h"
 namespace paddle {
@@ -36,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) {
  EXPECT_EQ(buffer.numSeqs(), 9);
 }
-TEST(BufferTest, asArgument) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  VectorPtr vector = Vector::create(100, false);
-  CpuSparseMatrix sparse(200, 300, 50);
-  // prepare arguments
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  argments.addArg(*vector);
-  argments.addArg(sparse);
-  // function
-  auto function = [=](const BufferArgs& inputs) {
-    EXPECT_EQ(inputs.size(), 3);
-    // check inputs[0]
-    EXPECT_EQ(inputs[0].shape().ndims(), 2);
-    EXPECT_EQ(inputs[0].shape()[0], 100);
-    EXPECT_EQ(inputs[0].shape()[1], 200);
-    EXPECT_EQ(inputs[0].data(), matrix->getData());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
-              matrix->getHeight());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
-              matrix->getWidth());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-    // check inputs[1]
-    EXPECT_EQ(inputs[1].shape().ndims(), 1);
-    EXPECT_EQ(inputs[1].shape()[0], 100);
-    EXPECT_EQ(inputs[1].data(), vector->getData());
-    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-    // check inputs[2]
-    EXPECT_EQ(inputs[2].shape().ndims(), 2);
-    EXPECT_EQ(inputs[2].shape()[0], 200);
-    EXPECT_EQ(inputs[2].shape()[1], 300);
-    EXPECT_EQ(inputs[2].data(), sparse.getData());
-    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
-    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
-  };
-  // call function
-  function(argments);
-}
 }  // namespace paddle
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -19,12 +19,12 @@ if(WITH_TESTING)
    # TODO:
    # file(GLOB test_files . *OpTest.cpp)
    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(CrossMapNormalOpTest)
    add_simple_unittest(TensorShapeTest)
    add_simple_unittest(TensorTypeTest)
    add_simple_unittest(BufferArgTest)
    add_simple_unittest(FunctionTest)
-    # add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(ContextProjectionOpTest)
 endif()
 endif()

--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -17,7 +17,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 namespace paddle {
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ *
+ */
 template <>
 void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
                                               const CpuMatrix& input_mat,
@@ -70,10 +73,30 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 }
 /**
- * \param inputs[0] input value.
+ * Paddle Function for Context Projection Forward.
- * \param inputs[1] input weight.
+ * Calculate the output layer value sequence after context projection.
- * \param inputs[2] input sequence.
+ *
- * \param outputs[0] output value.
+ * What is Context Projection for a sequence?
+ * For example, assumed input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_lenth is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
 */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -85,28 +108,37 @@ public:
  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
+    CHECK(1 == inputs.size() || 2 == inputs.size());
    CHECK_EQ((size_t)1, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(val_seqs.shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(val_seqs.getSequenceId().shape().ndims(), (size_t)1);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    if (2 == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    }
    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
+    /// dim of input == dim of weight
+    if (2 == inputs.size()) {
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
+    }
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
-    auto out_mat = outputs[0].matrix<Device>();
+    auto out_mat = out_seq.matrix<Device>();
-    auto in_mat = inputs[0].matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
-    auto w_mat = !inputs[1].data()
+    const auto w_mat =
-                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+        (2 == inputs.size())
-                     : inputs[1].matrix<Device>();
+            ? inputs[1].matrix<Device>()
-    auto seq_vec = inputs[2].vector<int, Device>();
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
    ContextProjectionForward<Device>(out_mat,
                                     in_mat,
                                     w_mat,
@@ -122,8 +154,12 @@ private:
  size_t begin_pad_;
 };
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ *
+ */
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
                                                CpuMatrix& in_grad_mat,
                                                CpuMatrix& w_grad_mat,
                                                const CpuIVector& seq_vec,
@@ -146,7 +182,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
        int64_t pad_size =
            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
          sub->addAtOffset(*mat, j * input_dim);
        }
@@ -157,8 +194,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
        int64_t pad_size =
            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
        if (is_padding && w_grad_mat) {
-          MatrixPtr mat =
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
-              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
          MatrixPtr sub = w_grad_mat.subMatrix(
              begin_pad + context_start + j - pad_size, pad_size);
          sub->addAtOffset(*mat, j * input_dim);
@@ -169,17 +206,22 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
      if (end <= begin) continue;
      if (!in_grad_mat) continue;
      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
      src->addAtOffset(*dst, j * input_dim);
    }
  }
 }
 /**
- * \param inputs[0] input grad.
+ * Context Projection Backward Function.
- * \param inputs[1] weight grad.
+ * Update the weight gradient and input layer gradient with backprop
- * \param inputs[2] input sequence.
+ *
- * \param outputs[0] output value.
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
 */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -193,32 +235,36 @@ public:
  }
  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
+    CHECK_EQ((size_t)1, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
+    CHECK_EQ((size_t)2, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(in_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), (size_t)1);
+    CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), (size_t)1);
+    CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
-    CHECK(outputs[0].data() && inputs[2].data());
+    /// dim of input grad == dim of weight
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    /// input and output grad has the same batch_size
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
-    /// dim of input == dim of weight
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    const auto out_grad_mat = in_seq.matrix<Device>();
-    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    auto out_grad_mat = outputs[0].matrix<Device>();
    auto in_grad_mat =
-        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[0].matrix<Device>();
+                        : out_seq.matrix<Device>();
-    auto w_grad_mat = !inputs[1].data()
+    auto w_grad_mat = !outputs[1].data()
                          ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[1].matrix<Device>();
+                          : outputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
    ContextProjectionBackward<Device>(out_grad_mat,
                                      in_grad_mat,
                                      w_grad_mat,
@@ -238,11 +284,16 @@ private:
  size_t total_pad_;
 };
-#if 0
 /**
- * \param inputs[0] input grad.
+ * Context Projection Backward Data Function
- * \param inputs[1] input sequence.
+ * Update input layer grad
- * \param outputs[0] output grad.
+ * input:  sequence of output layer grad
+ * output: sequence of input layer grad
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start positon sequence, n * 1
 */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -252,32 +303,30 @@ public:
    context_start_ = config.get<int>("context_start");
  }
-  void calc(const Arguments& inputs,
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-            const Arguments& outputs,
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+        << "SequenceArg required here";
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(static_cast<int>(out_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1);
+    /// output layer grad dim == input layer grad dim * context_length_
+    CHECK_EQ(in_seq.shape().ndims(), out_seq.shape().ndims() * context_length_);
    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+    const auto out_grad_mat = in_seq.matrix<Device>();
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+    auto in_grad_mat = out_seq.matrix<Device>();
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
-    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
+    ContextProjectionBackwardData<Device>(
-                                          in_grad_mat.get(),
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
-                                          seq_vec,
-                                          context_length_,
-                                          context_start_);
  }
 private:
@@ -286,9 +335,14 @@ private:
 };
 /**
- * \param inputs[0] weight grad.
+ * Context Projection Backward Weight Function
- * \param inputs[1] input sequence.
+ * Update weight grad by backprop
- * \param outputs[0] output grad.
+ * input:  sequence of output layer grad
+ * output: weight grad
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vecotr             start positon sequence, n * 1
 */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -300,28 +354,25 @@ public:
    total_pad_ = config.get<size_t>("total_pad");
  }
-  void calc(const Arguments& inputs,
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-            const Arguments& outputs,
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
+    /// output layer grad dim == weight dim * context_length_
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
-    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
-                                            w_grad_mat.get(),
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
                                            seq_vec,
                                            context_length_,
                                            context_start_,
@@ -335,7 +386,6 @@ private:
  size_t begin_pad_;
  size_t total_pad_;
 };
-#endif
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                    CPU,
@@ -350,7 +400,6 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                    GPU,
                    ContextProjectionBackwardFunc);
-#if 0
 REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
                    GPU,
                    ContextProjectionBackwardDataFunc);
@@ -358,5 +407,4 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
                    GPU,
                    ContextProjectionBackwardWeightFunc);
 #endif
-#endif
 }  // namespace paddle
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -21,14 +21,14 @@ namespace paddle {
 /**
 * \brief   Context Projection Forward.
 *
- * \param[out]  outputs           output data.
+ * \param[in/out]  outputs           output data.
- * \param[in]   input             input data.
+ * \param[in]      input             input data.
- * \param[in]   weight            input weight.
+ * \param[in]      weight            input weight.
- * \param[in]   sequence          input data.
+ * \param[in]      sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]      context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
+ * \param[in]      context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
+ * \param[in]      begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
+ * \param[in]      is_padding        whether padding 0 or not.
 *
 */
 template <DeviceType DType>
@@ -56,7 +56,7 @@ void ContextProjectionForward(
 */
 template <DeviceType DType>
 void ContextProjectionBackward(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
    typename Tensor<real, DType>::Matrix& in_grad,
    typename Tensor<real, DType>::Matrix& w_grad,
    const typename Tensor<int, DType>::Vector& seq_vec,
@@ -68,7 +68,7 @@ void ContextProjectionBackward(
 template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
    typename Tensor<real, DType>::Matrix& in_grad,
    const typename Tensor<int, DType>::Vector& sequence,
    size_t context_length,
@@ -76,7 +76,7 @@ void ContextProjectionBackwardData(
 template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
    typename Tensor<real, DType>::Matrix& w_grad,
    const typename Tensor<int, DType>::Vector& seq_vec,
    size_t context_length,

--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
                                begin_pad);
 }
-__global__ void KeContextProjectionBackwardData(real* out_grad,
+__global__ void KeContextProjectionBackwardData(const real* out_grad,
                                                const int* sequence,
                                                real* in_grad,
-                                                int input_dim,
+                                                size_t input_dim,
                                                int context_length,
                                                int context_start) {
  int idx = threadIdx.x;
@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
  real value = 0;
  int instances = seq_end - seq_start + context_length - 1;
-  out_grad += seq_start * input_dim * context_length;
+  auto out = const_cast<real*>(out_grad);
+  out += seq_start * input_dim * context_length;
  in_grad += seq_start * input_dim;
  for (int k = 0; k <= input_dim / block_size; k++) {
    if (idx < input_dim) {
@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
        int outx = (i - context_length) < 0 ? i : (context_length - 1);
        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
        real* output_r =
-          out_grad + outy * input_dim * context_length + outx * input_dim;
+          out + outy * input_dim * context_length + outx * input_dim;
        for (int j = outy; j < seq_end - seq_start; j++) {
          value += output_r[idx];
          if (j - outy == outx) break;
@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
 * @param[in]   context_start    context start.
 *
 */
-void hl_context_projection_backward_data(real* out_grad,
+void hl_context_projection_backward_data(const real* out_grad,
                                         const int* sequence,
                                         real* input_grad,
                                         size_t num_sequences,
@@ -216,7 +217,7 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                    GpuMatrix& in_grad,
                                                    const GpuIVector& sequence,
                                                    size_t context_length,
@@ -231,7 +232,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
 }
 template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                  const int* sequence,
                                                  real* w_grad,
                                                  int num_sequences,
@@ -254,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
      int seq_start = sequence[seqId];
      int seq_end = sequence[seqId+1];
-      output_r = out_grad + seq_start * w_dim * context_length;
+      output_r = const_cast<real*>(out_grad)
+                    + seq_start * w_dim * context_length;
      if (context_start < 0) {
        if (padId + context_start < 0) {
@@ -318,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
 * beginning.
 *
 */
-void hl_context_projection_backward_weight(real* out_grad,
+void hl_context_projection_backward_weight(const real* out_grad,
                                           const int* sequence,
                                           real* w_grad,
                                           size_t num_sequences,
@@ -346,7 +348,7 @@ void hl_context_projection_backward_weight(real* out_grad,
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix& out_grad,
+        const GpuMatrix& out_grad,
        GpuMatrix& w_grad,
        const GpuIVector& seq_vec,
        size_t context_length,
@@ -365,7 +367,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                GpuMatrix& in_grad,
                                                GpuMatrix& w_grad,
                                                const GpuIVector& sequence,

--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -56,22 +56,25 @@ void testMatrixProjectionForward(int context_start,
  cpu_out.randomizeUniform();
  gpu_out.copyFrom(cpu_out);
-  compare.getCpuFunction()->calc(
+  BufferArgs cpu_inputs;
-      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+  BufferArgs cpu_outputs;
-       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+  cpu_inputs.addArg(cpu_in, *cpu_seq);
-              Dims{pad, input_dim}),
+  if (cpu_weight) {
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+    cpu_inputs.addArg(*cpu_weight, *cpu_seq);
-              Dims{cpu_seq->getSize()})},
+  }
-      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+  cpu_outputs.addArg(cpu_out, *cpu_seq, ADD_TO);
-      {});
-  compare.getGpuFunction()->calc(
+  compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
-      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+  BufferArgs gpu_inputs;
-              Dims{pad, input_dim}),
+  BufferArgs gpu_outputs;
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+  gpu_inputs.addArg(gpu_in, *gpu_seq);
-              Dims{gpu_seq->getSize()})},
+  if (gpu_weight) {
-      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+    gpu_inputs.addArg(*gpu_weight, *gpu_seq);
-      {});
+  }
+  gpu_outputs.addArg(gpu_out, *gpu_seq, ADD_TO);
+  compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
  autotest::TensorCheckEqual(cpu_out, gpu_out);
 }
@@ -117,25 +120,23 @@ void testMatrixProjectionBackward(int context_start,
    gpu_w_grad->copyFrom(*cpu_w_grad);
  }
-  compare.getCpuFunction()->calc(
+  BufferArgs cpu_inputs;
-      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
+  BufferArgs cpu_outputs;
-       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
+  cpu_inputs.addArg(cpu_out_grad, *cpu_seq);
-              Dims{pad, input_dim}),
+  cpu_outputs.addArg(cpu_in_grad, *cpu_seq, ADD_TO);
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+  cpu_outputs.addArg(
-              Dims{cpu_seq->getSize()})},
+      cpu_w_grad ? *cpu_w_grad : CpuMatrix(nullptr, 0, input_dim), ADD_TO);
-      {Tensor(cpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
+  compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
-      {});
+  BufferArgs gpu_inputs;
-  compare.getGpuFunction()->calc(
+  BufferArgs gpu_outputs;
-      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
+  gpu_inputs.addArg(gpu_out_grad, *gpu_seq);
-       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
+  gpu_outputs.addArg(gpu_in_grad, *gpu_seq, ADD_TO);
-              Dims{pad, input_dim}),
+  gpu_outputs.addArg(
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+      gpu_w_grad ? *gpu_w_grad : GpuMatrix(nullptr, 0, input_dim), ADD_TO);
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out_grad.getData(),
+  compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
-              Dims{batch_size, input_dim * context_length})},
-      {});
  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
  if (is_padding) {

--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -188,8 +188,13 @@ public:
    CHECK(inputs[0].shape() == inputs[3].shape());
    CHECK(inputs[0].shape() == outputs[0].shape());
-    // TODO(hedaoyuan): need support ASSIGN_TO mode.
+    if (outputs[0].getArgType() != ADD_TO) {
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
    size_t samples = inputs[0].shape()[0];
    size_t channels = inputs[0].shape()[1];

--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -27,15 +27,19 @@ TEST(CrossMapNormal, real) {
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
-            FunctionCompare compare("CrossMapNormal",
+            // init Test object
-                                    FuncConfig()
+            FunctionCompare test("CrossMapNormal",
-                                        .set("size", size)
+                                 FuncConfig()
-                                        .set("scale", (real)1.5)
+                                     .set("size", size)
-                                        .set("pow", (real)0.5));
+                                     .set("scale", (real)1.5)
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
+                                     .set("pow", (real)0.5));
-            compare.cmpWithArg({Tensor(nullptr, dims)},
+            // prepare input arguments
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-                               {});
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
          }
        }
      }
@@ -53,18 +57,19 @@ TEST(CrossMapNormalGrad, real) {
                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                    << " size=" << size;
-            FunctionCompare compare("CrossMapNormalGrad",
+            FunctionCompare test("CrossMapNormalGrad",
-                                    FuncConfig()
+                                 FuncConfig()
-                                        .set("size", size)
+                                     .set("size", size)
-                                        .set("scale", (real)1.5)
+                                     .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
+                                     .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                                Tensor(nullptr, dims),
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                                Tensor(nullptr, dims),
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                                Tensor(nullptr, dims)},
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                               {Tensor(nullptr, dims)},
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
-                               {});
+            // run Function
+            test.run();
          }
        }
      }

--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -79,15 +79,25 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
 void BufferArgs::addArg(const Matrix& arg,
                        const TensorShape& shape,
                        ArgType argType) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
+  _args_.push_back(new BufferArg(arg, shape, argType));
+  addArg(*_args_.back());
 }
 void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
+}
+void BufferArgs::addArg(const Matrix& matrix,
+                        const IVector& vector,
+                        ArgType argType) {
+  _args_.push_back(new SequenceArg(matrix, vector, argType));
+  addArg(*_args_.back());
 }
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;

--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -50,19 +50,44 @@ protected:
 * Argument type for Function::calc().
 * A BufferArgs contains a set of BufferArg,
 * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with Matix object used to adapt Layer Argument.
+ * Will create a BufferArg object in addArg(),
+ * and free in destructor of BufferArgs.
+ *
+ * addArg() with BufferArg object, just save BufferArg object address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * in the BufferArgs life time.
 */
 class BufferArgs {
 public:
  BufferArgs() {}
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
  size_t size() const { return args_.size(); }
  // add argument into BufferArgs
  // Tensor can be Matrix, Vector, IVector.
  // For inputs, do not need argType.
  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  template <typename Tensor>
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
-  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
-    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+    addArg(*_args_.back());
+  }
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
  }
  // Add arg into BufferArgs and reshape the arg.
@@ -77,20 +102,37 @@ public:
  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
  // get argument
  const BufferArg& operator[](size_t num) const {
    CHECK_LT(num, args_.size());
    return *args_[num];
  }
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
 private:
-  std::vector<BufferArgPtr> args_;
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
 };
 /**
 * \brief Base class for Function.
 * The basic Function implementation requires override init and calc interfaces.
 *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
 * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
 * and ADD_TO.
 * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation

--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "Function.h"
 #include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
 namespace paddle {
@@ -56,4 +57,110 @@ TEST(Function, BufferArgs) {
  Function<DEVICE_TYPE_GPU>(gpuArgments);
 }
+/**
+ * Some tests case are used to check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  EXPECT_EQ(inputs.size(), check.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);
+  }
+}
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  EXPECT_EQ(inputs.size(), 1);
+  check(inputs[0]);
+}
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.data(), matrix->getData());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.data(), vector->getData());
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[2], 3);
+  };
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
 }  // namespace paddle
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -15,95 +15,186 @@ limitations under the License. */
 #include "Function.h"
 #include "paddle/math/Vector.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 namespace paddle {
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // Here the input and output can not contain real data,
+ *  // only contains the argument type and shape.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // Will according to the type and shape of arguments(inputs_/outputs_),
+ *  // automatic initialization cpu and gpu function required arguments
+ *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_).
+ *  // Call the CPU and GPU Function calculation results.
+ *  // Compares CPU and GPU calculation results for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
  FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
+    cpuFunc_->init(config);
-    gpu->init(config);
+    gpuFunc_->init(config);
+  }
+  ~FunctionCompare() {}
+  // input need only contains shape, do not contains data.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+  // output need only contains shape, do not contains data.
+  void addOutputs(const BufferArg& output) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
  }
-  void cmpWithArg(const Arguments& inputs,
+  void addInputs(const SequenceArg& input) {
-                  const Arguments& outputs,
+    size_t batchSize = input.shape()[0];
-                  const Arguments& inouts) {
+    size_t numSeqs = batchSize / 10 + 1;
-    // init cpu and gpu arguments
-    auto initArgs = [=](
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
-      for (const auto arg : inArgs) {
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
-        size_t size = sizeof(real);
-        for (const auto dim : arg.dims_) {
+    TensorShape seqsId({numSeqs + 1});
-          size *= dim;
+    // void* cpuBuffer = cpuMemory_.back()->getBuf();
-        }
+    // void* gpuBuffer = gpuMemory_.back()->getBuf();
-        if (arg.getData()) {
-          // todo(tianbing), waste unnecessary mem here
+    size_t size =
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+        input.shape().getElements() * sizeOfValuType(input.valueType());
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          // already init outside
+    // TODO: need be implemented.
-        } else {
+  }
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+  void run() {
-          cpuArgs.emplace_back(
+    // prepare cpu/gpu arguments
-              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
+    initInputs();
-          gpuArgs.emplace_back(
-              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+    // function calculate
-          // will use an api to refactor this code.
+    auto callFunction = [](FunctionBase* function,
-          CpuVector cpuVector(size / sizeof(real),
+                           std::vector<BufferArgPtr>& inputs,
-                              (real*)cpuArgs.back().getData());
+                           std::vector<BufferArgPtr>& outputs) {
-          GpuVector gpuVector(size / sizeof(real),
+      BufferArgs inArgs;
-                              (real*)gpuArgs.back().getData());
+      BufferArgs outArgs;
-          cpuVector.uniform(0.001, 1);
+      for (auto arg : inputs) {
-          gpuVector.copyFrom(cpuVector);
+        inArgs.addArg(*arg);
-        }
      }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
+      }
+      function->calc(inArgs, outArgs);
    };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
-    // function calculate
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
    // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
+    compareOutputs();
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
+  }
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
-          size *= dim;
-        }
+protected:
-        CpuVector cpuVector(size, (real*)cpu.getData());
+  void initInputs() {
-        GpuVector gpuVector(size, (real*)gpu.getData());
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      initArg(*cpuInputs_[i]);
-        autotest::TensorCheckErr(cpuVector, gpuVector);
-      }
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
-    };
+      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
-    checkArgs(cpuOutputs, gpuOutputs);
+                          (real*)cpuInputs_[i]->data());
-    checkArgs(cpuInouts, gpuInouts);
+      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
+                          (real*)gpuInputs_[i]->data());
+      gpuVector.copyFrom(cpuVector);
+    }
+  }
+  void compareOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      auto cpu = cpuOutputs_[i];
+      auto gpu = gpuOutputs_[i];
+      CpuVector cpuVector(cpu->shape().getElements(), (real*)cpu->data());
+      GpuVector gpuVector(cpu->shape().getElements(), (real*)gpu->data());
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
  }
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  // only init cpu argument, gpu argument copy from cpu argument.
+  void initArg(BufferArg& arg) {
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = uniformRandom(
+                    std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
+                1;
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
 protected:
-  std::shared_ptr<FunctionBase> cpu;
+  std::shared_ptr<FunctionBase> cpuFunc_;
-  std::shared_ptr<FunctionBase> gpu;
+  std::shared_ptr<FunctionBase> gpuFunc_;
-  std::vector<CpuMemHandlePtr> cpuMemory;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
-  std::vector<GpuMemHandlePtr> gpuMemory;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
-  Arguments cpuInputs;
+  std::vector<BufferArgPtr> cpuInputs_;
-  Arguments cpuOutputs;
+  std::vector<BufferArgPtr> cpuOutputs_;
-  Arguments cpuInouts;
+  std::vector<BufferArgPtr> gpuInputs_;
-  Arguments gpuInputs;
+  std::vector<BufferArgPtr> gpuOutputs_;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -118,16 +118,15 @@ void ContextProjection::forward() {
  /// first use state_, otherwise use weight_(padding false === w nullptr)
  auto w_ptr =
      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  auto start_pos = in_->sequenceStartPositions;
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
  BufferArgs inputs;
  BufferArgs outputs;
-  inputs.addArg(*in_->value);
+  inputs.addArg(*in_->value, *start_pos);
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+  if (w_ptr) {
-                          w_ptr ? w_ptr->getHeight() : 0,
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
-                          input_dim));
+                  *start_pos);
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  }
-  outputs.addArg(*out_->value, ADD_TO);
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
  forward_[0]->calc(inputs, outputs);
  if (state_ && config_.context_start() < 0) {
@@ -166,13 +165,16 @@ void ContextProjection::backward(const UpdateCallback& callback) {
  BufferArgs inputs;
  BufferArgs outputs;
-  inputs.addArg(CpuMatrix(
+  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
-      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
+  outputs.addArg(
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+      CpuMatrix(
-                          w_ptr ? w_ptr->getHeight() : 0,
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
-                          input_dim));
+      *in_->sequenceStartPositions->getVector(useGpu_),
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+      ADD_TO);
-  outputs.addArg(*out_->grad, ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
  backward_[0]->calc(inputs, outputs);
  if (config_.trainable_padding()) {

--- a/paddle/math/RowBuffer.h
+++ b/paddle/math/RowBuffer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <vector>
+#include "MemoryHandle.h"
+#include "paddle/utils/Util.h"
+namespace paddle {
+/**
+ * @brief The RowBuffer class
+ * Represent the SparseRow Matrix Data.
+ *
+ * If not set memory handler, then the data could be auto growth.
+ */
+class RowBuffer {
+public:
+  /**
+   * @brief RowBuffer create a auto-growth row buffer. The row length is width.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  explicit RowBuffer(size_t width) : width_(width) {}
+  /**
+   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
+   * @param mem the pre-allocated memory.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+  /**
+   * @brief resize resize the buffer with rowCount
+   * @param rowCnt number of row. matrix height.
+   */
+  inline void resize(int rowCnt) {
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.resize(rowCnt * width_);
+    }
+  }
+  /**
+   * @brief get a row buffer with row index.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline real* get(int row) const {
+    if (preallocatedBuf_) {
+      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return const_cast<real*>(rowStore_.data() + row * width_);
+    }
+  }
+  /**
+   * @brief get a row buffer with row index. If row index is larger than local
+   *        buffer, the size of local buffer will grow.
+   * @param row the index of row.
+   * @return row buffer.
+   */
+  inline real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      if ((rowStore_.size() <= row * width_)) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+  /**
+   * @return raw data buffer.
+   */
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+  /**
+   * @brief clear local buffer. It only affect auto-growth buffer.
+   */
+  inline void clear() { rowStore_.clear(); }
+  /**
+   * @brief get current number of rows.
+   * @return number of rows.
+   */
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+  /**
+   * @brief get is this buffer can automatically grow or not.
+   * @return ture if can automacitally grow.
+   */
+  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
+  /**
+   * @brief return the width of matrix. a.k.a length of row.
+   * @return width of matrix
+   */
+  inline size_t getWidth() const { return width_; }
+private:
+  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
+  //! of std::vector here.
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+}  // namespace paddle
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -18,6 +18,7 @@ limitations under the License. */
 #include <string.h>
 #include <algorithm>
 #include "Matrix.h"
+#include "RowBuffer.h"
 #include "paddle/utils/Util.h"
 DECLARE_bool(allow_inefficient_sparse_update);
@@ -45,12 +46,9 @@ public:
                     IndexDictPtr indexDictHandle = nullptr,
                     bool trans = false)
      : CpuMatrix(nullptr, height, width, trans),
-        storeMat_(dataHandle,
-                  dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0,
-                  width,
-                  trans),
        indexDictHandle_(indexDictHandle) {
    init(height, width);
+    buf_.reset(new RowBuffer(dataHandle, width));
  }
  virtual ~SparseRowCpuMatrix() {}
@@ -71,25 +69,16 @@ public:
   *
   *  @param row row id in local storage
   */
-  real* getLocalRow(size_t row) {
+  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
-    if (storeMat_.getData()) return storeMat_.rowBuf(row);
-    if (rowStore_.size() <= row * width_) {
-      rowStore_.resize((row + 1) * width_);
-    }
-    return rowStore_.data() + row * width_;
-  }
  /**
-   *  reserve the storage for rows according to current size of indexDictHandle.
+   *  reserve the storage for rows according to current size of
+   * indexDictHandle.
   *
   *  This is only used when SparseRowCpuMatrix is constructed with
   *  indexDictHandle.
   */
-  void reserveStore() {
+  void reserveStore() { buf_->resize(localIndices_->size()); }
-    if (!storeMat_.getData() && !localIndices_->empty()) {
-      rowStore_.resize(localIndices_->size() * width_);
-    }
-  }
  // row is the row id in the original matrix
  virtual real* getRowBuf(size_t row) { return getRow(row); }
@@ -117,7 +106,8 @@ public:
   *
   * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
   *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter matrix,
+   * t0 is a int vector used by L1/L2 decay, size = height of parameter
+   * matrix,
   * store the time that each weight row last updated.
   *
   * Time is batchId, currentTime is current batchId.
@@ -176,8 +166,7 @@ public:
 protected:
  template <typename Func>
  void apply(Func f) {
-    real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data();
+    f(buf_->data(), localIndices_->size() * width_);
-    f(data, localIndices_->size() * width_);
  }
  void init(size_t height, size_t width);
@@ -188,25 +177,24 @@ protected:
      globalIndices_[id] = kUnusedId_;
    }
    localIndices_->clear();
-    rowStore_.clear();
+    buf_->clear();
  }
  inline void checkStoreSize() {
-    if (storeMat_.getData()) {
+    if (buf_->isAutoGrowth()) {
-      CHECK_LE(localIndices_->size(), storeMat_.getHeight());
+      if (buf_->getRowCount() > 0.5 * height_) {
-    } else if (!FLAGS_allow_inefficient_sparse_update) {
-      if (localIndices_->size() > 0.5 * height_) {
        LOG(WARNING)
            << "There are more than 0.5*height (" << localIndices_->size()
            << ") rows are used for sparse "
            << "update, which is not efficient. Considering not use "
            << "sparse_update or set --allow_inefficient_sparse_update=true";
      }
+    } else {
+      CHECK_LE(localIndices_->size(), buf_->getRowCount());
    }
  }
-  CpuMatrix storeMat_;
+  std::unique_ptr<RowBuffer> buf_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
  IndexDictPtr indexDictHandle_;
  std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
  unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();

--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
 add_simple_unittest(test_SparseMatrix)
+add_simple_unittest(test_RowBuffer)
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare

--- a/paddle/math/tests/test_RowBuffer.cpp
+++ b/paddle/math/tests/test_RowBuffer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "paddle/math/RowBuffer.h"
+TEST(RowBuffer, testAutoGrow) {
+  paddle::RowBuffer buf(128);
+  ASSERT_EQ(128, buf.getWidth());
+  ASSERT_TRUE(buf.isAutoGrowth());
+  buf.resize(2);
+  ASSERT_EQ(2, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  auto data = buf.getWithAutoGrowth(2);
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    data[i] = i;
+  }
+  ASSERT_EQ(3, buf.getRowCount());
+  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
+  }
+}
+TEST(RowBuffer, testWithMemBuf) {
+  paddle::CpuMemHandlePtr mem =
+      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
+  paddle::RowBuffer buf(mem, 128);
+  ASSERT_TRUE(!buf.isAutoGrowth());
+  ASSERT_EQ(2, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
+    }
+  }
+  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
+}
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -24,13 +24,15 @@ set(PSERVER_SOURCES
    BaseClient.cpp
    ParameterClient2.cpp
    ParameterServer2.cpp
-    SparseParameterDistribution.cpp)
+    SparseParameterDistribution.cpp
+    ParameterServerController.cpp)
 set(PSERVER_HEADERS
    BaseClient.h
    ParameterClient2.h
    ParameterServer2.h
-    SparseParameterDistribution.h)
+    SparseParameterDistribution.h
+    ParameterServerController.h)
 add_library(paddle_pserver STATIC
    ${PSERVER_SOURCES})

--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
-#include "paddle/utils/StringUtil.h"
+#include "ParameterServerController.h"
-#include "paddle/utils/Util.h"
-#include "ParameterServer2.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/Flags.h"
 using namespace paddle;  // NOLINT
 int main(int argc, char** argv) {
  initMain(argc, argv);
-  std::vector<std::string> devices;
+  std::unique_ptr<ParameterServerController> parameterServerPtr(
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
+      paddle::ParameterServerController::createFromGflags());
+  parameterServerPtr->start();
-  // round robin to loadbalance RDMA server ENGINE
+  parameterServerPtr->wait();
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-  if (FLAGS_nics.empty()) {
-    pservers.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (FLAGS_rdma_tcp == "rdma") {
-        pservers[i].reset(
-            new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      }
-      CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                 << FLAGS_port + i;
-      LOG(INFO) << "pserver started : " << FLAGS_port + i;
-      pservers[i]->start();
-    }
-  } else {
-    str::split(FLAGS_nics, ',', &devices);
-    pservers.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-        }
-        CHECK(pservers[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server" << devices[j]
-            << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << devices[j] << ":"
-                  << FLAGS_port + i;
-        pservers[i * devices.size() + j]->start();
-      }
-    }
-  }
-  for (auto& pserver : pservers) {
-    pserver->join();
-  }
  return 0;
 }
--- a/paddle/pserver/ParameterServerController.cpp
+++ b/paddle/pserver/ParameterServerController.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ParameterServerController.h"
+namespace paddle {
+ParameterServerController::ParameterServerController(
+    const ParameterServerConfig& config) {
+  // round robin to load balance RDMA server ENGINE
+  std::vector<std::string> devices;
+  int rdmaCpu = 0;
+  int onlineCpus = rdma::numCpus();
+  int numPorts = config.ports_num() + config.ports_num_for_sparse();
+  if (config.nics().empty()) {
+    parameterServers_.resize(numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      if (config.rdma_tcp() == "rdma") {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
+        rdmaCpu = rdmaCpu % onlineCpus;
+      } else {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i));
+      }
+      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
+                                             "server on port "
+                                          << config.port() + i;
+    }
+  } else {
+    str::split(config.nics(), ',', &devices);
+    parameterServers_.resize(devices.size() * numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      for (size_t j = 0; j < devices.size(); ++j) {
+        if (config.rdma_tcp() == "rdma") {
+          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
+              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
+          rdmaCpu = rdmaCpu % onlineCpus;
+        } else {
+          parameterServers_[i * devices.size() + j].reset(
+              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
+        }
+        CHECK(parameterServers_[i * devices.size() + j]->init())
+            << "Fail to initialize parameter server with device " << devices[j]
+            << config.port() + i;
+      }
+    }
+  }
+}
+ParameterServerController::~ParameterServerController() { this->wait(); }
+ParameterServerController* ParameterServerController::createFromGflags() {
+  ParameterServerConfig config;
+  config.set_nics(FLAGS_nics);
+  config.set_rdma_tcp(FLAGS_rdma_tcp);
+  config.set_port(FLAGS_port);
+  config.set_ports_num(FLAGS_ports_num);
+  config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
+  return create(config);
+}
+ParameterServerController* ParameterServerController::create(
+    const ParameterServerConfig& config) {
+  return new ParameterServerController(config);
+}
+void ParameterServerController::start() {
+  LOG(INFO) << "number of parameterServer instances: "
+            << parameterServers_.size();
+  int i = 0;
+  for (const auto& parameterServer : parameterServers_) {
+    LOG(INFO) << "Starting parameterServer[" << i << "]";
+    parameterServer->start();
+    i++;
+  }
+}
+void ParameterServerController::wait() {
+  int i = 0;
+  for (const auto& parameterServer : parameterServers_) {
+    LOG(INFO) << "Waiting parameterServer[" << i << "]";
+    parameterServer->join();
+    i++;
+  }
+}
+}  // namespace paddle
--- a/paddle/pserver/ParameterServerController.h
+++ b/paddle/pserver/ParameterServerController.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ParameterServer2.h"
+#include "ParameterServerConfig.pb.h"
+#include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+namespace paddle {
+/**
+ * @brief ParameterServerController is used for create, init and manage multi
+ * parameter server instances. The num of the instances is decided by port
+ * num(the ports number for parameter send) and network devices configured
+ * by gflags or proto.
+ */
+class ParameterServerController final {
+public:
+  DISABLE_COPY(ParameterServerController);
+  /**
+   * @brief Ctor, Create a ParameterServerController from ParameterServerConfig.
+   */
+  explicit ParameterServerController(const ParameterServerConfig& config);
+  /**
+   * @brief Dtor.
+   */
+  ~ParameterServerController();
+  /**
+   * @brief create ParameterServerController from gflags, this is used for
+   * compatibility with the old usage of configuration by gflags.
+   */
+  static ParameterServerController* createFromGflags();
+  /**
+   * @brief create ParameterServerController with ParameterServerConfig, remove
+   * gflags from ParameterServer. Init all ParameterServer2 instances according
+   * to
+   * the config.
+   */
+  static ParameterServerController* create(const ParameterServerConfig& config);
+  /**
+   * @brief start all ParameterServer2 instances in this
+   * ParameterServerController.
+   */
+  void start();
+  /**
+   * @brief join and wait for all ParameterServer2 instances thread in this
+   * ParameterServerController.
+   */
+  void wait();
+private:
+  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
+};
+}  // namespace paddle
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -34,6 +34,10 @@ class IScanner(object):
 class DenseScanner(IScanner):
+    """
+    :type __mat__: numpy.ndarray
+    """
    def __init__(self, input_type, pos):
        IScanner.__init__(self, input_type, pos)
        self.__mat__ = None
@@ -47,6 +51,8 @@ class DenseScanner(IScanner):
    def finish_scan(self, argument):
        assert isinstance(argument, swig_paddle.Arguments)
        assert isinstance(self.input_type, dp2.InputType)
+        if self.__mat__.dtype != numpy.float32:
+            self.__mat__ = self.__mat__.astype(numpy.float32)
        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
        argument.setSlotValue(self.pos, m)

--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in
    submit_local.sh
    @ONLY)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
        RENAME paddle)
+configure_file(tools/usage_stat/usage.sh
+    usage.sh
+    @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin
+        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
+        RENAME paddle_usage)
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -4,28 +4,32 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
+RUN apt-get update && \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
-    python-protobuf python-numpy python-dev swig openssh-server \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
+    apt-get install -y automake clang-3.8 llvm-3.8 libclang-3.8-dev && \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    apt-get clean -y
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
+RUN pip install --upgrade pip && \ 
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
+    pip install 'protobuf==3.1.0.post1' && \
-    sphinx sphinx_rtd_theme recommonmark jupyter
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx_rtd_theme recommonmark jupyter
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j4 && make install && \
+    cd .. && rm -rf cmake-3.4.1
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 ENV WITH_GPU=OFF
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 RUN mkdir /paddle

--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -4,28 +4,32 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
+RUN apt-get update && \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
-    python-protobuf python-numpy python-dev swig openssh-server \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
+    apt-get install -y automake clang-3.8 llvm-3.8 libclang-3.8-dev && \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    apt-get clean -y
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
+RUN pip install --upgrade pip && \ 
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
+    pip install 'protobuf==3.1.0.post1' && \
-    sphinx sphinx_rtd_theme recommonmark jupyter
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx_rtd_theme recommonmark jupyter
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j4 && make install && \
+    cd .. && rm -rf cmake-3.4.1
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 ENV WITH_GPU=ON
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 RUN mkdir /paddle

--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -122,6 +122,9 @@ case "$1" in
    "make_diagram")
        python -m paddle.utils.make_model_diagram ${@:2}
        ;;
+    "usage")
+        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        ;;
    "version")
        version
        ;;

--- a/paddle/scripts/tools/usage_stat/usage.sh
+++ b/paddle/scripts/tools/usage_stat/usage.sh
+#!/bin/bash
+ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
+KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
+# paddle config home dir, same as paddle
+PADDLE_CONF_HOME="$HOME/.config/paddle"
+# api url, mirror url(s) will be append later
+PD_URLS="http://api.paddlepaddle.org/version"
+usage()
+{
+    echo "Usage: `basename $0` [options]"
+    echo "Options:"
+    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
+    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
+    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
+    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
+    echo "  -v, -i                            Verbose output and interact with user when necessary"
+    echo " --help                             display this help message"
+}
+eval set -- "${ARGPARSE}"
+while true; do
+    case "$1" in
+        -l|--log-file)
+            log_file=$2
+            shift 2
+            ;;
+        -e|--exit-code)
+            exit_code=$2
+            shift 2
+            ;;
+        -u|--git-user)
+            github_user=$2
+            shift 2
+            ;;
+        -n|--task-name)
+            task=$2
+            shift 2
+            ;;
+        -v|-i)
+            v=1
+            shift
+            ;;
+        --dry-run)
+            dry_run=1
+            shift
+            ;;
+        --)
+            shift
+            break
+            ;;
+        --help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Invalid option $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+# parse the log_file to get the time costs
+if [ -s "${log_file}" ]; then
+    duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
+    {if(index($2,":")==3){
+        t=substr($2,1,8);
+        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
+        if(sec<last_sec-600){day+=1;sec+=86400;}
+        last_sec=sec;
+        if(min_sec==0 || min_sec>sec){min_sec=sec;}
+        if(max_sec==0 || max_sec<sec){max_sec=sec;}
+    }}
+    END{print max_sec-min_sec}' ${log_file}`
+else
+    duration=-1
+fi
+if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
+# try find the user/email if not given
+if [ -z "${github_user}" ]; then
+    # search for cached username
+    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
+        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
+        github_user=`cat ${PADDLE_CONF_HOME}/github_user`
+    else
+        # search the github-user from git config
+        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
+        git_username=`git config --get user.name 2>/dev/null`
+        git_url=`git config --get remote.origin.url 2>/dev/null`
+        if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
+            # under a git url, like https://github.com/user_xxx/proj_yyy.git
+            if [ "${v}" = "1" ]; then echo " from github url..."; fi
+            github_user=`echo ${git_url} | cut -d "/" -f 4`
+            if [ "${github_user}" = "PaddlePaddle" ]; then
+                github_user=
+            fi
+        fi
+        if [ -n "${git_username}" -a -z "${github_user}" ]; then
+            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
+            github_user=${git_username}
+        fi
+    fi
+fi
+# allow user to set the user name, if it's not found
+if [ -z "${github_user}" -a "${v}" = "1" ]; then
+    read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
+    github_user=${REPLY}
+    if [ -z "${github_user}" ]; then
+        # empty input, consider as one anonymous user
+        github_user="${KEEP_ANONYMOUS}"
+    fi
+fi
+if [ -n "${github_user}" -a -z "${dry_run}" ]; then
+    # valid user and not in dry-run mode, then save to cache
+    mkdir -p ${PADDLE_CONF_HOME}
+    echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
+fi
+if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
+if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
+    # anonymous user should keep the var empty.
+    github_user=
+fi
+# read local paddle version
+paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
+if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
+# read local system time
+system_time=`date "+%Y%m%d%H%M%S"`
+if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
+# make empty job_name as default value.
+if [ -z "${task}" ]; then
+    task="(unknown_task)"
+fi
+if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
+# concat the curl command
+params="content={\"data_type\":\"usage\",\
+\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
+\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
+\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
+}&type=1"
+curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
+ -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
+if [ "${dry_run}" = "1" ]; then
+    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
+    echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
+    exit 0
+else
+    for u in ${PD_URLS}; do
+        curl_cmd="${curl_cmd_prefix} ${u}"
+        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
+        ${curl_cmd} >/dev/null 2>&1
+        if [ $? -eq 0 ]; then
+            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
+            exit 0
+        else
+            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
+        fi
+    done
+    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
+    exit 1
+fi
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
 #!/bin/bash
 brew update
 brew tap homebrew/science
-brew install python
+brew install openblas swig md5sha1sum
-sudo pip install --upgrade protobuf
-brew install swig openblas md5sha1sum protobuf
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -6,7 +6,7 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
  export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
  export PYTHONHOME=/opt/python/2.7.12
  export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+  cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
  NRPOC=`nproc`
  make -j $NPROC
  make coveralls

--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -4,7 +4,7 @@
 source ./common.sh
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 # check websites for broken links

--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/pserver/ParameterServer2.h"
+#include <fenv.h>
-#include "paddle/utils/Common.h"
+#include "paddle/pserver/ParameterServerController.h"
 #include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/StringUtil.h"
 #include "ParamUtil.h"
 #include "Trainer.h"
-#include "paddle/pserver/RDMANetwork.h"
 DEFINE_bool(start_pserver, false, "Whether to start pserver");
 DECLARE_int32(gpu_id);
@@ -38,54 +36,11 @@ int main(int argc, char** argv) {
  initMain(argc, argv);
  initPython(argc, argv);
-  std::vector<std::unique_ptr<ParameterServer2>> pservers;
+  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
-  std::vector<std::string> devices;
  if (FLAGS_start_pserver) {
-    // round robin to loadbalance RDMA server ENGINE
+    parameterServerPtr.reset(
-    int rdmaCpu = 0;
+        paddle::ParameterServerController::createFromGflags());
-    int onlineCpus = rdma::numCpus();
+    parameterServerPtr->start();
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    if (FLAGS_nics.empty()) {
-      pservers.resize(numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i));
-        }
-        CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                   << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << FLAGS_port + i;
-        pservers[i]->start();
-      }
-    } else {
-      str::split(FLAGS_nics, ',', &devices);
-      pservers.resize(devices.size() * numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        for (size_t j = 0; j < devices.size(); ++j) {
-          if (FLAGS_rdma_tcp == "rdma") {
-            pservers[i * devices.size() + j].reset(new ParameterServer2(
-                getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-            rdmaCpu = rdmaCpu % onlineCpus;
-          } else {
-            pservers[i * devices.size() + j].reset(
-                new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-          }
-          CHECK(pservers[i * devices.size() + j]->init())
-              << "Fail to initialize parameter server" << devices[j]
-              << FLAGS_port + i;
-          LOG(INFO) << "pserver started : " << devices[j] << ":"
-                    << FLAGS_port + i;
-          pservers[i * devices.size() + j]->start();
-        }
-      }
-    }
  }
  Trainer trainer;
  auto config = TrainerConfigHelper::createFromFlags();

--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -4,7 +4,8 @@ set(proto_filenames
    ModelConfig.proto
    ParameterConfig.proto
    ParameterService.proto
-    TrainerConfig.proto)
+    TrainerConfig.proto
+    ParameterServerConfig.proto)
 set(PROTO_GEN)
 set(PROTO_GEN_PY)

--- a/proto/ParameterServerConfig.proto
+++ b/proto/ParameterServerConfig.proto
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle;
+/**
+ * Configuration structure for ParameterClient2.
+ */
+message ParameterClientConfig {
+  required int32 trainer_id = 1;
+}
+/**
+ * Configuration structure for ParameterServer2.
+ */
+message ParameterServerConfig {
+  // The ports number for parameter send,
+  // increment based on default port number
+  required int32 ports_num = 1 [default = 1];
+  // The ports number for parameter send,
+  // increment based on default (port + ports_num
+  required int32 ports_num_for_sparse = 2 [default = 0];
+  // network device name for pservers
+  required string nics = 3 [default = "xgbe0,xgbe1"];
+  required string rdma_tcp = 4 [default = "tcp"];
+  // Listening port for pserver
+  required int32 port = 5 [default = 20134];
+  // number of gradient servers
+  required int32 num_gradient_servers = 6 [default = 1];
+  // number of threads for sync op exec
+  required int32 pserver_num_threads = 7 [default = 1];
+  // control config_.async_lagged_grad_discard_ratio() min value
+  required double async_lagged_ratio_min = 8 [default = 1.0];
+  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
+  // use it as defalut value
+  required double async_lagged_ratio_default = 9 [default = 1.5];
+}
\ No newline at end of file
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2628,7 +2628,7 @@ class AverageLayer(LayerBase):
 @config_layer('cos')
 class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=5, device=None):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
        super(CosSimLayer, self).__init__(
            name, 'cos', 1, inputs=inputs, device=device)
        config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1673,7 +1673,7 @@ def trans_layer(input, name=None, layer_attr=None):
 @wrap_name_default()
 @layer_support()
-def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
+def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
    """
    Cosine Similarity Layer. The cosine similarity equation is here.

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
@@ -79,7 +79,7 @@ layers {
  inputs {
    input_layer_name: "b"
  }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
  name: "__cos_sim_1__"
@@ -92,7 +92,7 @@ layers {
  inputs {
    input_layer_name: "c"
  }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
  name: "__sum_to_one_norm_layer_0__"