update code

47e88f4a · dangqingqing · 21d3889d · 5d28da43 · 47e88f4a · 47e88f4a
85 changed file
--- a/.travis.yml
+++ b/.travis.yml
 language: cpp
-cache: ccache
+cache:
+  directories:
+    - $HOME/third_party
+    - $HOME/.ccache
+    - $HOME/.cache/pip
 sudo: required
 dist: trusty
 os:
@@ -35,6 +39,7 @@ addons:
      - clang-format-3.8
      - automake
      - libtool
+      - ccache
 before_install:
  - |
    if [ ${JOB} == "BUILD_AND_TEST" ]; then

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -30,7 +30,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" OFF)
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
 option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
@@ -43,6 +43,16 @@ option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
 option(ON_COVERALLS     "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+# CMAKE_BUILD_TYPE
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+      FORCE)
+endif()
+set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+  "A path setting third party libraries download & build directories.")
 ########################################################################################
 include(external/zlib)      # download, build, install zlib

--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -14,8 +14,8 @@
 INCLUDE(ExternalProject)
-SET(GFLAGS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
+SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
-SET(GFLAGS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gflags)
+SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
 SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
 IF(WIN32)
    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -14,8 +14,8 @@
 INCLUDE(ExternalProject)
-SET(GLOG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/glog)
+SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog)
-SET(GLOG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/glog)
+SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
 SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
 IF(WIN32)

--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -16,8 +16,8 @@ IF(WITH_TESTING)
    ENABLE_TESTING()
    INCLUDE(ExternalProject)
-    SET(GTEST_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gtest)
+    SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
-    SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gtest)
+    SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
    SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
    INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -18,8 +18,8 @@ IF(NOT ${CBLAS_FOUND})
    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
    INCLUDE(ExternalProject)
-    SET(CBLAS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
+    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
-    SET(CBLAS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/openblas)
+    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
    IF(WIN32)

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,8 +14,8 @@
 INCLUDE(ExternalProject)
-SET(PROTOBUF_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
+SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
-SET(PROTOBUF_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/protobuf)
+SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
 SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
 INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})

--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -26,10 +26,14 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
    find_python_module(wheel REQUIRED)
    find_python_module(google.protobuf REQUIRED)
    FIND_PACKAGE(NumPy REQUIRED)
+    IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
+        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
+        "please use pip to upgrade protobuf.")
+    ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
    ##################################### PYTHON ########################################
-    SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python)
+    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
-    SET(PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/python)
+    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
    SET(_python_DIR ${PYTHON_INSTALL_DIR})
    IF(UNIX)

--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -18,8 +18,8 @@ IF(NOT SWIG_FOUND)
    # build swig as an external project
    INCLUDE(ExternalProject)
-    SET(SWIG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/swig)
+    SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
-    SET(SWIG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/swig)
+    SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
    SET(SWIG_TARGET_VERSION "3.0.2")
    SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
    SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")

--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -14,8 +14,8 @@
 INCLUDE(ExternalProject)
-SET(WARPCTC_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/warpctc)
+SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
-SET(WARPCTC_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/warpctc)
+SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -14,8 +14,8 @@
 INCLUDE(ExternalProject)
-SET(ZLIB_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)
+SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib)
-SET(ZLIB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/zlib)
+SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
 SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
 SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -3,12 +3,6 @@ include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING 
-        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-        FORCE)
-endif()
 function(CheckCompilerCXX11Flag)
    if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)

--- a/cmake/python_module.cmake
+++ b/cmake/python_module.cmake
@@ -26,5 +26,18 @@ function(find_python_module module)
    if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
        message(FATAL_ERROR "python module ${module} is not found")
    endif()
+    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
+        OUTPUT_VARIABLE _${module}_version
+        RESULT_VARIABLE _${module}_status
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(NOT _${module}_status)
+        set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
+            "Version of Python module ${module}")
+    endif(NOT _${module}_status)
    set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
+    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
 endfunction(find_python_module)
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -108,6 +108,7 @@ function(link_paddle_exe TARGET_NAME)
    endif()
    if(WITH_GPU)
+        target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
        if(NOT WITH_DSO OR WITH_METRIC)
            target_link_libraries(${TARGET_NAME}
                ${CUDNN_LIBRARY}

--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e $? -n "image_classification_train" >/dev/null 2>&1
 python -m paddle.utils.plotcurve -i $log > plot.png
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -19,3 +19,4 @@ paddle train \
    --save_dir=./output \
    --num_passes=30 \
    2>&1 |tee 'train.log'
+paddle usage -l "train.log" -e $? -n "introduction" >/dev/null 2>&1
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=100 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e $? -n "mnist_train" >/dev/null 2>&1
 python -m paddle.utils.plotcurve -i $log > plot.png
--- a/demo/quick_start/cluster/cluster_train.sh
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log"
 pushd "$home_dir"
 cfg=trainer_config.lr.py
 paddle train \
+  --start_pserver=false \
  --config=$cfg \
  --save_dir=${model_dir} \
  --trainer_count=4 \

--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -26,5 +26,7 @@ paddle train \
    --init_model_path=$model \
    --config_args=is_predict=1 \
    --predict_output_dir=. \
+2>&1 | tee 'predict.log'
+paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
 mv rank-00000 result.txt
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -31,3 +31,4 @@ paddle train \
  --show_parameter_stats_period=100 \
  --test_all_data_in_one_period=1 \
  2>&1 | tee 'train.log'
+paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -22,3 +22,4 @@ paddle train \
    --log_period=100 \
    --dot_period=1 \
    --num_passes=50  2>&1 | tee 'log.txt'
+paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -38,3 +38,4 @@ paddle train \
  --config_args=is_test=1 \
  --test_all_data_in_one_period=1 \
 2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -27,3 +27,4 @@ paddle train \
  --load_missing_parameter_strategy=rand \
  --test_all_data_in_one_period=1 \
  2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
             --trainer_count=4 \
             --config_args=is_test=1 \
             2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -27,3 +27,4 @@ paddle train --config=$config \
             --show_parameter_stats_period=100 \
             --test_all_data_in_one_period=1 \
             2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -27,3 +27,4 @@ paddle train \
    --log_period=10 \
    --dot_period=5 \
    2>&1 | tee 'paraphrase/train.log'
+paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -24,3 +24,4 @@ paddle train \
    --test_pass=12 \
    --trainer_count=1 \
    2>&1 | tee 'translation/gen.log'
+paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -25,3 +25,4 @@ paddle train \
 --log_period=10 \
 --dot_period=5 \
 2>&1 | tee 'translation/train.log'
+paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
@@ -7,4 +7,6 @@ paddle train \
       --dot_period=10 \
       --log_period=1000 \
       --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train.log'
+paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
@@ -7,3 +7,5 @@ paddle train \
       --log_period=10000 \
       --test_period=0 \
       --num_passes=10
+2>&1 | tee 'train_linear.log'
+paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..      code-block:: bash
        paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
---------------------------------------------------
-目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
-:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
-..  code-block:: bash
-    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
-来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
-..  code-block:: bash
-    git submodule init
-    git submodule update
-来获得所有第三方模块。
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -11,32 +11,21 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
-git submodule update --init --recursive
 ```
-If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
-If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
-```
-git submodule update --remote
-```
 ## <span id="requirements">Requirements</span>
 To compile the source code, your computer must be equipped with the following dependencies.
 - **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
- **CMake**: version >= 2.8
+- **CMake**: version >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
- **Python**: only python 2.7 is supported currently
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
 ### Options
-PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. 
+PaddlePaddle supports some build options. 
 <html>
 <table> 
@@ -47,12 +36,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 </tr>
 </thead>
 <tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
+<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
+<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
+<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
+<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
+<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
+<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
+<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
+<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
+<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
+<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
+<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
+<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
 </tbody>
 </table>
 </html>
@@ -64,18 +62,15 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 As a simple example, consider the following:  
-1. **Python Dependencies(optional)**
+1. **BLAS Dependencies(optional)**
-    To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows:
+    Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
    ```bash
-    # install swig on ubuntu
+    # specify MKL
-    sudo apt-get install swig
+    cmake .. -DMKL_ROOT=<mkl_path>
-    # install swig on Mac OS X
+    # or specify OpenBLAS
-    brew install swig
+    cmake .. -DOPENBLAS_ROOT=<openblas_path>
-    # active swig in cmake
-    cmake .. -DWITH_SWIG_PY=ON
    ```
 2. **Doc Dependencies(optional)**
@@ -104,17 +99,9 @@ As a simple example, consider the following:
    ```bash
    # necessary
    sudo apt-get update
-    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
+    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
-    # optional
+    sudo pip install wheel numpy
-    sudo apt-get install libgoogle-glog-dev
+    sudo pip install 'protobuf>=3.0.0'
-    sudo apt-get install libgflags-dev
-    sudo apt-get install libgtest-dev
-    sudo pip install wheel
-    pushd /usr/src/gtest
-    cmake .
-    make
-    sudo cp *.a /usr/lib
-    popd
    ```
 - **GPU Dependencies (optional)**
@@ -149,51 +136,17 @@ As usual, the best option is to create build folder under paddle project directo
 ```bash
 mkdir build && cd build
-cmake ..
+``` 
-```
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
-As a simple example, consider the following:
- **Only CPU with swig**
+Finally, you can build and install PaddlePaddle:
-  ```bash
-  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
-  ```
- **GPU with swig**
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
-  ```
- **GPU with doc and swig**
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
-  ``` 
-Finally, you can build PaddlePaddle:
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
+cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
-```
+# install PaddlePaddle Python modules.
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-```bash
-# you can run
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-# or just run 
-sudo paddle version
 ```
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -40,4 +40,4 @@ PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。
    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
 注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,23 +16,13 @@ Developers can work on PaddlePaddle using Docker.  This allows
 developers to work on different platforms -- Linux, Mac OS X, and
 Windows -- in a consistent way.
-The general development workflow with Docker and Bazel is as follows:
+The general development workflow with Docker and CMake is as follows:
 1. Get the source code of Paddle:
   .. code-block:: bash
-      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+      git clone https://github.com/PaddlePaddle/Paddle.git
-   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
-   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
-   empty, please use the following command to get the submodule.
-   .. code-block:: bash
-      git submodule update --init --recursive
 2. Build a development Docker image :code:`paddle:dev` from the source
@@ -162,7 +152,6 @@ source code:
   cd ~
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
-   git submodule update --init --recursive
   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .

--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -33,7 +33,6 @@ cd Paddle
 git checkout -b develop  # 创建 develop 分支
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
 git pull upstream develop  # 更新 upstream
-git submodule update --init --recursive
 ```
 然后你可以通过做一个本地开发分支开始开发

--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -38,7 +38,6 @@ cd Paddle
 git checkout -b develop  # create develop branch.
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
 git pull upstream develop  # update to upstream
-git submodule update --init --recursive
 ```
 Then you can start to develop by making a local developement branch

--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -159,6 +159,8 @@ docker build -t your_repo/paddle:mypaddle .
 docker push  your_repo/paddle:mypaddle
 ```
+注意上述命令中`your_repo`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`your_repo/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
 ### 上传训练文件
 本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到MFS共享存储。完成后MFS上的文件内容大致如下：
@@ -244,6 +246,8 @@ spec:
 `CONF_PADDLE_GRADIENT_NUM`表示训练节点数量，即`--num_gradient_servers`参数
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
 ```bash

--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -137,6 +137,10 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
  a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
 }
+float Arguments::sumCosts() const {
+  return paddle::Argument::sumCosts(m->outputs);
+}
 int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
  auto& a = m->getArg(idx);
  return a.getBatchSize();

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -450,6 +450,8 @@ public:
                                        IVector* vec) throw(RangeError);
  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
+  float sumCosts() const;
 private:
  static Arguments* createByPaddleArgumentVector(void* ptr);
  void* getInternalArgumentsPtr() const;
@@ -546,6 +548,10 @@ public:
  ParameterConfig* getConfig();
  void setValueUpdated();
+  bool save(const std::string& filename) const;
+  bool load(const std::string& filename) const;
  size_t getSize() const;
 private:

--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -57,4 +57,12 @@ size_t Parameter::getID() const { return m->getPtr()->getID(); }
 void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
+bool Parameter::save(const std::string& filename) const {
+  return m->getPtr()->save(filename);
+}
+bool Parameter::load(const std::string& filename) const {
+  return m->getPtr()->load(filename);
+}
 size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -13,5 +13,5 @@ GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
-CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@"
+CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
 WITH_COVERALLS="@ON_COVERALLS@"
--- a/paddle/api/test/.gitignore
+++ b/paddle/api/test/.gitignore
+*.w0
+*.wbias
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -22,6 +22,8 @@ class TestArguments(unittest.TestCase):
        args = swig_paddle.Arguments.createArguments(1)
        args.setSlotValue(0, m)
+        self.assertAlmostEqual(27.0, args.sumCosts())
        mat = args.getSlotValue(0)
        assert isinstance(mat, swig_paddle.Matrix)
        np_mat = mat.toNumpyMatInplace()

--- a/paddle/api/test/testGradientMachine.py
+++ b/paddle/api/test/testGradientMachine.py
@@ -45,6 +45,7 @@ class TestGradientMachine(unittest.TestCase):
            assert isinstance(val, swig_paddle.Vector)
            arr = numpy.full((len(val), ), 0.1, dtype="float32")
            val.copyFromNumpyArray(arr)
+            self.assertTrue(param.save(param.getName()))
            param_config = param.getConfig().toProto()
            assert isinstance(param_config,
                              paddle.proto.ParameterConfig_pb2.ParameterConfig)
@@ -92,6 +93,9 @@ class TestGradientMachine(unittest.TestCase):
        self.assertTrue(self.isCalled)
+        for param in machine.getParameters():
+            self.assertTrue(param.load(param.getName()))
    def test_train_one_pass(self):
        conf_file_path = './testTrainConfig.py'
        trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
 endif()
 set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cudart_wrap.cc
    src/hl_cuda_cublas.cc
    src/hl_cuda_cudnn.cc
    src/hl_cuda_device.cc)

--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
 */
 void GetCudnnDsoHandle(void** dso_handle);
-/**
- * @brief    load the DSO of CUDA Run Time
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudartDsoHandle(void** dso_handle);
 /**
 * @brief    load the DSO of CURAND
 *

--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -22,10 +22,9 @@ limitations under the License. */
 #include <sys/time.h>
 #include <unistd.h>
 #include <mutex>
-#include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
 // clang-format on
@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef CURAND_RAND_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CURAND_WRAP
-std::once_flag cudart_dso_flag;
-void *cudart_dso_handle = nullptr;
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-#endif
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)        \
-  __macro(cudaMalloc)                     \
-  __macro(cudaHostAlloc)                  \
-  __macro(cudaFree)                       \
-  __macro(cudaFreeHost)                   \
-  __macro(cudaMemcpy)                     \
-  __macro(cudaMemset)                     \
-  __macro(cudaMemcpyAsync)                \
-  __macro(cudaSetDevice)                  \
-  __macro(cudaGetDevice)                  \
-  __macro(cudaGetDeviceCount)             \
-  __macro(cudaGetDeviceProperties)        \
-  __macro(cudaDeviceSynchronize)          \
-  __macro(cudaDeviceCanAccessPeer)        \
-  __macro(cudaDeviceEnablePeerAccess)     \
-  __macro(cudaStreamCreate)               \
-  __macro(cudaStreamDestroy)              \
-  __macro(cudaStreamSynchronize)          \
-  __macro(cudaStreamWaitEvent)            \
-  __macro(cudaEventCreate)                \
-  __macro(cudaEventRecord)                \
-  __macro(cudaEventQuery)                 \
-  __macro(cudaEventDestroy)               \
-  __macro(cudaEventSynchronize)           \
-  __macro(cudaEventElapsedTime)           \
-  __macro(cudaSetDeviceFlags)             \
-  __macro(cudaGetLastError)               \
-  __macro(cudaFuncSetCacheConfig)         \
-  __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)             \
-  __macro(cudaProfilerStart)              \
-  __macro(cudaProfilerStop)
-// clang-format on
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-#undef CUDA_ROUNTINE_EACH
-#undef DYNAMIC_LOAD_CUDART_WRAP
 } /* namespace dynload */
 /**
@@ -171,11 +98,11 @@ int g_cuda_lib_version = 0;
 * Check build-in cuda function using glog and it **does not**
 * support << operator for more details error info.
 */
-#define CHECK_CUDA(cudaFunc)                                                  \
+#define CHECK_CUDA(cudaFunc)                                         \
-  do {                                                                        \
+  do {                                                               \
-    cudaError_t cudaStat = cudaFunc;                                          \
+    cudaError_t cudaStat = cudaFunc;                                 \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                         \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
-                                    << dynload::cudaGetErrorString(cudaStat); \
+                                    << cudaGetErrorString(cudaStat); \
  } while (0)
 /**
@@ -284,13 +211,13 @@ void hl_fini() {
      tmp_stream = (char *)t_device[dev]->stream;
    }
    for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
+      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
    }
    /* free device memory */
    hl_free_mem_device(t_device[dev]->gpu_mem);
    hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event));
+    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
  }
  free(tmp);
@@ -308,7 +235,7 @@ void hl_set_device(int device) {
  CHECK(device >= 0 && device < g_system_device_num && g_device[device])
      << "Device: " << device << " is not specified in startup.";
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
  /* switch thread stream */
  for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
@@ -336,7 +263,7 @@ void hl_set_device(int device) {
 int hl_get_device() {
  int device;
-  CHECK_CUDA(dynload::cudaGetDevice(&device));
+  CHECK_CUDA(cudaGetDevice(&device));
  return device;
 }
@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
  void *dest_d;
  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
+  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
  return dest_d;
 }
@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
 void hl_free_mem_device(void *dest_d) {
  CHECK_NOTNULL(dest_d);
-  cudaError_t err = dynload::cudaFree(dest_d);
+  cudaError_t err = cudaFree(dest_d);
  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
      << hl_get_device_error_string();
 }
@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
  void *dest_h;
  CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(
+  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
-      dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
  return dest_h;
 }
@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
 void hl_free_mem_host(void *dest_h) {
  CHECK_NOTNULL(dest_h);
-  cudaError_t err = dynload::cudaFreeHost(dest_h);
+  cudaError_t err = cudaFreeHost(dest_h);
  CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
      << hl_get_device_error_string();
 }
@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
  }
  CHECK_NOTNULL(dst);
  CHECK_NOTNULL(src);
-  CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 }
 void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(dynload::cudaMemset(dest_d, value, size));
+  CHECK_CUDA(cudaMemset(dest_d, value, size));
 }
 void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
  }
  CHECK_NOTNULL(src_h);
  CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
 }
 void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
  }
  CHECK_NOTNULL(dest_h);
  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
 }
 void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
  }
  CHECK_NOTNULL(dest_d);
  CHECK_NOTNULL(src_d);
-  CHECK_CUDA(
+  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
-      dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
 }
 void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
  CHECK_LT(stream, HPPL_STREAM_END);
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(
+  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
-      dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
 }
 void hl_start() {
@@ -439,8 +363,7 @@ void hl_start() {
 bool hl_device_can_access_peer(int device, int peerDevice) {
  int canAccessPeer;
-  CHECK_CUDA(
+  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
-      dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
  if (canAccessPeer == 1) {
    return true;
@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
 }
 void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0);
+  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
  if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    dynload::cudaGetLastError();
+    cudaGetLastError();
  } else {
    CHECK_CUDA(err);
  }
@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
  int device = device_prop->device;
  global_device_resources device_res = device_prop->device_resources;
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
  /* device properties */
-  CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device));
+  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
  device_prop->major = cu_prop.major;
  device_prop->minor = cu_prop.minor;
@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
  /* create device stream */
  for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
  }
  /* cublas init */
@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
  device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
  pthread_mutex_init(device_res->gen_mutex, NULL);
-  CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
+  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
 }
 int hl_get_cuda_version() { return g_cuda_lib_version; }
 void hl_create_thread_resources(int device,
                                thread_device_resources device_res) {
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
  /* create thread stream */
  for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
  }
  /* allocation device memory */
@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
  /* allocation host memory */
  device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
-  CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
+  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
 }
 void hl_specify_devices_start(int *device, int number) {
  if (hl_start_flag) return;
  /* 1. get the number of devices */
-  CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num));
+  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
  CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
  if (device == NULL) {
    number = g_system_device_num;
@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
                                    << ": the parameter stream is error.";
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
+  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
 }
 void hl_create_event(hl_event_t *event) {
@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
  struct _hl_event_st *st_event =
      (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
-  CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
+  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
  *event = st_event;
 }
@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
  CHECK_NOTNULL(start);
  CHECK_NOTNULL(end);
-  CHECK_CUDA(
+  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
-      dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
  return time;
 }
@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
                                    << ": the parameter stream is error.";
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
+  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
 }
 void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
                                    << ": the parameter stream is error.";
  cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
 }
 void hl_destroy_event(hl_event_t event) {
  CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event));
+  CHECK_CUDA(cudaEventDestroy(event->cu_event));
  free(event);
  event = NULL;
@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
 void hl_event_synchronize(hl_event_t event) {
  CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event));
+  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
 }
 void hl_get_device_name(char *name, int len, int device) {
@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
  *minor = g_device[device]->minor;
 }
-int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
 const char *hl_get_device_error_string() {
-  cudaError_t err = dynload::cudaGetLastError();
+  cudaError_t err = cudaGetLastError();
-  return dynload::cudaGetErrorString(err);
+  return cudaGetErrorString(err);
 }
 const char *hl_get_device_error_string(size_t err) {
-  return dynload::cudaGetErrorString((cudaError_t)err);
+  return cudaGetErrorString((cudaError_t)err);
 }
-void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
 void hl_set_device_flags_block() {
-  CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 }
 bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
+  cudaError_t err = cudaEventQuery(event->cu_event);
  CHECK(cudaSuccess == err || cudaErrorNotReady == err);
  if (cudaErrorNotReady == err) {
@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
  return true;
 }
-void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
-void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#ifdef PADDLE_USE_DSO
-#include <cuda_runtime.h>
-#include <mutex>
-#include "hl_dso_loader.h"
-/**
- * cudart wrapper: for dynamic load libcudart.so.
- * When nvcc compile cuda kernels, it will insert
- * some build-in runtime routines, which must be
- * provided by us if PADDLE_USE_DSO is true. If
- * PADDLE_USE_DSO is false, all of them must be
- * ignored to avoid multiple definitions.
- */
-namespace dynload {
-extern std::once_flag cudart_dso_flag;
-extern void *cudart_dso_handle;
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    __type operator()(Args... args) {                                          \
-      typedef __type (*cudartFunc)(Args...);                                   \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)          \
-  __macro(cudaLaunch, cudaError_t)          \
-  __macro(cudaSetupArgument, cudaError_t)   \
-  __macro(cudaConfigureCall, cudaError_t)   \
-  __macro(__cudaRegisterFatBinary, void**)  \
-  __macro(__cudaUnregisterFatBinary, void)  \
-  __macro(__cudaRegisterFunction, void)     \
-  __macro(__cudaRegisterVar, void)          \
-  __macro(__cudaRegisterManagedVar, void)   \
-  __macro(__cudaInitModule, char)           \
-  __macro(__cudaRegisterTexture, void)      \
-  __macro(__cudaRegisterSurface, void)
-// clang-format on
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-#if CUDART_VERSION >= 7000
-DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
-#endif
-#undef CUDA_ROUNTINE_EACH
-} /* namespace dynload */
-#if CUDART_VERSION >= 7000
-__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
-                                                dim3 gridDim,
-                                                dim3 blockDim,
-                                                void **args,
-                                                size_t sharedMem,
-                                                cudaStream_t stream) {
-  return dynload::cudaLaunchKernel(
-      func, gridDim, blockDim, args, sharedMem, stream);
-}
-#endif /* CUDART_VERSION >= 7000 */
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  return dynload::cudaLaunch(func);
-}
-__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                 size_t size,
-                                                 size_t offset) {
-  return dynload::cudaSetupArgument(arg, size, offset);
-}
-__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
-                                                 dim3 blockDim,
-                                                 size_t sharedMem,
-                                                 cudaStream_t stream) {
-  return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
-}
-extern "C" {
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
-  return dynload::__cudaRegisterFatBinary(fatCubin);
-}
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
-  return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
-}
-void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
-                                      const char *hostFun,
-                                      char *deviceFun,
-                                      const char *deviceName,
-                                      int thread_limit,
-                                      uint3 *tid,
-                                      uint3 *bid,
-                                      dim3 *bDim,
-                                      dim3 *gDim,
-                                      int *wSize) {
-  return dynload::__cudaRegisterFunction(fatCubinHandle,
-                                         hostFun,
-                                         deviceFun,
-                                         deviceName,
-                                         thread_limit,
-                                         tid,
-                                         bid,
-                                         bDim,
-                                         gDim,
-                                         wSize);
-}
-void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
-                                 char *hostVar,
-                                 char *deviceAddress,
-                                 const char *deviceName,
-                                 int ext,
-                                 int size,
-                                 int constant,
-                                 int global) {
-  return dynload::__cudaRegisterVar(fatCubinHandle,
-                                    hostVar,
-                                    deviceAddress,
-                                    deviceName,
-                                    ext,
-                                    size,
-                                    constant,
-                                    global);
-}
-extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
-                                               void **hostVarPtrAddress,
-                                               char *deviceAddress,
-                                               const char *deviceName,
-                                               int ext,
-                                               int size,
-                                               int constant,
-                                               int global) {
-  return dynload::__cudaRegisterManagedVar(fatCubinHandle,
-                                           hostVarPtrAddress,
-                                           deviceAddress,
-                                           deviceName,
-                                           ext,
-                                           size,
-                                           constant,
-                                           global);
-}
-char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
-  return dynload::__cudaInitModule(fatCubinHandle);
-}
-void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
-                                     const struct textureReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int norm,
-                                     int ext) {
-  return dynload::__cudaRegisterTexture(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
-}
-void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
-                                     const struct surfaceReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int ext) {
-  return dynload::__cudaRegisterSurface(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
-}
-} /* extern "C" */
-#endif
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
 DEFINE_string(cuda_dir,
              "",
              "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
-              "libcudart can not be specified by cuda_dir, since some "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
-              "build-in function in cudart already ran before main entry). "
-              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
 #endif
 }
-void GetCudartDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
-#endif
-}
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);

--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <glog/logging.h>
+#include "BufferArg.h"
+#include "paddle/math/SparseMatrix.h"
+namespace paddle {
+const SequenceArg& BufferArg::sequence() const {
+  // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  return dynamic_cast<const SequenceArg&>(*this);
+}
+const SparseMatrixArg& BufferArg::sparse() const {
+  // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  return dynamic_cast<const SparseMatrixArg&>(*this);
+}
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+}  // namespace paddle
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+enum BufferType {
+  TENSOR_NORMAL = 0,
+  TENSOR_SEQUENCE_ID = 1,
+  TENSOR_SEQUENCE_DATA = 2,
+  TENSOR_SPARSE = 3
+};
+enum SparseDataType {
+  SPARSE_NO_VALUE = 0,  // do not need value pointer, all values are 1
+  SPARSE_FLOAT_VALUE = 1
+};
+enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,
+  ADD_TO = 2,
+};
+class BufferArg {
+public:
+  void setArgType(ArgType argType) { argType_ = argType; }
+  ArgType getArgType() const { return argType_; }
+public:
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {}
+  BufferArg(void* buf, ValueType valueType)
+      : buf_(buf), valueType_(valueType) {}
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(2),
+        argType_(argType) {
+    shape_.setDim(0, matrix.getHeight());
+    shape_.setDim(1, matrix.getWidth());
+  }
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(shape),
+        argType_(argType) {
+    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+  }
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(1),
+        argType_(argType) {
+    shape_.setDim(0, vector.getSize());
+  }
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(VALUE_TYPE_INT32),
+        shape_(1),
+        argType_(argType) {
+    shape_.setDim(0, vector.getSize());
+  }
+  template <DeviceType DType>
+  typename Tensor<real, DType>::Matrix matrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)2, shape_.ndims());
+    return typename Tensor<real, DType>::Matrix(
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+  }
+  template <typename VType, DeviceType DType>
+  typename Tensor<VType, DType>::Vector vector() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<VType>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)1, shape_.ndims());
+    return typename Tensor<VType, DType>::Vector(
+        shape_[0], reinterpret_cast<VType*>(buf_));
+  }
+  virtual ~BufferArg() {}
+  template <typename T>
+  T* data() const {
+    return reinterpret_cast<T*>(buf_);
+  }
+  void* data() const { return buf_; }
+  ValueType valueType() const { return valueType_; }
+  BufferType bufferType() const { return bufferType_; }
+  const TensorShape& shape() const { return shape_; }
+  const SequenceArg& sequence() const;
+  const SparseMatrixArg& sparse() const;
+protected:
+  void* buf_;
+  ValueType valueType_;
+  TensorShape shape_;
+  BufferType bufferType_;
+  ArgType argType_ = UNSPECIFIED;
+  // leading dimensions. The size is dims_.size()
+  // Dims lds_;
+};
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1
+// valueType_ = int32
+// if a < b then value_.buf_[a] < value_.buf_[b]
+class SequenceIdArg : public BufferArg {
+public:
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    CHECK_EQ(shape_.ndims(), (size_t)1);
+    numSeqs_ = shape_[0] - 1;
+  }
+  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    numSeqs_ = shape_[0] - 1;
+  }
+  ~SequenceIdArg() {}
+  size_t numSeqs() const { return numSeqs_; }
+private:
+  size_t numSeqs_;
+};
+// sequence data
+class SequenceArg : public BufferArg {
+public:
+  SequenceArg(void* buf,
+              ValueType valueType,
+              const TensorShape& shape,
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) {}
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {}
+  ~SequenceArg() {}
+  void* getIdBuf() const { return startPositions_.data(); }
+  size_t numSeqs() const { return startPositions_.numSeqs(); }
+private:
+  SequenceIdArg startPositions_;
+};
+// sparse matrix
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+public:
+  SparseMatrixArg(void* buf,
+                  ValueType valueType,
+                  const TensorShape& shape,
+                  const BufferArg& row,
+                  const BufferArg& col,
+                  size_t nnz,
+                  SparseDataFormat format,
+                  SparseDataType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        row_(row),
+        col_(col),
+        nnz_(nnz),
+        format_(format),
+        type_(type) {
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), (size_t)2);
+    CHECK_EQ(row_.shape().ndims(), (size_t)1);
+    CHECK_EQ(col_.shape().ndims(), (size_t)1);
+    if (format == SPARSE_CSR_FORMAT) {
+      CHECK_EQ(nnz, col.shape()[0]);
+    } else if (format == SPARSE_CSC_FORMAT) {
+      CHECK_EQ(nnz, row.shape()[0]);
+    }
+  }
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+  ~SparseMatrixArg() {}
+  void* getRowBuf() const { return row_.data(); }
+  void* getColBuf() const { return col_.data(); }
+  size_t nnz() const { return nnz_; }
+  SparseDataFormat dataFormat() const { return format_; }
+  SparseDataType dataType() const { return type_; }
+private:
+  BufferArg row_;
+  BufferArg col_;
+  size_t nnz_;
+  SparseDataFormat format_;
+  SparseDataType type_;
+};
+}  // namespace paddle
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "BufferArg.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/MemoryHandle.h"
+#include "paddle/math/SparseMatrix.h"
+namespace paddle {
+TEST(BufferTest, BufferArg) {
+  TensorShape shape({8, 10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_FLOAT));
+  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+}
+TEST(BufferTest, SequenceIdArg) {
+  TensorShape shape({10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_INT32));
+  SequenceIdArg buffer(memory.getBuf(), shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+  EXPECT_EQ(buffer.numSeqs(), 9);
+}
+TEST(BufferTest, asArgument) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  VectorPtr vector = Vector::create(100, false);
+  CpuSparseMatrix sparse(200, 300, 50);
+  // prepare arguments
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  argments.addArg(*vector);
+  argments.addArg(sparse);
+  // function
+  auto function = [=](const BufferArgs& inputs) {
+    EXPECT_EQ(inputs.size(), 3);
+    // check inputs[0]
+    EXPECT_EQ(inputs[0].shape().ndims(), 2);
+    EXPECT_EQ(inputs[0].shape()[0], 100);
+    EXPECT_EQ(inputs[0].shape()[1], 200);
+    EXPECT_EQ(inputs[0].data(), matrix->getData());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
+              matrix->getHeight());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
+              matrix->getWidth());
+    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+    // check inputs[1]
+    EXPECT_EQ(inputs[1].shape().ndims(), 1);
+    EXPECT_EQ(inputs[1].shape()[0], 100);
+    EXPECT_EQ(inputs[1].data(), vector->getData());
+    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+    // check inputs[2]
+    EXPECT_EQ(inputs[2].shape().ndims(), 2);
+    EXPECT_EQ(inputs[2].shape()[0], 200);
+    EXPECT_EQ(inputs[2].shape()[1], 300);
+    EXPECT_EQ(inputs[2].data(), sparse.getData());
+    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
+    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
+  };
+  // call function
+  function(argments);
+}
+}  // namespace paddle
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -3,6 +3,7 @@ file(GLOB cpp_files . *Op.cpp)
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
+list(APPEND cpp_files BufferArg.cpp)
 if(WITH_GPU)
    file(GLOB cu_files . *OpGpu.cu)
@@ -18,9 +19,13 @@ if(WITH_TESTING)
    # TODO:
    # file(GLOB test_files . *OpTest.cpp)
    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(CrossMapNormalOpTest)
+    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(TensorShapeTest)
+    add_simple_unittest(TensorTypeTest)
+    add_simple_unittest(BufferArgTest)
+    add_simple_unittest(FunctionTest)
    add_simple_unittest(PadOpTest)
-    add_simple_unittest(ContextProjectionOpTest)
+    # add_simple_unittest(ContextProjectionOpTest)
 endif()
 endif()

--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -19,17 +19,15 @@ limitations under the License. */
 namespace paddle {
 template <>
-void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
-                                               const CpuMatrix* input_mat,
+                                               const CpuMatrix& input_mat,
-                                               const CpuMatrix* weight_mat,
+                                               const CpuMatrix& weight_mat,
                                               const CpuIVector& seq_vec,
                                               size_t context_length,
                                               int context_start,
                                               size_t begin_pad) {
  const int* starts = seq_vec.getData();
  const size_t num_sequences = seq_vec.getSize() - 1;
-  auto w_mat = const_cast<CpuMatrix*>(weight_mat);
-  auto in_mat = const_cast<CpuMatrix*>(input_mat);
  for (size_t i = 0; i < num_sequences; ++i) {
    for (size_t j = 0; j < context_length; ++j) {
      int begin = starts[i] + context_start + j;
@@ -39,10 +37,11 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
      if (begin < starts[i]) {
        int64_t pad_size =
            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
+        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
-        if (w_mat) {
+        if (weight_mat) {
-          MatrixPtr sub = w_mat->subMatrix(j, pad_size);
+          MatrixPtr sub =
-          mat->addAtOffset(*sub, j * in_mat->getWidth());
+              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
        }
        dst_begin = starts[i] + pad_size;
        begin = starts[i];
@@ -50,19 +49,22 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
      if (end > starts[i + 1]) {
        int64_t pad_size =
            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
-        if (w_mat) {
+        if (weight_mat) {
-          MatrixPtr sub = w_mat->subMatrix(
+          MatrixPtr sub =
-              begin_pad + context_start + j - pad_size, pad_size);
+              const_cast<CpuMatrix&>(weight_mat)
-          mat->addAtOffset(*sub, j * in_mat->getWidth());
+                  .subMatrix(begin_pad + context_start + j - pad_size,
+                             pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
        }
        dst_end = starts[i + 1] - pad_size;
        end = starts[i + 1];
      }
      if (end <= begin) continue;
-      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
+      MatrixPtr src =
-      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
+          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
-      dst->addAtOffset(*src, j * in_mat->getWidth());
+      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * input_mat.getWidth());
    }
  }
 }
@@ -82,40 +84,32 @@ public:
    begin_pad_ = config.get<size_t>("begin_pad");
  }
-  void calc(const Arguments& inputs,
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-            const Arguments& outputs,
+    CHECK_EQ((size_t)3, inputs.size());
-            const Arguments& inouts) override {
+    CHECK_EQ((size_t)1, outputs.size());
-    CHECK_EQ(3, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
+    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto out_mat = outputs[0].matrix<Device>();
-    const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
+    auto in_mat = inputs[0].matrix<Device>();
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    auto w_mat = !inputs[1].data()
-    const auto w_mat =
+                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-        !inputs[1].getData()
+                     : inputs[1].matrix<Device>();
-            ? nullptr
+    auto seq_vec = inputs[2].vector<int, Device>();
-            : std::make_shared<typename MatrixT<Device>::type>(
+    ContextProjectionForward<Device>(out_mat,
-                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+                                     in_mat,
-    typename SequenceT<Device>::type seq_vec(
+                                     w_mat,
-        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
-    ContextProjectionForward<Device>(out_mat.get(),
-                                     in_mat.get(),
-                                     w_mat.get(),
                                     seq_vec,
                                     context_length_,
                                     context_start_,
@@ -129,18 +123,17 @@ private:
 };
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
-                                                CpuMatrix* in_grad_mat,
+                                                CpuMatrix& in_grad_mat,
-                                                CpuMatrix* w_grad_mat,
+                                                CpuMatrix& w_grad_mat,
                                                const CpuIVector& seq_vec,
                                                size_t context_length,
                                                int context_start,
                                                size_t begin_pad,
                                                bool is_padding,
                                                size_t total_pad) {
-  CHECK(out_grad_mat);
+  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
-  size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
+                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
-                                 : w_grad_mat ? w_grad_mat->getWidth() : 0;
  const int* starts = seq_vec.getData();
  size_t num_sequences = seq_vec.getSize() - 1;
  for (size_t i = 0; i < num_sequences; ++i) {
@@ -153,8 +146,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
        int64_t pad_size =
            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
        if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
+          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
-          MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
          sub->addAtOffset(*mat, j * input_dim);
        }
        dst_begin = starts[i] + pad_size;
@@ -165,8 +158,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
        if (is_padding && w_grad_mat) {
          MatrixPtr mat =
-              out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
-          MatrixPtr sub = w_grad_mat->subMatrix(
+          MatrixPtr sub = w_grad_mat.subMatrix(
              begin_pad + context_start + j - pad_size, pad_size);
          sub->addAtOffset(*mat, j * input_dim);
        }
@@ -175,8 +168,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
      }
      if (end <= begin) continue;
      if (!in_grad_mat) continue;
-      MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
+      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
      src->addAtOffset(*dst, j * input_dim);
    }
  }
@@ -199,44 +192,36 @@ public:
    total_pad_ = config.get<size_t>("total_pad");
  }
-  void calc(const Arguments& inputs,
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-            const Arguments& outputs,
+    CHECK_EQ((size_t)3, inputs.size());
-            const Arguments& inouts) override {
+    CHECK_EQ((size_t)1, outputs.size());
-    CHECK_EQ(3, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-    CHECK(outputs[0].getData() && inputs[2].getData());
+    CHECK(outputs[0].data() && inputs[2].data());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
+    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    auto in_grad_mat =
-        !inputs[0].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    auto w_grad_mat =
-        !inputs[1].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
-    ContextProjectionBackward<Device>(out_grad_mat.get(),
+    auto out_grad_mat = outputs[0].matrix<Device>();
-                                      in_grad_mat ? in_grad_mat.get() : nullptr,
+    auto in_grad_mat =
-                                      w_grad_mat ? w_grad_mat.get() : nullptr,
+        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[0].matrix<Device>();
+    auto w_grad_mat = !inputs[1].data()
+                          ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                          : inputs[1].matrix<Device>();
+    auto seq_vec = inputs[2].vector<int, Device>();
+    ContextProjectionBackward<Device>(out_grad_mat,
+                                      in_grad_mat,
+                                      w_grad_mat,
                                      seq_vec,
                                      context_length_,
                                      context_start_,
@@ -253,6 +238,7 @@ private:
  size_t total_pad_;
 };
+#if 0
 /**
 * \param inputs[0] input grad.
 * \param inputs[1] input sequence.
@@ -349,6 +335,7 @@ private:
  size_t begin_pad_;
  size_t total_pad_;
 };
+#endif
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                    CPU,
@@ -363,6 +350,7 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                    GPU,
                    ContextProjectionBackwardFunc);
+#if 0
 REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
                    GPU,
                    ContextProjectionBackwardDataFunc);
@@ -370,4 +358,5 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
                    GPU,
                    ContextProjectionBackwardWeightFunc);
 #endif
+#endif
 }  // namespace paddle
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -31,14 +31,15 @@ namespace paddle {
 * \param[in]   is_padding        whether padding 0 or not.
 *
 */
-template <DeviceType Device>
+template <DeviceType DType>
-void ContextProjectionForward(typename MatrixT<Device>::type* output,
+void ContextProjectionForward(
-                              const typename MatrixT<Device>::type* input,
+    typename Tensor<real, DType>::Matrix& output,
-                              const typename MatrixT<Device>::type* weight,
+    const typename Tensor<real, DType>::Matrix& input,
-                              const typename SequenceT<Device>::type& sequence,
+    const typename Tensor<real, DType>::Matrix& weight,
-                              size_t context_length,
+    const typename Tensor<int, DType>::Vector& sequence,
-                              int context_start,
+    size_t context_length,
-                              size_t begin_pad);
+    int context_start,
+    size_t begin_pad);
 /**
 * \brief   Context Projection Backward.
@@ -53,30 +54,31 @@ void ContextProjectionForward(typename MatrixT<Device>::type* output,
 * \param[in]   is_padding        whether padding 0 or not.
 *
 */
-template <DeviceType Device>
+template <DeviceType DType>
-void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
+void ContextProjectionBackward(
-                               typename MatrixT<Device>::type* in_grad,
+    typename Tensor<real, DType>::Matrix& out_grad,
-                               typename MatrixT<Device>::type* w_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
-                               const typename SequenceT<Device>::type& seq_vec,
+    typename Tensor<real, DType>::Matrix& w_grad,
-                               size_t context_length,
+    const typename Tensor<int, DType>::Vector& seq_vec,
-                               int context_start,
+    size_t context_length,
-                               size_t begin_pad,
+    int context_start,
-                               bool is_padding,
+    size_t begin_pad,
-                               size_t total_pad);
+    bool is_padding,
+    size_t total_pad);
-template <DeviceType Device>
+template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename MatrixT<Device>::type* out_grad,
+    typename Tensor<real, DType>::Matrix& out_grad,
-    typename MatrixT<Device>::type* in_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
-    const typename SequenceT<Device>::type& sequence,
+    const typename Tensor<int, DType>::Vector& sequence,
    size_t context_length,
    int context_start);
-template <DeviceType Device>
+template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename MatrixT<Device>::type* out_grad,
+    typename Tensor<real, DType>::Matrix& out_grad,
-    typename MatrixT<Device>::type* w_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
-    const typename SequenceT<Device>::type& seq_vec,
+    const typename Tensor<int, DType>::Vector& seq_vec,
    size_t context_length,
    int context_start,
    size_t total_pad,

--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -120,20 +120,19 @@ void hl_context_projection_forward(const real* input,
 }
 template <>
-void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
-                                               const GpuMatrix* input,
+                                               const GpuMatrix& input,
-                                               const GpuMatrix* weight,
+                                               const GpuMatrix& weight,
                                               const GpuIVector& sequence,
                                               size_t context_length,
                                               int context_start,
                                               size_t begin_pad) {
-  CHECK(input && output);
+  hl_context_projection_forward(input.getData(),
-  hl_context_projection_forward(input->getData(),
                                sequence.getData(),
-                                weight ? weight->getData() : nullptr,
+                                weight ? weight.getData() : nullptr,
-                                output->getData(),
+                                output.getData(),
                                sequence.getSize() - 1,
-                                input->getWidth(),
+                                input.getWidth(),
                                context_length,
                                context_start,
                                begin_pad);
@@ -217,17 +216,16 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
-                                                    GpuMatrix* in_grad,
+                                                    GpuMatrix& in_grad,
                                                    const GpuIVector& sequence,
                                                    size_t context_length,
                                                    int context_start) {
-  CHECK(in_grad && out_grad);
+  hl_context_projection_backward_data(out_grad.getData(),
-  hl_context_projection_backward_data(out_grad->getData(),
                                      sequence.getData(),
-                                      in_grad->getData(),
+                                      in_grad.getData(),
                                      sequence.getSize() - 1,
-                                      in_grad->getWidth(),
+                                      in_grad.getWidth(),
                                      context_length,
                                      context_start);
 }
@@ -348,19 +346,18 @@ void hl_context_projection_backward_weight(real* out_grad,
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix* out_grad,
+        GpuMatrix& out_grad,
-        GpuMatrix* w_grad,
+        GpuMatrix& w_grad,
        const GpuIVector& seq_vec,
        size_t context_length,
        int context_start,
        size_t total_pad,
        size_t begin_pad) {
-  CHECK(out_grad && w_grad);
+  hl_context_projection_backward_weight(out_grad.getData(),
-  hl_context_projection_backward_weight(out_grad->getData(),
                                        seq_vec.getData(),
-                                        w_grad->getData(),
+                                        w_grad.getData(),
                                        seq_vec.getSize() - 1,
-                                        w_grad->getWidth(),
+                                        w_grad.getWidth(),
                                        total_pad,
                                        context_length,
                                        context_start,
@@ -368,16 +365,15 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
-                                                GpuMatrix* in_grad,
+                                                GpuMatrix& in_grad,
-                                                GpuMatrix* w_grad,
+                                                GpuMatrix& w_grad,
                                                const GpuIVector& sequence,
                                                size_t context_length,
                                                int context_start,
                                                size_t begin_pad,
                                                bool is_padding,
                                                size_t total_pad) {
-    CHECK(out_grad);
    if (in_grad) {
        ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
                out_grad,

--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,6 +112,8 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
 }
 /**
+ * \brief {o_0, o_1} = calc(i_0)
+ *
 * \param inputs[0] input value.
 * \param outputs[0] output value.
 * \param outputs[1] denoms.
@@ -125,27 +127,24 @@ public:
    pow_ = config.get<real>("pow");
  }
-  void calc(const Arguments& inputs,
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-            const Arguments& outputs,
+    CHECK_EQ((size_t)1, inputs.size());
-            const Arguments& inouts) override {
+    CHECK_EQ((size_t)2, outputs.size());
-    CHECK_EQ(1, static_cast<int>(inputs.size()));
-    CHECK_EQ(2, static_cast<int>(outputs.size()));
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
-    }
-    size_t samples = inputs[0].dims_[0];
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
-    size_t channels = inputs[0].dims_[1];
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
-    size_t height = inputs[0].dims_[2];
+    size_t samples = inputs[0].shape()[0];
-    size_t width = inputs[0].dims_[3];
+    size_t channels = inputs[0].shape()[1];
+    size_t height = inputs[0].shape()[2];
+    size_t width = inputs[0].shape()[3];
-    CrossMapNormal<Device>(outputs[0].getData(),
+    CrossMapNormal<Device>(outputs[0].data<real>(),
-                           outputs[1].getData(),
+                           outputs[1].data<real>(),
-                           inputs[0].getData(),
+                           inputs[0].data<real>(),
                           samples,
                           channels,
                           height,
@@ -162,6 +161,8 @@ private:
 };
 /**
+ * \brief {o_0} = calc(i_0, i_1, i_2, i_3)
+ *
 * \param inputs[0] input value.
 * \param inputs[1] output value.
 * \param inputs[2] output grad.
@@ -177,31 +178,29 @@ public:
    pow_ = config.get<real>("pow");
  }
-  void calc(const Arguments& inputs,
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-            const Arguments& outputs,
+    CHECK_EQ((size_t)4, inputs.size());
-            const Arguments& inouts) override {
+    CHECK_EQ((size_t)1, outputs.size());
-    CHECK_EQ(4, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
+    CHECK(inputs[0].shape() == inputs[3].shape());
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
+    CHECK(inputs[0].shape() == outputs[0].shape());
-      CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
+    // TODO(hedaoyuan): need support ASSIGN_TO mode.
-      CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-    }
+    size_t samples = inputs[0].shape()[0];
+    size_t channels = inputs[0].shape()[1];
-    size_t samples = inputs[0].dims_[0];
+    size_t height = inputs[0].shape()[2];
-    size_t channels = inputs[0].dims_[1];
+    size_t width = inputs[0].shape()[3];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
+    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
+                               inputs[0].data<real>(),
-    CrossMapNormalGrad<Device>(outputs[0].getData(),
+                               inputs[1].data<real>(),
-                               inputs[0].getData(),
+                               inputs[2].data<real>(),
-                               inputs[1].getData(),
+                               inputs[3].data<real>(),
-                               inputs[2].getData(),
-                               inputs[3].getData(),
                               samples,
                               channels,
                               height,

--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -76,6 +76,20 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
  return *this;
 }
+void BufferArgs::addArg(const Matrix& arg,
+                        const TensorShape& shape,
+                        ArgType argType) {
+  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
+}
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+}
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+}
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 }  // namespace paddle
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -16,57 +16,17 @@ limitations under the License. */
 #include <map>
 #include <vector>
+#include "BufferArg.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ClassRegistrar.h"
 namespace paddle {
-enum DeviceType {
+/**
-  DEVICE_TYPE_UNSPECIFIED = 0,
+ * Function Configuration.
-  DEVICE_TYPE_CPU = 1,
+ * The argument type of Function::init.
-  DEVICE_TYPE_GPU = 2,
+ * Follow-up will consider moving this data structure to Proto inside.
-};
+ */
-template <DeviceType Device>
-struct MatrixT;
-template <>
-struct MatrixT<DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-template <>
-struct MatrixT<DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-template <DeviceType Device>
-struct SequenceT;
-template <>
-struct SequenceT<DEVICE_TYPE_CPU> {
-  using type = CpuIVector;
-};
-template <>
-struct SequenceT<DEVICE_TYPE_GPU> {
-  using type = GpuIVector;
-};
-typedef std::vector<size_t> Dims;
-class Tensor {
-public:
-  Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {}
-  real* getData() const { return buf_; }
-  real* buf_;
-  Dims dims_;
-};
-typedef std::vector<Tensor> Arguments;
 class FuncConfig {
 public:
  union value {
@@ -86,15 +46,70 @@ protected:
  std::map<std::string, value> valueMap_;
 };
+/**
+ * Argument type for Function::calc().
+ * A BufferArgs contains a set of BufferArg,
+ * because Function can have multiple inputs and outputs.
+ */
+class BufferArgs {
+public:
+  BufferArgs() {}
+  size_t size() const { return args_.size(); }
+  // add argument into BufferArgs
+  // Tensor can be Matrix, Vector, IVector.
+  // For inputs, do not need argType.
+  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
+  template <typename Tensor>
+  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
+    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+  }
+  // Add arg into BufferArgs and reshape the arg.
+  //
+  // For example, arg represents an image buffer,
+  // but Matrix can only represent a two-dimensional Tensor.
+  // So need an extra argument to describe the shape of the image buffer.
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  // get argument
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+private:
+  std::vector<BufferArgPtr> args_;
+};
+/**
+ * \brief Base class for Function.
+ * The basic Function implementation requires override init and calc interfaces.
+ *
+ * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
+ * and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
+ * result of Function assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode, and the calculation
+ * result of Function need added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO: output += Function(inputs)
+ * If Function has more than one output, each output can have different modes.
+ */
 class FunctionBase {
 public:
  virtual ~FunctionBase() {}
  virtual void init(const FuncConfig& config) {}
-  virtual void calc(const Arguments& inputs,
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
-                    const Arguments& outputs,
-                    const Arguments& inouts) {}
  static ClassRegistrar<FunctionBase> funcRegistrar_;
 };

--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Function.h"
+#include <gtest/gtest.h>
+namespace paddle {
+template <DeviceType DType>
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100);
+  EXPECT_EQ(output.getWidth(), 200);
+}
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10);
+  EXPECT_EQ(output.getWidth(), 20);
+}
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  const auto input = arguments[0].matrix<DType>();
+  auto output = arguments[1].matrix<DType>();
+  FunctionApi<DType>(output, input);
+}
+TEST(Function, BufferArgs) {
+  CpuMatrix cpuInput = CpuMatrix(100, 200);
+  CpuMatrix cpuOutput = CpuMatrix(100, 200);
+  BufferArgs cpuArgments;
+  cpuArgments.addArg(cpuInput);
+  cpuArgments.addArg(cpuOutput);
+  Function<DEVICE_TYPE_CPU>(cpuArgments);
+  GpuMatrix gpuInput = GpuMatrix(10, 20);
+  GpuMatrix gpuOutput = GpuMatrix(10, 20);
+  BufferArgs gpuArgments;
+  gpuArgments.addArg(gpuInput);
+  gpuArgments.addArg(gpuOutput);
+  Function<DEVICE_TYPE_GPU>(gpuArgments);
+}
+}  // namespace paddle
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+namespace paddle {
+/**
+ * TensorShape used to represent shape of normal tensor.
+ */
+class TensorShape {
+public:
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); };
+  TensorShape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    initDims(ndims_);
+    dims_.assign(dims);
+    numElements();
+  };
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_) {
+    initDims(ndims_);
+    dims_.assign(t.dims_.begin(), t.dims_.end());
+  };
+  // get the size of specified dimension
+  size_t operator[](size_t dim) const {
+    CHECK_GE(dim, (size_t)0);
+    CHECK_LT(dim, ndims_);
+    return dims_[dim];
+  }
+  // set the size of specified dimension
+  void setDim(size_t dim, size_t size) {
+    CHECK_GE(dim, (size_t)0);
+    CHECK_LT(dim, ndims_);
+    dims_[dim] = size;
+    numElements();
+  }
+  // number of dimensions of the tensor
+  size_t ndims() const { return ndims_; }
+  size_t getElements() const { return nelements_; }
+  bool operator==(const TensorShape& t) const {
+    if (ndims() != t.ndims()) return false;
+    for (size_t i = 0; i < ndims(); i++) {
+      if (dims_[i] != t.dims_[i]) return false;
+    }
+    return true;
+  }
+  bool operator!=(const TensorShape& t) const { return !(*this == t); }
+private:
+  // compute number of elements
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+  // init dims_
+  void initDims(size_t ndims) {
+    size_t count = ndims < 4 ? 4 : ndims;
+    dims_.assign(count, 1);
+  }
+  // number of dimensions
+  // ndims_ may be not equeal dims_.size()
+  size_t ndims_;
+  // number of elements
+  size_t nelements_;
+  std::vector<size_t> dims_;
+};
+}  // namespace paddle
--- a/paddle/function/TensorShapeTest.cpp
+++ b/paddle/function/TensorShapeTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "TensorShape.h"
+#include <gtest/gtest.h>
+namespace paddle {
+TEST(TensorShape, Constructor) {
+  TensorShape t1;
+  EXPECT_EQ(t1.ndims(), 0);
+  EXPECT_EQ(t1.getElements(), 0);
+  TensorShape t2(3);
+  EXPECT_EQ(t2.ndims(), 3);
+  EXPECT_EQ(t2.getElements(), 1);
+  TensorShape t3({8, 10});
+  EXPECT_EQ(t3.ndims(), 2);
+  EXPECT_EQ(t3.getElements(), 80);
+  TensorShape t4(t3);
+  EXPECT_EQ(t4.ndims(), t3.ndims());
+  EXPECT_EQ(t4.getElements(), t3.getElements());
+  TensorShape t5({1, 2, 3, 4, 5});
+  EXPECT_EQ(t5.ndims(), 5);
+  EXPECT_EQ(t5.getElements(), 120);
+}
+TEST(TensorShape, GetAndSet) {
+  TensorShape t({1, 2, 3});
+  EXPECT_EQ(t.ndims(), 3);
+  EXPECT_EQ(t.getElements(), 6);
+  EXPECT_EQ(t[1], 2);
+  t.setDim(1, 100);
+  EXPECT_EQ(t.getElements(), 300);
+  EXPECT_EQ(t[1], 100);
+}
+}  // namespace paddle
--- a/paddle/function/TensorType.h
+++ b/paddle/function/TensorType.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/math/Matrix.h"
+namespace paddle {
+enum ValueType {
+  VALUE_TYPE_INT32 = 0,
+  VALUE_TYPE_FLOAT = 1,
+  VALUE_TYPE_DOUBLE = 2,
+  VALUE_TYPE_BYTE = 3
+};
+enum DeviceType {
+  DEVICE_TYPE_UNSPECIFIED = 0,
+  DEVICE_TYPE_CPU = 1,
+  DEVICE_TYPE_GPU = 2
+};
+inline int sizeOfValuType(ValueType valueType) {
+  if (valueType == VALUE_TYPE_INT32) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_FLOAT) {
+    return 4;
+  } else if (valueType == VALUE_TYPE_DOUBLE) {
+    return 8;
+  } else {
+    LOG(FATAL) << "Unknown type: " << valueType;
+    return 0;
+  }
+}
+template <typename T>
+struct DataType;
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;
+};
+namespace detail {
+template <typename VType, DeviceType Device>
+struct MatrixT;
+template <>
+struct MatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuMatrix;
+};
+template <>
+struct MatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuMatrix;
+};
+template <>
+struct MatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+template <>
+struct MatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+template <typename VType, DeviceType Device>
+struct VectorT;
+template <>
+struct VectorT<real, DEVICE_TYPE_CPU> {
+  using type = CpuVector;
+};
+template <>
+struct VectorT<real, DEVICE_TYPE_GPU> {
+  using type = GpuVector;
+};
+template <>
+struct VectorT<int, DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+template <>
+struct VectorT<int, DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+}  // namespace detail
+template <typename VType, DeviceType DType>
+struct Tensor {
+  typedef typename detail::MatrixT<VType, DType>::type Matrix;
+  typedef typename detail::VectorT<VType, DType>::type Vector;
+};
+}  // namespace paddle
--- a/paddle/function/TensorTypeTest.cpp
+++ b/paddle/function/TensorTypeTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "TensorType.h"
+#include <gtest/gtest.h>
+namespace paddle {
+TEST(TensorType, Matrix) {
+  Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);
+  EXPECT_EQ(matrix.getHeight(), 100);
+  EXPECT_EQ(matrix.getWidth(), 200);
+  EXPECT_EQ(matrix.getElementCnt(), 100 * 200);
+  EXPECT_EQ(matrix.useGpu(), false);
+  Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);
+  EXPECT_EQ(testGpu.useGpu(), true);
+}
+TEST(TensorType, Vector) {
+  Tensor<real, DEVICE_TYPE_CPU>::Vector cpuVector(100);
+  Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);
+  EXPECT_EQ(cpuVector.useGpu(), false);
+  EXPECT_EQ(gpuVector.useGpu(), true);
+  EXPECT_EQ(cpuVector.getSize(), 100);
+  EXPECT_EQ(gpuVector.getSize(), 100);
+  Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);
+  Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);
+  EXPECT_EQ(cpuIVector.useGpu(), false);
+  EXPECT_EQ(gpuIVector.useGpu(), true);
+  EXPECT_EQ(cpuIVector.getSize(), 100);
+  EXPECT_EQ(gpuIVector.getSize(), 100);
+}
+TEST(TensorType, EmptyMatrix) {
+  CpuMatrix empty(nullptr, 0, 0);
+  CpuMatrix nonEmpty(10, 10);
+  EXPECT_EQ(empty.isEmpty(), true);
+  EXPECT_EQ(nonEmpty.isEmpty(), false);
+  CHECK(nonEmpty);
+  auto function = [](const CpuMatrix& matrix) {
+    if (matrix) {
+      EXPECT_NE(matrix.getData(), nullptr);
+    } else {
+      EXPECT_EQ(matrix.getData(), nullptr);
+    }
+  };
+  function(empty);
+  function(nonEmpty);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -110,9 +110,8 @@ void ContextProjection::forward() {
  size_t input_dim = in_->value->getWidth();
  size_t dim = out_->value->getWidth();
  CHECK_EQ(dim, input_dim * config_.context_length());
-  size_t batch_size = in_->value->getHeight();
+  // size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(static_cast<int>(forward_.size()), 1)
+  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
-      << "Only one forward function here";
  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
  bool is_padding = config_.trainable_padding();
@@ -120,14 +119,16 @@ void ContextProjection::forward() {
  auto w_ptr =
      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
  auto start_pos = in_->sequenceStartPositions;
-  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
-                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
+  BufferArgs inputs;
-                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+  BufferArgs outputs;
-                     Tensor(reinterpret_cast<real*>(
+  inputs.addArg(*in_->value);
-                                const_cast<int*>(start_pos->getData(useGpu_))),
+  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                            Dims{start_pos->getSize()})},
+                          w_ptr ? w_ptr->getHeight() : 0,
-                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
+                          input_dim));
-                    {});
+  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(*out_->value, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
  if (state_ && config_.context_start() < 0) {
    CHECK_EQ(1, in_->getNumSequences());
@@ -162,15 +163,17 @@ void ContextProjection::backward(const UpdateCallback& callback) {
  bool is_padding = config_.trainable_padding();
  auto start_pos = in_->sequenceStartPositions;
  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
-                             Dims{batch_size, input_dim}),
+  BufferArgs inputs;
-                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
+  BufferArgs outputs;
-                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+  inputs.addArg(CpuMatrix(
-                      Tensor(reinterpret_cast<real*>(
+      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
-                                 const_cast<int*>(start_pos->getData(useGpu_))),
+  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                             Dims{start_pos->getSize()})},
+                          w_ptr ? w_ptr->getHeight() : 0,
-                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
+                          input_dim));
-                     {});
+  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(*out_->grad, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
  if (config_.trainable_padding()) {
    weight_->getParameterPtr()->incUpdate(callback);

--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -59,7 +59,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
 void CMRProjectionNormLayer::forward(PassType passType) {
  Layer::forward(passType);
  /* malloc memory for the output_ if necessary */
  /* note: one sample correspond to one row */
  MatrixPtr input = inputLayers_[0]->getOutputValue();
@@ -67,34 +66,36 @@ void CMRProjectionNormLayer::forward(PassType passType) {
  int size = getSize();
  resetOutput(batchSize, size);
-  MatrixPtr outV = getOutputValue();
  Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
-  dims_ = {batchSize, channels_, imgSizeH_, imgSizeW_};
+  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
-  forward_[0]->calc(
-      {Tensor(input->getData(), dims_)},
+  // prepare forward arguments
-      {Tensor(outV->getData(), dims_), Tensor(denoms_->getData(), dims_)},
+  BufferArgs inputs;
-      {});
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
 }
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
  (void)callback;
-  if (NULL == inputLayers_[0]->getOutputGrad()) {
+  if (NULL == getInputGrad(0)) {
    return;
  }
-  /* Do derivation */
-  MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad();
+  // prepare backward arguments
-  MatrixPtr localGrad = getOutputGrad();
+  BufferArgs inputs;
-  MatrixPtr localOutV = getOutputValue();
+  BufferArgs outputs;
-  MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
-  backward_[0]->calc({Tensor(preOutV->getData(), dims_),
+  inputs.addArg(*getOutputGrad(), shape_);
-                      Tensor(localOutV->getData(), dims_),
+  inputs.addArg(*denoms_, shape_);
-                      Tensor(localGrad->getData(), dims_),
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
-                      Tensor(denoms_->getData(), dims_)},
-                     {Tensor(preOutGrad->getData(), dims_)},
+  backward_[0]->calc(inputs, outputs);
-                     {});
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -41,6 +41,6 @@ public:
  void backward(const UpdateCallback& callback = nullptr);
 protected:
-  Dims dims_;
+  TensorShape shape_;
 };
 }  // namespace paddle
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1311,7 +1311,9 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
  real* w = W.getData();
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
  real* output = getData();
  hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
 }
@@ -1324,7 +1326,9 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
  real* wgrad = data_;
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
  hl_param_relu_backward_w(
      wgrad, ograd, input, numElements, numSamples, partial_sum);
 }
@@ -1336,7 +1340,9 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
  real* w = W.getData();
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
  hl_param_relu_backward_diff(
      ograd, input, w, diff, numElements, numSamples, partial_sum);
 }
@@ -3764,7 +3770,9 @@ void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
  real* w = W.getData();
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
  for (size_t n = 0, k = 0; n < numSamples; ++n) {
    for (size_t i = 0; i < numElements; ++i, ++k) {
      data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
@@ -3778,7 +3786,9 @@ void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
  real* wgrad = data_;
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
  for (size_t n = 0, k = 0; n < numSamples; ++n) {
    for (size_t i = 0; i < numElements; ++i, ++k) {
      wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
@@ -3793,7 +3803,9 @@ void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
  real* w = W.getData();
  size_t numElements = data.getWidth();
  size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
  for (size_t n = 0, k = 0; n < numSamples; ++n) {
    for (size_t i = 0; i < numElements; ++i, ++k) {
      diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -1091,6 +1091,10 @@ public:
      TensorCpuApply<real>(*this, expr);
    }
  }
+  bool isEmpty() const { return data_ == nullptr; }
+  explicit operator bool() const { return !isEmpty(); }
 };
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {

--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -224,10 +224,11 @@ void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
 }
 TEST(Matrix, paramRelu) {
-  for (auto height : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 100}) {
+    for (auto width : {10, 40, 100}) {
      for (auto w_height : {1, 2}) {
        for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
          testParamReluForward(height, width, w_height, w_width);
          testParamReluBackwardW(height, width, w_height, w_width);
        }

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -773,10 +773,11 @@ void testParamReluBackwardDiff(int height,
 }
 TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
-    for (auto width : {10, 100}) {
+    for (auto width : {10, 40, 100}) {
      for (auto w_height : {1, 2}) {
        for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
          testParamReluBackwardDiff(height, width, w_height, w_width);
        }
      }

--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -24,13 +24,15 @@ set(PSERVER_SOURCES
    BaseClient.cpp
    ParameterClient2.cpp
    ParameterServer2.cpp
-    SparseParameterDistribution.cpp)
+    SparseParameterDistribution.cpp
+    ParameterServerController.cpp)
 set(PSERVER_HEADERS
    BaseClient.h
    ParameterClient2.h
    ParameterServer2.h
-    SparseParameterDistribution.h)
+    SparseParameterDistribution.h
+    ParameterServerController.h)
 add_library(paddle_pserver STATIC
    ${PSERVER_SOURCES})

--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
-#include "paddle/utils/StringUtil.h"
+#include "ParameterServerController.h"
-#include "paddle/utils/Util.h"
-#include "ParameterServer2.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/Flags.h"
 using namespace paddle;  // NOLINT
 int main(int argc, char** argv) {
  initMain(argc, argv);
-  std::vector<std::string> devices;
+  std::unique_ptr<ParameterServerController> parameterServerPtr(
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
+      paddle::ParameterServerController::createFromGflags());
+  parameterServerPtr->start();
-  // round robin to loadbalance RDMA server ENGINE
+  parameterServerPtr->wait();
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-  if (FLAGS_nics.empty()) {
-    pservers.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (FLAGS_rdma_tcp == "rdma") {
-        pservers[i].reset(
-            new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      }
-      CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                 << FLAGS_port + i;
-      LOG(INFO) << "pserver started : " << FLAGS_port + i;
-      pservers[i]->start();
-    }
-  } else {
-    str::split(FLAGS_nics, ',', &devices);
-    pservers.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-        }
-        CHECK(pservers[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server" << devices[j]
-            << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << devices[j] << ":"
-                  << FLAGS_port + i;
-        pservers[i * devices.size() + j]->start();
-      }
-    }
-  }
-  for (auto& pserver : pservers) {
-    pserver->join();
-  }
  return 0;
 }
--- a/paddle/pserver/ParameterServerController.cpp
+++ b/paddle/pserver/ParameterServerController.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ParameterServerController.h"
+namespace paddle {
+ParameterServerController::ParameterServerController(
+    const ParameterServerConfig& config) {
+  // round robin to load balance RDMA server ENGINE
+  std::vector<std::string> devices;
+  int rdmaCpu = 0;
+  int onlineCpus = rdma::numCpus();
+  int numPorts = config.ports_num() + config.ports_num_for_sparse();
+  if (config.nics().empty()) {
+    parameterServers_.resize(numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      if (config.rdma_tcp() == "rdma") {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
+        rdmaCpu = rdmaCpu % onlineCpus;
+      } else {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i));
+      }
+      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
+                                             "server on port "
+                                          << config.port() + i;
+    }
+  } else {
+    str::split(config.nics(), ',', &devices);
+    parameterServers_.resize(devices.size() * numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      for (size_t j = 0; j < devices.size(); ++j) {
+        if (config.rdma_tcp() == "rdma") {
+          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
+              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
+          rdmaCpu = rdmaCpu % onlineCpus;
+        } else {
+          parameterServers_[i * devices.size() + j].reset(
+              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
+        }
+        CHECK(parameterServers_[i * devices.size() + j]->init())
+            << "Fail to initialize parameter server with device " << devices[j]
+            << config.port() + i;
+      }
+    }
+  }
+}
+ParameterServerController::~ParameterServerController() { this->wait(); }
+ParameterServerController* ParameterServerController::createFromGflags() {
+  ParameterServerConfig config;
+  config.set_nics(FLAGS_nics);
+  config.set_rdma_tcp(FLAGS_rdma_tcp);
+  config.set_port(FLAGS_port);
+  config.set_ports_num(FLAGS_ports_num);
+  config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
+  return create(config);
+}
+ParameterServerController* ParameterServerController::create(
+    const ParameterServerConfig& config) {
+  return new ParameterServerController(config);
+}
+void ParameterServerController::start() {
+  LOG(INFO) << "number of parameterServer instances: "
+            << parameterServers_.size();
+  int i = 0;
+  for (const auto& parameterServer : parameterServers_) {
+    LOG(INFO) << "Starting parameterServer[" << i << "]";
+    parameterServer->start();
+    i++;
+  }
+}
+void ParameterServerController::wait() {
+  int i = 0;
+  for (const auto& parameterServer : parameterServers_) {
+    LOG(INFO) << "Waiting parameterServer[" << i << "]";
+    parameterServer->join();
+    i++;
+  }
+}
+}  // namespace paddle
--- a/paddle/pserver/ParameterServerController.h
+++ b/paddle/pserver/ParameterServerController.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ParameterServer2.h"
+#include "ParameterServerConfig.pb.h"
+#include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+namespace paddle {
+/**
+ * @brief ParameterServerController is used for create, init and manage multi
+ * parameter server instances. The num of the instances is decided by port
+ * num(the ports number for parameter send) and network devices configured
+ * by gflags or proto.
+ */
+class ParameterServerController final {
+public:
+  DISABLE_COPY(ParameterServerController);
+  /**
+   * @brief Ctor, Create a ParameterServerController from ParameterServerConfig.
+   */
+  explicit ParameterServerController(const ParameterServerConfig& config);
+  /**
+   * @brief Dtor.
+   */
+  ~ParameterServerController();
+  /**
+   * @brief create ParameterServerController from gflags, this is used for
+   * compatibility with the old usage of configuration by gflags.
+   */
+  static ParameterServerController* createFromGflags();
+  /**
+   * @brief create ParameterServerController with ParameterServerConfig, remove
+   * gflags from ParameterServer. Init all ParameterServer2 instances according
+   * to
+   * the config.
+   */
+  static ParameterServerController* create(const ParameterServerConfig& config);
+  /**
+   * @brief start all ParameterServer2 instances in this
+   * ParameterServerController.
+   */
+  void start();
+  /**
+   * @brief join and wait for all ParameterServer2 instances thread in this
+   * ParameterServerController.
+   */
+  void wait();
+private:
+  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
+};
+}  // namespace paddle
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in
    submit_local.sh
    @ONLY)
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
        RENAME paddle)
+configure_file(tools/usage_stat/usage.sh
+    usage.sh
+    @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin
+        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
+        RENAME paddle_usage)
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -122,6 +122,9 @@ case "$1" in
    "make_diagram")
        python -m paddle.utils.make_model_diagram ${@:2}
        ;;
+    "usage")
+        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        ;;
    "version")
        version
        ;;

--- a/paddle/scripts/tools/usage_stat/usage.sh
+++ b/paddle/scripts/tools/usage_stat/usage.sh
+#!/bin/bash
+ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
+KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
+# paddle config home dir, same as paddle
+PADDLE_CONF_HOME="$HOME/.config/paddle"
+# api url, mirror url(s) will be append later
+PD_URLS="http://api.paddlepaddle.org/version"
+usage()
+{
+    echo "Usage: `basename $0` [options]"
+    echo "Options:"
+    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
+    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
+    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
+    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
+    echo "  -v, -i                            Verbose output and interact with user when necessary"
+    echo " --help                             display this help message"
+}
+eval set -- "${ARGPARSE}"
+while true; do
+    case "$1" in
+        -l|--log-file)
+            log_file=$2
+            shift 2
+            ;;
+        -e|--exit-code)
+            exit_code=$2
+            shift 2
+            ;;
+        -u|--git-user)
+            github_user=$2
+            shift 2
+            ;;
+        -n|--task-name)
+            task=$2
+            shift 2
+            ;;
+        -v|-i)
+            v=1
+            shift
+            ;;
+        --dry-run)
+            dry_run=1
+            shift
+            ;;
+        --)
+            shift
+            break
+            ;;
+        --help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Invalid option $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+# parse the log_file to get the time costs
+if [ -s "${log_file}" ]; then
+    duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
+    {if(index($2,":")==3){
+        t=substr($2,1,8);
+        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
+        if(sec<last_sec-600){day+=1;sec+=86400;}
+        last_sec=sec;
+        if(min_sec==0 || min_sec>sec){min_sec=sec;}
+        if(max_sec==0 || max_sec<sec){max_sec=sec;}
+    }}
+    END{print max_sec-min_sec}' ${log_file}`
+else
+    duration=-1
+fi
+if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
+# try find the user/email if not given
+if [ -z "${github_user}" ]; then
+    # search for cached username
+    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
+        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
+        github_user=`cat ${PADDLE_CONF_HOME}/github_user`
+    else
+        # search the github-user from git config
+        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
+        git_username=`git config --get user.name 2>/dev/null`
+        git_url=`git config --get remote.origin.url 2>/dev/null`
+        if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
+            # under a git url, like https://github.com/user_xxx/proj_yyy.git
+            if [ "${v}" = "1" ]; then echo " from github url..."; fi
+            github_user=`echo ${git_url} | cut -d "/" -f 4`
+            if [ "${github_user}" = "PaddlePaddle" ]; then
+                github_user=
+            fi
+        fi
+        if [ -n "${git_username}" -a -z "${github_user}" ]; then
+            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
+            github_user=${git_username}
+        fi
+    fi
+fi
+# allow user to set the user name, if it's not found
+if [ -z "${github_user}" -a "${v}" = "1" ]; then
+    read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
+    github_user=${REPLY}
+    if [ -z "${github_user}" ]; then
+        # empty input, consider as one anonymous user
+        github_user="${KEEP_ANONYMOUS}"
+    fi
+fi
+if [ -n "${github_user}" -a -z "${dry_run}" ]; then
+    # valid user and not in dry-run mode, then save to cache
+    mkdir -p ${PADDLE_CONF_HOME}
+    echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
+fi
+if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
+if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
+    # anonymous user should keep the var empty.
+    github_user=
+fi
+# read local paddle version
+paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
+if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
+# read local system time
+system_time=`date "+%Y%m%d%H%M%S"`
+if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
+# make empty job_name as default value.
+if [ -z "${task}" ]; then
+    task="(unknown_task)"
+fi
+if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
+# concat the curl command
+params="content={\"data_type\":\"usage\",\
+\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
+\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
+\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
+}&type=1"
+curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
+ -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
+if [ "${dry_run}" = "1" ]; then
+    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
+    echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
+    exit 0
+else
+    for u in ${PD_URLS}; do
+        curl_cmd="${curl_cmd_prefix} ${u}"
+        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
+        ${curl_cmd} >/dev/null 2>&1
+        if [ $? -eq 0 ]; then
+            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
+            exit 0
+        else
+            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
+        fi
+    done
+    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
+    exit 1
+fi
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -6,14 +6,14 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
  export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
  export PYTHONHOME=/opt/python/2.7.12
  export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON
+  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
  NRPOC=`nproc`
  make -j $NPROC
  make coveralls
  sudo make install
 elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
  export PYTHONPATH=/usr/local/lib/python2.7/site-packages
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON
+  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
  NPROC=`sysctl -n hw.ncpu`
  make -j $NPROC
 fi
--- a/paddle/scripts/travis/common.sh
+++ b/paddle/scripts/travis/common.sh
@@ -2,3 +2,5 @@
 set -e
 mkdir -p ../../../build
 cd ../../../build
+mkdir -p $HOME/third_party
+EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party"
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -4,7 +4,7 @@
 source ./common.sh
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 # check websites for broken links

--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -70,6 +70,6 @@ setup(name="py_paddle",
  include_dirs = include_dirs,
  install_requires = [
    'numpy>=1.8.0',      # The numpy is required.
-    'protobuf>=2.4.1' # The paddle protobuf version
+    'protobuf>=3.0.0'    # The paddle protobuf version
  ],
 )
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/pserver/ParameterServer2.h"
+#include <fenv.h>
-#include "paddle/utils/Common.h"
+#include "paddle/pserver/ParameterServerController.h"
 #include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/StringUtil.h"
 #include "ParamUtil.h"
 #include "Trainer.h"
-#include "paddle/pserver/RDMANetwork.h"
 DEFINE_bool(start_pserver, false, "Whether to start pserver");
 DECLARE_int32(gpu_id);
@@ -38,54 +36,11 @@ int main(int argc, char** argv) {
  initMain(argc, argv);
  initPython(argc, argv);
-  std::vector<std::unique_ptr<ParameterServer2>> pservers;
+  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
-  std::vector<std::string> devices;
  if (FLAGS_start_pserver) {
-    // round robin to loadbalance RDMA server ENGINE
+    parameterServerPtr.reset(
-    int rdmaCpu = 0;
+        paddle::ParameterServerController::createFromGflags());
-    int onlineCpus = rdma::numCpus();
+    parameterServerPtr->start();
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    if (FLAGS_nics.empty()) {
-      pservers.resize(numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i));
-        }
-        CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                   << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << FLAGS_port + i;
-        pservers[i]->start();
-      }
-    } else {
-      str::split(FLAGS_nics, ',', &devices);
-      pservers.resize(devices.size() * numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        for (size_t j = 0; j < devices.size(); ++j) {
-          if (FLAGS_rdma_tcp == "rdma") {
-            pservers[i * devices.size() + j].reset(new ParameterServer2(
-                getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-            rdmaCpu = rdmaCpu % onlineCpus;
-          } else {
-            pservers[i * devices.size() + j].reset(
-                new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-          }
-          CHECK(pservers[i * devices.size() + j]->init())
-              << "Fail to initialize parameter server" << devices[j]
-              << FLAGS_port + i;
-          LOG(INFO) << "pserver started : " << devices[j] << ":"
-                    << FLAGS_port + i;
-          pservers[i * devices.size() + j]->start();
-        }
-      }
-    }
  }
  Trainer trainer;
  auto config = TrainerConfigHelper::createFromFlags();

--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -4,7 +4,8 @@ set(proto_filenames
    ModelConfig.proto
    ParameterConfig.proto
    ParameterService.proto
-    TrainerConfig.proto)
+    TrainerConfig.proto
+    ParameterServerConfig.proto)
 set(PROTO_GEN)
 set(PROTO_GEN_PY)

--- a/proto/ParameterServerConfig.proto
+++ b/proto/ParameterServerConfig.proto
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+package paddle;
+/**
+ * Configuration structure for ParameterClient2.
+ */
+message ParameterClientConfig {
+  required int32 trainer_id = 1;
+}
+/**
+ * Configuration structure for ParameterServer2.
+ */
+message ParameterServerConfig {
+  // The ports number for parameter send,
+  // increment based on default port number
+  required int32 ports_num = 1 [default = 1];
+  // The ports number for parameter send,
+  // increment based on default (port + ports_num
+  required int32 ports_num_for_sparse = 2 [default = 0];
+  // network device name for pservers
+  required string nics = 3 [default = "xgbe0,xgbe1"];
+  required string rdma_tcp = 4 [default = "tcp"];
+  // Listening port for pserver
+  required int32 port = 5 [default = 20134];
+  // number of gradient servers
+  required int32 num_gradient_servers = 6 [default = 1];
+  // number of threads for sync op exec
+  required int32 pserver_num_threads = 7 [default = 1];
+  // control config_.async_lagged_grad_discard_ratio() min value
+  required double async_lagged_ratio_min = 8 [default = 1.0];
+  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
+  // use it as defalut value
+  required double async_lagged_ratio_default = 9 [default = 1.5];
+}
\ No newline at end of file