diff --git a/.dockerignore b/.dockerignore
deleted file mode 120000
index 3e4e48b0b5fe6b468434d6767749b399319f2da2..0000000000000000000000000000000000000000
--- a/.dockerignore
+++ /dev/null
@@ -1 +0,0 @@
-.gitignore
\ No newline at end of file
diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..2b2e74053d33cb6d2878fd3d6da48fa344172f63
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,15 @@
+*.DS_Store
+build/
+*.user
+.vscode
+.idea
+.project
+.cproject
+.pydevproject
+Makefile
+.test_env/
+third_party/
+*~
+bazel-*
+
+!build/*.deb
diff --git a/.gitignore b/.gitignore
index 6aae076a49012b032b8fc0f1dc02c2714fb7b4a3..2b30f7938c8a1672acd0a14b7051af12c37889fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,5 +1,6 @@
 *.DS_Store
 build/
+build_doc/
 *.user
 
 .vscode
@@ -7,6 +8,7 @@ build/
 .project
 .cproject
 .pydevproject
+.settings/
 Makefile
 .test_env/
 third_party/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a6e45028ebc3f53ea20806f0dd2a7acc820607fe..9b138576fcc695408c4cc0a03d227da7d0c6f440 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,21 +1,21 @@
 -   repo: https://github.com/Lucas-C/pre-commit-hooks.git
-    sha: c25201a00e6b0514370501050cf2a8538ac12270
+    sha: v1.0.1
     hooks:
     -   id: remove-crlf
-        files: (?!.*third_party)^.*$
+        files: ^(?!.*(third_party|book)).*$  # skip any path containing third_party or book
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
-    - id: yapf
-      files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$  # Bazel BUILD files follow Python syntax.
+    -   id: yapf
+        files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
+    sha: 5bf6c09bfa1297d3692cadd621ef95f1284e33c0
     hooks:
     -   id: check-added-large-files
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
-        files: (?!.*third_party)^.*$
+        files: ^(?!.*(third_party|book)).*$  # skip any path containing third_party or book
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
diff --git a/.travis.yml b/.travis.yml
index bc91855a8571985a386b698e7ecd43bad20477ac..d73fd39aa7a2ee87c0e31436ffc14df2213134c9 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,39 +1,38 @@
 language: cpp
-cache: ccache
+cache:
+  directories:
+    - $HOME/third_party
+    - $HOME/.ccache
+    - $HOME/.cache/pip
 sudo: required
 dist: trusty
 os:
   - linux
-  - osx
 env:
   - JOB=DOCS
   - JOB=BUILD_AND_TEST
   - JOB=PRE_COMMIT
-matrix:
-  exclude:
-    - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux.
-    - os: osx
-      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux
-
 addons:
   apt:
     packages:
       - gcc-4.8
       - g++-4.8
-      - wget
+      - gfortran-4.8
       - git
       - build-essential
-      - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
+      - python-numpy
+      - python-wheel
+      - libboost-dev
       - curl
       - swig
       - graphviz
       - clang-format-3.8
       - automake
       - libtool
+      - ccache
 before_install:
   - |
     if [ ${JOB} == "BUILD_AND_TEST" ]; then
@@ -46,12 +45,16 @@ before_install:
         fi
       fi
     fi
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install --upgrade pip
-  - pip install wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
+  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python 
+  # protobuf version.
+  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
+  - |
+    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
-  - paddle/scripts/travis/main.sh
+  - | 
+    timeout 2580 paddle/scripts/travis/main.sh  # 43min timeout
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
   email:
     on_success: change
diff --git a/AUTHORS.md b/AUTHORS.md
new file mode 100644
index 0000000000000000000000000000000000000000..d5baee2161aa1d5360056e03ca67d5b2fe9ff7d2
--- /dev/null
+++ b/AUTHORS.md
@@ -0,0 +1,28 @@
+| Github account | name |
+|---|---|
+| reyoung | Yang Yu |
+| gangliao | Gang Liao |
+| luotao01 | Tao Luo |
+| jacquesqiao | Long-Fei Qiao |
+| qingqing01 | Qing-Qing Dang |
+| hedaoyuan | Dao-Yuan He |
+| wangyang59 | Yang Wang |
+| QiJune | Jun Qi |
+| tianbingsz | Tian-Bing Xu |
+| cxwangyi, yiwangbaidu, wangkuiyi | Yi Wang |
+| typhoonzero | Yi Wu |
+| backyes | Yan-Fei Wang |
+| pengli09 | Peng Li |
+| livc | Zhao Li |
+| Xreki | Yi-Qun Liu |
+| Yancey1989 | Xu Yan |
+| emailweixu | Wei Xu |
+| wen-bo-yang | Wen-Bo Yang |
+| helinwang | He-Lin Wang |
+| lcy-seso | Ying Cao |
+| Zrachel | Rui-Qing Zhang |
+| Haichao-Zhang | Hai-Chao Zhang |
+| gongweibao | Wei-Bao Gong |
+| lzhao4ever | Liang Zhao |
+| zhouxiao-coder | Xiao Zhou |
+| lipeng-unisound | Peng Li |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9ed757bd1bfbd23ca24445c15e7cf8e13860d26f..aa4f1eaff9125f2ff11a6ef83e503acd56b79e21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,48 +1,89 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License
 
-cmake_minimum_required(VERSION 3.0)
+set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
+set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
 
-project(paddle CXX C)
+include(system)
+
+if(ANDROID)
+    cmake_minimum_required(VERSION 3.7)
+else()
+    cmake_minimum_required(VERSION 3.0)
+endif()
 
-set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
+project(paddle CXX C)
 
 find_package(Sphinx)
-find_package(CUDA QUIET)
+if(NOT CMAKE_CROSSCOMPILING)
+    find_package(CUDA QUIET)
+endif(NOT CMAKE_CROSSCOMPILING)
 find_package(Git REQUIRED)
 find_package(Threads REQUIRED)
 
-include(system)
 include(simd)
 
-###################### Configurations ############################
-option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
-option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
-option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
-option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
-option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ON)
-option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
-option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
-option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ON)
-option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
-option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ON)
-option(ON_TRAVIS "Running test on travis-ci or not." OFF)
-option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
-option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
+################################ Configurations #######################################
+option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
+option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
+option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
+option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
+option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
+option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
+option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
+option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
+option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
+
+# CMAKE_BUILD_TYPE
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+      FORCE)
+endif()
+
+if(ANDROID)  # Android cross-build: check the API level, then force-disable unsupported options
+    if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+        message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 21")
+    endif()
+
+    set(WITH_GPU OFF CACHE BOOL      # BOOL keeps the cache type created by option() above
+        "Disable GPU when cross-compiling for Android" FORCE)
+    set(WITH_AVX OFF CACHE BOOL
+        "Disable AVX when cross-compiling for Android" FORCE)
+    set(WITH_PYTHON OFF CACHE BOOL
+        "Disable PYTHON when cross-compiling for Android" FORCE)
+    set(WITH_RDMA OFF CACHE BOOL
+        "Disable RDMA when cross-compiling for Android" FORCE)
+endif(ANDROID)
+
+set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE STRING
+  "A path setting third party libraries download & build directories.")
+
+if (WITH_C_API AND WITH_PYTHON)
+  message(WARNING "It is suggest not embedded a python interpreter in Paddle "
+    "when using C-API. It will give an unpredictable behavior when using a "
+    "different Python interpreter from compiling.")
+endif()
+
+########################################################################################
 
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
@@ -53,6 +94,7 @@ include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
+include(external/any)       # download libn::any
 
 include(package)            # set paddle packages
 include(cpplint)            # set paddle c++ style
@@ -63,8 +105,6 @@ include(flags)              # set paddle compile flags
 include(cudnn)              # set cudnn libraries
 include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
-include(python_module)      # set python module
-
 include(configure)          # add paddle env configuration
 
 include_directories("${PROJ_ROOT}")
@@ -72,14 +112,21 @@ include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 
 set(EXTERNAL_LIBS
-    # have not include gtest here.
     ${GFLAGS_LIBRARIES}
     ${GLOG_LIBRARIES}
     ${CBLAS_LIBRARIES}
     ${PROTOBUF_LIBRARY}
     ${ZLIB_LIBRARIES}
+    ${PYTHON_LIBRARIES}
 )
 
+if(WITH_GPU)  # add the CUDA libraries to the EXTERNAL_LIBS list defined above
+    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+    if(NOT WITH_DSO)  # WITH_DSO is OFF: also list cuDNN, cuBLAS and cuRAND
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+    endif(NOT WITH_DSO)
+endif(WITH_GPU)
+
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..ad0d086d3c65b5901178aa681aa36ccc0ea0c246
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,72 @@
+# An image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
+FROM nvidia/cuda:8.0-cudnn5-devel-ubuntu16.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+# ENV variables
+ARG WITH_GPU
+ARG WITH_AVX
+ARG WITH_DOC
+ARG WITH_STYLE_CHECK
+
+ENV WOBOQ OFF
+ENV WITH_GPU=${WITH_GPU:-OFF}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV WITH_DOC=${WITH_DOC:-OFF}
+ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
+
+ENV HOME /root
+# Add bash enhancements
+COPY ./paddle/scripts/docker/root/ /root/
+
+RUN apt-get update && \
+    apt-get install -y \
+    git python-pip python-dev openssh-server bison  \
+    wget unzip tar xz-utils bzip2 gzip coreutils  \
+    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
+    python-numpy python-matplotlib gcc g++ \
+    automake locales clang-format-3.8 swig doxygen cmake  \
+    liblapack-dev liblapacke-dev libboost-dev \
+    clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    apt-get clean -y
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+# FIXME: due to a temporary ipykernel dependency issue, pin the ipykernel and
+# jupyter versions until jupyter fixes this issue.
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
+    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0'
+
+# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+RUN apt-get update && apt-get install -y libssl-dev libffi-dev
+RUN pip install certifi urllib3[secure]
+
+# Install woboq_codebrowser to /woboq: clone, then configure and build inside a subshell
+RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
+    (cd /woboq && \
+     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+           -DCMAKE_BUILD_TYPE=Release . && \
+     make)
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+RUN mkdir /var/run/sshd
+RUN echo 'root:root' | chpasswd
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+EXPOSE 22
+
+# By default, the development image runs the build script
+CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
diff --git a/README.md b/README.md
index 8a8e15841586ae6a01bb93e94f6074189f556f5a..bcc24b84128df282a2e3f0bc62aafe1ffe172338 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -59,36 +59,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
     the capability of PaddlePaddle to make a huge impact for your product.
 
 ## Installation
-Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
-pre-built packages (**docker image**, **deb package**) or
-directly build on **Linux** and **Mac OS X** from the source code.
+
+It is recommended to check out the
+[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+before looking into the
+[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
 
 ## Documentation
-Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
 
-- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) <br>
-   You can follow the quick start tutorial to learn how use PaddlePaddle
-   step-by-step.
+We provide [English](http://www.paddlepaddle.org/develop/doc/) and
+[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+
+- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+
+  You might want to start from this online interactive book that can run in Jupyter Notebook.
+
+- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+
+  You can run distributed training jobs on MPI clusters.
+
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
 
-- [Example and Demo](http://paddlepaddle.org/doc/demo/) <br>
-   We provide five demos, including: image classification, sentiment analysis,
-   sequence to sequence model, recommendation, semantic role labeling.
+   You can also run distributed training jobs on Kubernetes clusters.
 
-- [Distributed Training](http://paddlepaddle.org/doc/cluster) <br>
-  This system supports training deep learning models on multiple machines
-  with data parallelism.
+- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
 
-- [Python API](http://paddlepaddle.org/doc/ui/) <br>
-   PaddlePaddle supports using either Python interface or C++ to build your
-   system. We also use SWIG to wrap C++ source code to create a user friendly
-   interface for Python. You can also use SWIG to create interface for your
-   favorite programming language.
+   Our new API enables much shorter programs.
 
-- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html) <br>
-   We sincerely appreciate your interest and contributions. If you would like to
-   contribute, please read the contribution guide.
+- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
 
-- [Source Code Documents](http://paddlepaddle.org/doc/source/) <br>
+   We appreciate your contributions!
 
 ## Ask Questions
 
diff --git a/RELEASE.cn.md b/RELEASE.cn.md
new file mode 100755
index 0000000000000000000000000000000000000000..5deaf230a8f5dd3089993f0fc79b9460fd049750
--- /dev/null
+++ b/RELEASE.cn.md
@@ -0,0 +1,80 @@
+# v0.10.0版本
+
+我们非常高兴发布了PaddlePaddle V0.10.0版，并开发了新的[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
+
+- 旧的Python API由于难以学习和使用已经过时了。使用旧版本的API至少需要两份python文件，分别是定义数据生成器和定义网络拓扑结构的文件。用户通过运行`paddle_trainer`的C++程序来启动PaddlePaddle任务，该程序调用Python解释器来运行定义网络拓扑结构的文件，然后通过迭代加载数据生成器提供的小批量数据启动训练循环。这与Python的现代编辑方式不符，比如Jupyter Notebook。
+
+- 新版的API被称为 *V2 API*，允许我们在单个.py文件中，通过编辑更短的Python程序来定义网络结构和数据。此外，该Python程序也可以在Jupyter Notebook中运行，因为PaddlePaddle可以作为共享库来被Python程序加载和使用。
+
+基于新的API，我们提供了一个在线的学习文档 [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) 及其[中文版本](http://book.paddlepaddle.org/)。
+
+我们还致力于迭代更新新版API的在线文档，并将新版API引入分布式集群（包括MPI和Kubernetes）训练中。我们将在下一个版本中发布更多的内容。
+
+## 新特点
+
+* 发布新版[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/)。
+* 发布深度学习系列课程 [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) 及其[中文版本](http://book.paddlepaddle.org/)。
+* 支持矩形输入的CNN。
+* 为seqlastin和seqfirstin提供stride pooling。
+* 在`trainer_config_helpers`中暴露`seq_concat_layer/seq_reshape_layer`。
+* 添加公共数据集包：CIFAR，MNIST，IMDB，WMT14，CONLL05，movielens，imikolov。
+* 针对Single Shot Multibox Detection增加 Prior box layer。
+* 增加光滑的L1损失。
+* 在V2 API中增加 data reader 创建器和修饰器。
+* 增加cmrnorm投影的CPU实现。
+
+
+## 改进
+
+* 提供`paddle_trainer`的Python virtualenv支持。
+* 增加代码自动格式化的pre-commit hooks。
+* 升级protobuf到3.x版本。
+* 在Python数据生成器中提供一个检测数据类型的选项。
+* 加速GPU中average层的后向反馈计算。
+* 细化文档。
+* 使用Travis-CI检查文档中的死链接。
+* 增加解释`sparse_vector`的示例。
+* 在layer_math.py中添加ReLU。
+* 简化Quick Start示例中的数据处理流程。
+* 支持CUDNN Deconv。
+* 在v2 API中增加数据feeder。
+* 在情感分析示例的演示中增加对标准输入流中样本的预测。
+* 提供图像预处理的多进程接口。
+* 增加V1 API的基准文档。
+* 在`layer_math.py`中增加ReLU。
+* 提供公共数据集的自动下载包。
+* 将`Argument::sumCost`重新命名为`Argument::sum`，并暴露给python。
+* 为矩阵相关的表达式评估增加一个新的`TensorExpression`实现。
+* 增加延迟分配来优化批处理多表达式计算。
+* 增加抽象的类函数及其实现：
+  * `PadFunc` 和 `PadGradFunc`。
+  * `ContextProjectionForwardFunc` 和 `ContextProjectionBackwardFunc`。
+  * `CosSimBackward` 和 `CosSimBackwardFunc`。
+  * `CrossMapNormalFunc` 和 `CrossMapNormalGradFunc`。
+  * `MulFunc`。
+* 增加`AutoCompare`和`FunctionCompare`类，使得编写比较gpu和cpu版本函数的单元测试更容易。
+* 生成`libpaddle_test_main.a`并删除测试文件内的主函数。
+* 支持PyDataProvider2中numpy的稠密向量。
+* 清理代码库，删除一些复制粘贴的代码片段：
+  * 增加`SparseRowMatrix`的抽样类`RowBuffer`。
+  * 清理`GradientMachine`的接口。
+  * 在layer中增加`override`关键字。
+  * 简化`Evaluator::create`，使用`ClassRegister`来创建`Evaluator`。
+* 下载演示的数据集时检查MD5校验。
+* 添加`paddle::Error`，用于替代Paddle中的`LOG(FATAL)`。
+
+
+## 错误修复
+
+* 检查`recurrent_group`的layer输入类型。
+* 不要用.cu源文件运行`clang-format`。
+* 修复`LogActivation`的使用错误。
+* 修复运行`test_layerHelpers`多次的错误。
+* 修复seq2seq示例超出消息大小限制的错误。
+* 修复在GPU模式下dataprovider转换的错误。
+* 修复`GatedRecurrentLayer`中的错误。
+* 修复在测试多个模型时`BatchNorm`的错误。
+* 修复paramRelu在单元测试时崩溃的错误。
+* 修复`CpuSparseMatrix`编译时相关的警告。
+* 修复`MultiGradientMachine`在`trainer_count > batch_size`时的错误。
+* 修复`PyDataProvider2`阻止异步加载数据的错误。
diff --git a/RELEASE.md b/RELEASE.md
index a8a245ab442ba0fc63d1f1fda932e7590a6fe4ca..146f7afa7dfbc152500b82fde28445ae3155c16c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,3 +1,104 @@
+# Release v0.10.0
+
+We are glad to release version 0.10.0.  In this version, we are happy to release the new 
+[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+
+- Our old Python API is kind of out of date.  It's hard to learn and hard to
+  use.  To write a PaddlePaddle program using the old API, we'd have to write
+  at least two Python files: one `data provider` and another one that defines
+  the network topology.  Users start a PaddlePaddle job by running the
+  `paddle_trainer` C++ program, which calls Python interpreter to run the
+  network topology configuration script and then start the training loop,
+  which iteratively calls the data provider function to load minibatches.
+  This prevents us from writing a Python program in a modern way, e.g., in the
+  Jupyter Notebook.
+  
+- The new API, which we often refer to as the *v2 API*, allows us to write
+  much shorter Python programs to define the network and the data in a single
+  .py file.  Also, this program can run in Jupyter Notebook, since the entry
+  point is in Python program and PaddlePaddle runs as a shared library loaded
+  and invoked by this Python program.
+  
+Based on the new API, we delivered an online interactive
+book, [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+and [its Chinese version](http://book.paddlepaddle.org/).
+
+We also worked on updating our online documentation to describe the new API.
+But this is an ongoing work.  We will release more documentation improvements
+in the next version.
+
+We also worked on bringing the new API to distributed model training (via MPI and
+Kubernetes).  This work is ongoing. We will release more about it in the next
+version.
+
+## New Features
+
+* We release [new Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+* Deep Learning 101 book in [English](http://book.paddlepaddle.org/index.en.html) and [Chinese](http://book.paddlepaddle.org/).
+* Support rectangle input for CNN.
+* Support stride pooling for seqlastin and seqfirstin.
+* Expose `seq_concat_layer/seq_reshape_layer` in `trainer_config_helpers`.
+* Add dataset package: CIFAR, MNIST, IMDB, WMT14, CONLL05, movielens, imikolov.
+* Add Priorbox layer for Single Shot Multibox Detection. 
+* Add smooth L1 cost.
+* Add data reader creator and data reader decorator for v2 API.
+* Add the CPU implementation of cmrnorm projection.
+
+## Improvements
+
+* Support Python virtualenv for `paddle_trainer`.
+* Add pre-commit hooks, used to automatically format our code.
+* Upgrade protobuf to version 3.x.
+* Add an option to check data type in Python data provider.
+* Speed up the backward pass of the average layer on GPU.
+* Documentation refinement.
+* Check dead links in documents using Travis-CI.
+* Add an example for explaining `sparse_vector`.
+* Add ReLU in layer_math.py
+* Simplify data processing flow for Quick Start.
+* Support CUDNN Deconv.
+* Add data feeder in v2 API.
+* Support predicting the samples from sys.stdin for sentiment demo.
+* Provide multi-process interface for image preprocessing.
+* Add benchmark document for v1 API.
+* Add ReLU in `layer_math.py`.
+* Add packages for automatically downloading public datasets.
+* Rename `Argument::sumCost` to `Argument::sum` since class `Argument` has nothing to do with cost.
+* Expose Argument::sum to Python
+* Add a new `TensorExpression` implementation for matrix-related expression evaluations.
+* Add lazy assignment for optimizing the calculation of a batch of multiple expressions.
+* Add abstract class `Function` and its implementation:
+  * `PadFunc` and `PadGradFunc`.
+  * `ContextProjectionForwardFunc` and `ContextProjectionBackwardFunc`.
+  * `CosSimBackward` and `CosSimBackwardFunc`.
+  * `CrossMapNormalFunc` and `CrossMapNormalGradFunc`.
+  * `MulFunc`.
+* Add class `AutoCompare` and `FunctionCompare`, which make it easier to write unit tests for comparing gpu and cpu version of a function.
+* Generate `libpaddle_test_main.a` and remove the main function inside the test file.
+* Support dense numpy vector in PyDataProvider2.
+* Clean code base, remove some copy-n-pasted code snippets:
+  * Extract `RowBuffer` class for `SparseRowMatrix`.
+  * Clean the  interface of `GradientMachine`.
+  * Use `override` keyword in layer.
+  * Simplify `Evaluator::create`, use `ClassRegister` to create `Evaluator`s.
+* Check MD5 checksum when downloading demo's dataset.
+* Add `paddle::Error`, which is intended to replace `LOG(FATAL)` in Paddle.
+
+## Bug Fixes
+
+* Check layer input types for `recurrent_group`.
+* Don't run `clang-format` with .cu source files.
+* Fix bugs with `LogActivation`.
+* Fix the bug that runs `test_layerHelpers` multiple times.
+* Fix the bug that the seq2seq demo exceeds protobuf message size limit.
+* Fix the bug in dataprovider converter in GPU mode.
+* Fix a bug in `GatedRecurrentLayer`.
+* Fix bug for `BatchNorm` when testing more than one model.
+* Fix broken unit test of paramRelu.
+* Fix some compile-time warnings about `CpuSparseMatrix`.
+* Fix `MultiGradientMachine` error when `trainer_count > batch_size`.
+* Fix bugs that prevent asynchronous data loading in `PyDataProvider2`.
+
 # Release v0.9.0
 
 ## New Features:
diff --git a/authors b/authors
deleted file mode 100644
index ab4d3118ff1f7e94677c89073c4ea05bf991165e..0000000000000000000000000000000000000000
--- a/authors
+++ /dev/null
@@ -1,53 +0,0 @@
-Cao, Ying
-Cheng, Yujuan
-Dang, Qingqing
-Dong, Tengfei
-Du, Dalong
-Feng, Shouqiang
-Gao, Haoyuan
-Han, Baochang
-Han, Jinchen
-Hao, Nanyu
-He, Daoyuan
-He, Zhengyan
-Hou, Jue
-Huang, Chang
-Huang, Zhiheng
-Hu, Na
-Kong, Qi
-Liao, Gang
-Li, Bo
-Li, Jiajie
-Li, Jing
-Li, Lei
-Li, Peng
-Liu, Sheng
-Liu, Yuan
-Li, Yuze
-Luo, Heng
-Luo, Tao
-Lyu, Qin
-Mao, Hongyue
-Qian, Xiaojun
-Qi, Jun
-Qin, Duohao
-Shen, Guolong
-Shi, Guangchuan
-Song, Xiang
-Wang, Jiang
-Wang, Yanfei
-Wang, Yong
-Weng, Renliang
-Xu, Tianbing
-Xu, Wei
-Xu, Xingyu
-Yan, Chong
-Yan, Chunwei
-Yang, Yi
-Yu, Yang
-Yu, Yinan
-Zhang, Jian
-Zhang, Ruiqing
-Zhang, Weide
-Zhao, Liang
-Zhou, Jie
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index d319442ef10b38b9edf5844e5540a92c7094c7ce..f74cd4ff8c9c2c52319b18ac37264167b3718eae 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
     ${source}
     ${destination}
     COMMENT "Generating sphinx documentation: ${builder}"
-    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
+    COMMAND cd ${destination} && ln -sf ./index_*.html index.html
     )
 
   set_property(
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 4e1ae7dc81231943c4bf3db4d4ac6f073f4fd1c4..913f711afff3b8f9f77b8da978a3b9e7165d0077 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -5,7 +5,7 @@
 # If any cblas implementation found, the following variable will be set.
 #    CBLAS_PROVIDER  # one of MKL, ATLAS, OPENBLAS, REFERENCE
 #    CBLAS_INC_DIR   # the include directory for cblas.
-#    CBLAS_LIBS      # a list of libraries should be linked by paddle. 
+#    CBLAS_LIBS      # a list of libraries should be linked by paddle.
 #                    # Each library should be full path to object file.
 #
 # User should set one of MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT, REFERENCE_CBLAS_ROOT
@@ -16,11 +16,12 @@
 set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
+set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
+set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
 
-find_path(MKL_INCLUDE_DIR mkl.h PATHS
+find_path(MKL_INC_DIR mkl.h PATHS
   ${MKL_ROOT}/include)
-find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
+find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
   ${MKL_ROOT}/include)
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
   ${MKL_ROOT}/lib
@@ -32,17 +33,18 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
   ${MKL_ROOT}/lib
   ${MKL_ROOT}/lib/intel64)
 
-
-if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
+if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
+  set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER MKL)
-  set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
-  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
-          ${MKL_SEQUENTIAL_LIB}
-          ${MKL_CORE_LIB})
+  set(CBLAS_INC_DIR ${MKL_INC_DIR} ${MKL_LAPACK_INC_DIR})
+  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64} ${MKL_SEQUENTIAL_LIB} ${MKL_CORE_LIB})
+
   add_definitions(-DPADDLE_USE_MKL)
-  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  set(CBLAS_FOUND ON)
-  return() # return file.
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found MKL (include: ${MKL_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
+  return()
 endif()
 
 ## Then find atlas.
@@ -58,22 +60,26 @@ set(ATLAS_LIB_SEARCH_PATHS
         /usr/lib/atlas
         /usr/lib/atlas-base   # special for ubuntu 14.04.
     )
-find_path(ATLAS_INC_DIR NAMES cblas.h 
+find_path(ATLAS_INC_DIR NAMES cblas.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
-find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3 
+find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
+find_library(ATLAS_CLAPACK_LIB NAMES lapack_atlas liblapack_atlas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
 
-if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
+if(ATLAS_CLAPACK_INC_DIR AND ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_CLAPACK_LIB)
+  set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
-  add_definitions(-DPADDLE_USE_ATLAS)  
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  set(CBLAS_FOUND ON)
+  set(CBLAS_LIBRARIES ${ATLAS_CLAPACK_LIB} ${ATLAS_CBLAS_LIB})
+
+  add_definitions(-DPADDLE_USE_ATLAS)
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found ATLAS (include: ${ATLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})")
   return()
 endif()
 
@@ -98,12 +104,17 @@ find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
 find_library(OPENBLAS_LIB NAMES openblas
   PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
 
-if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
+if(OPENBLAS_LAPACKE_INC_DIR AND OPENBLAS_INC_DIR AND OPENBLAS_LIB)
+  set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER OPENBLAS)
-  set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
+  set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR} ${OPENBLAS_LAPACKE_INC_DIR})
   set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
-  set(CBLAS_FOUND ON)
+
+  add_definitions(-DPADDLE_USE_OPENBLAS)
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found OpenBLAS (include: ${OPENBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})")
   return()
 endif()
 
@@ -111,7 +122,7 @@ endif()
 ## Then find the reference-cblas.  www.netlib.org/blas/
 
 
-set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH 
+set(REFERENCE_CBLAS_ROOT $ENV{REFERENCE_CBLAS_ROOT} CACHE PATH
   "Folder contains reference-cblas")
 set(REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS
   ${REFERENCE_CBLAS_ROOT}/include
@@ -132,9 +143,10 @@ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
         ${REFERENCE_CBLAS_LIB_SEARCH_PATHS})
 
 if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
+  set(CBLAS_FOUND ON)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
   set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
-  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
-  set(CBLAS_FOUND ON)
+  add_definitions(-DPADDLE_USE_REFERENCE_CBLAS)
+  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
 endif()
diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake
index 968d41801d73c4082d2673efe415c1cdd0305b5e..900f59d4cb83bc9ce1893b2d3bd95f5a08b164bb 100644
--- a/cmake/ccache.cmake
+++ b/cmake/ccache.cmake
@@ -1,9 +1,9 @@
 # Use ccache if found ccache program
 
-find_program(CCACHE_FOUND ccache)
+find_program(CCACHE_PATH ccache)
 
-if(CCACHE_FOUND)
+if(CCACHE_PATH)
     message(STATUS "Ccache is founded, use ccache to speed up compile.")
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache)
-    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache)
-endif(CCACHE_FOUND)
\ No newline at end of file
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH})
+    set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH})
+endif(CCACHE_PATH)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index ae0ec01d94da49f23b56f7d34f862ca57fb39b18..5e507e78f74eee885922f502f35e3c15fafb622d 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+if(NOT WITH_PYTHON)
+    add_definitions(-DPADDLE_NO_PYTHON)
+endif(NOT WITH_PYTHON)
+
 if(WITH_DSO)
     add_definitions(-DPADDLE_USE_DSO)
 endif(WITH_DSO)
@@ -28,6 +32,14 @@ if(NOT WITH_PROFILER)
     add_definitions(-DPADDLE_DISABLE_PROFILER)
 endif(NOT WITH_PROFILER)
 
+if(NOT CMAKE_CROSSCOMPILING)
+    if(WITH_AVX AND AVX_FOUND)
+        set(SIMD_FLAG ${AVX_FLAG})
+    elseif(SSE3_FOUND)
+        set(SIMD_FLAG ${SSE3_FLAG})
+    endif()
+endif()
+
 if(NOT WITH_GPU)
     add_definitions(-DPADDLE_ONLY_CPU)
     add_definitions(-DHPPL_STUB_FUNC)
@@ -44,21 +56,12 @@ else()
         message(FATAL_ERROR "Paddle need cudnn to compile")
     endif()
 
-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
+    set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SIMD_FLAG}")
 
     # Include cuda and cudnn
     include_directories(${CUDNN_INCLUDE_DIR})
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
index 9be7643819efdde3f42e4d39b2849ecc17e0d9fb..ca1471cabb57c0795ee193493d2e60bb5bd9e1cc 100644
--- a/cmake/coveralls.cmake
+++ b/cmake/coveralls.cmake
@@ -61,7 +61,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
     endif()
 endfunction()
 
-if(ON_COVERALLS)
+if(WITH_COVERAGE)
     set(CMAKE_BUILD_TYPE "Debug")
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
index ae3530c3a0eeb79ddbcbf4f2e99be75aa7968a2f..4641184fcf5273b884524d9b9444209ffb65e000 100644
--- a/cmake/coverallsGcovJsons.cmake
+++ b/cmake/coverallsGcovJsons.cmake
@@ -110,14 +110,13 @@ endmacro()
 
 # Get the coverage data.
 file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
-message("GCDA files:")
+message("Process GCDA files:")
+message("===============================")
 
 # Get a list of all the object directories needed by gcov
 # (The directories the .gcda files and .o files are found in)
 # and run gcov on those.
 foreach(GCDA ${GCDA_FILES})
-	message("Process: ${GCDA}")
-	message("------------------------------------------------------------------------------")
 	get_filename_component(GCDA_DIR ${GCDA} PATH)
 
 	#
@@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES})
 	# If -p is not specified then the file is named only "the_file.c.gcov"
 	#
 	execute_process(
-		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
+		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} OUTPUT_QUIET # no shell runs here, so ">/dev/null" would reach gcov as a literal argument
 		WORKING_DIRECTORY ${GCDA_DIR}
 	)
 endforeach()
@@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
 	set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
 
 	# Generate the final JSON for this file.
-	message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
 	string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
 	set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
 endforeach()
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 38c636b30edc0af1c07255814e8bc2b1ad9514da..02a5c0b2c9be782c459a255c6ffd6ba6441f2693 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -34,7 +34,7 @@ set(IGNORE_PATTERN
 #
 # first argument: target name to attach
 # rest arguments: source list to check code style.
-# 
+#
 # NOTE: If WITH_STYLE_CHECK is OFF, then this macro just do nothing.
 macro(add_style_check_target TARGET_NAME)
     if(WITH_STYLE_CHECK)
@@ -48,13 +48,17 @@ macro(add_style_check_target TARGET_NAME)
                 if(filename MATCHES ${pattern})
                     message(STATUS "DROP LINT ${filename}")
                     set(LINT OFF)
-                endif() 
+                endif()
             endforeach()
             if(LINT MATCHES ON)
-                add_custom_command(TARGET ${TARGET_NAME}
+                get_filename_component(base_filename ${filename} NAME)
+                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
+                add_custom_command(OUTPUT ${CUR_GEN}
                     PRE_BUILD
                     COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}" ${filename}
+                                "--filter=${STYLE_FILTER}"
+                                "--write-success=${CUR_GEN}" ${filename}
+                    DEPENDS ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
         endforeach()
diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake
index e5b59be19369d3ba3e852624426b95ae365e7357..af9be86961833dcd62371227165d411a3b61d79e 100644
--- a/cmake/cudnn.cmake
+++ b/cmake/cudnn.cmake
@@ -1,3 +1,7 @@
+if(NOT WITH_GPU)
+    return()
+endif()
+
 set(CUDNN_ROOT "" CACHE PATH "CUDNN ROOT")
 find_path(CUDNN_INCLUDE_DIR cudnn.h
     PATHS ${CUDNN_ROOT} ${CUDNN_ROOT}/include
@@ -11,6 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     ${CUDNN_ROOT}
     ${CUDNN_ROOT}/lib64
     ${CUDNN_ROOT}/lib
+    ${CUDNN_ROOT}/lib/x86_64-linux-gnu
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
diff --git a/cmake/external/any.cmake b/cmake/external/any.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..8116f235d535917c03deb646ff4ec083a0cdadc7
--- /dev/null
+++ b/cmake/external/any.cmake
@@ -0,0 +1,20 @@
+INCLUDE(ExternalProject)
+
+SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
+
+INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/linb_any)
+
+ExternalProject_Add(
+    linb_any
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/thelink2012/any.git"
+    GIT_TAG         "8fef1e93710a0edf8d7658999e284a1142c4c020"
+    PREFIX          ${ANY_SOURCE_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_COMMAND     ""
+    INSTALL_COMMAND   ""
+    TEST_COMMAND      ""
+)
+
+add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
index d38b7d1ba2a74d5bb46d0c07e3abe6832d4c8af3..0afb3ab9af48046af01f03838eefa0bd2fcb2821 100644
--- a/cmake/external/gflags.cmake
+++ b/cmake/external/gflags.cmake
@@ -14,8 +14,8 @@
 
 INCLUDE(ExternalProject)
 
-SET(GFLAGS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
-SET(GFLAGS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gflags)
+SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
+SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
 SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
 IF(WIN32)
     set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
@@ -31,9 +31,17 @@ ExternalProject_Add(
     GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
     PREFIX          ${GFLAGS_SOURCES_DIR}
     UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DBUILD_TESTING=OFF
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GFLAGS_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
 LIST(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
index bec69f3ddf093b62f084f9080fa1fe4398c93e9a..4a9e2ecc6bbe74c5856a55fb0c982777d7ac25b7 100644
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,8 +14,8 @@
 
 INCLUDE(ExternalProject)
 
-SET(GLOG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/glog)
-SET(GLOG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/glog)
+SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog)
+SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
 SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
 
 IF(WIN32)
@@ -29,13 +29,23 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
 ExternalProject_Add(
     glog
     ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS gflags
     GIT_REPOSITORY  "https://github.com/google/glog.git"
     PREFIX          ${GLOG_SOURCES_DIR}
     UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    CMAKE_ARGS      -DWITH_GFLAGS=OFF
+    CMAKE_ARGS      -DWITH_GFLAGS=ON
+    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
     CMAKE_ARGS      -DBUILD_TESTING=OFF
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
 LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
index 2fcb7893fa30e7fcd84b9e860217f82cf01bf89e..49c7d71443cda700a14af6be65ff6658eec7229f 100644
--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -16,8 +16,8 @@ IF(WITH_TESTING)
     ENABLE_TESTING()
     INCLUDE(ExternalProject)
 
-    SET(GTEST_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gtest)
-    SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gtest)
+    SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
+    SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
     SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
 
     INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
@@ -41,11 +41,19 @@ IF(WITH_TESTING)
         GIT_TAG         "release-1.8.0"
         PREFIX          ${GTEST_SOURCES_DIR}
         UPDATE_COMMAND  ""
-        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+        CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR}
         CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
         CMAKE_ARGS      -DBUILD_GMOCK=ON
         CMAKE_ARGS      -Dgtest_disable_pthreads=ON
         CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+        CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+        CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+                         -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                         -DCMAKE_BUILD_TYPE:STRING=Release
     )
     LIST(APPEND external_project_dependencies gtest)
 ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 677999cc9f5d320b4ac18fe0cc0d67a8e9921f8f..18ac74aa6f7531c4001fe91960f8332619c99342 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -17,8 +17,8 @@ INCLUDE(cblas)
 IF(NOT ${CBLAS_FOUND})
     INCLUDE(ExternalProject)
 
-    SET(CBLAS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
-    SET(CBLAS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/openblas)
+    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
+    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
     SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
 
     IF(WIN32)
@@ -30,17 +30,17 @@ IF(NOT ${CBLAS_FOUND})
     ExternalProject_Add(
         openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG             v0.2.19
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        CONFIGURE_COMMAND   ""
-        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
-        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_LAPACK=1 DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
     )
-
     LIST(APPEND external_project_dependencies openblas)
-ENDIF()
+ENDIF(NOT ${CBLAS_FOUND})
 
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 2f2769b4c628d8570c335d344cbf608bda84206f..a9db4e8ba410c718f1ee4d69f4551e2773c60125 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -14,49 +14,67 @@
 
 INCLUDE(ExternalProject)
 
-SET(PROTOBUF_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
-SET(PROTOBUF_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/protobuf)
-SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+set(PROTOBUF_VERSION 3.1)
+FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
 
-INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+IF(PROTOBUF_FOUND)
+    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)
+    STRING(REGEX MATCH "[0-9]+\\.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") # keep only <major>.<minor>; the dot is escaped so it matches a literal "."
+    IF ("${PROTOBUF_VERSION}" VERSION_LESS "3.1.0") # too old: fall through to the bundled ExternalProject build
+        SET(PROTOBUF_FOUND OFF)
+    ENDIF()
+ENDIF(PROTOBUF_FOUND)
+
+IF(NOT PROTOBUF_FOUND)
+    SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
+    SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
+    SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+
+    IF(WIN32)
+        SET(PROTOBUF_LITE_LIBRARY
+            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
+        SET(PROTOBUF_LIBRARY
+            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
+        SET(PROTOBUF_PROTOC_LIBRARY
+            "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
+        SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
+    ELSE(WIN32)
+        SET(PROTOBUF_LITE_LIBRARY
+            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+        SET(PROTOBUF_LIBRARY
+            "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+        SET(PROTOBUF_PROTOC_LIBRARY
+            "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+        SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
+    ENDIF(WIN32)
 
-IF(WIN32)
-  SET(PROTOBUF_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
-  SET(PROTOBUF_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
-  SET(PROTOBUF_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
-  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
-ELSE(WIN32)
-  IF(${HOST_SYSTEM} STREQUAL "centos")
-    SET(LIB "lib64")
-  ELSE()
-    SET(LIB "lib")
-  ENDIF()
-  SET(PROTOBUF_LITE_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
-  SET(PROTOBUF_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
-  SET(PROTOBUF_PROTOC_LIBRARY
-        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
-  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
-ENDIF(WIN32)
-
-ExternalProject_Add(
-  protobuf
-  ${EXTERNAL_PROJECT_LOG_ARGS}
-  PREFIX          ${PROTOBUF_SOURCES_DIR}
-  UPDATE_COMMAND  ""
-  DEPENDS         zlib
-  GIT_REPOSITORY  "https://github.com/google/protobuf.git"
-  GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
-  CONFIGURE_COMMAND
-    ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-    -Dprotobuf_BUILD_TESTS=OFF
-    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
-    -DCMAKE_BUILD_TYPE=Release
-    -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-)
-
-LIST(APPEND external_project_dependencies protobuf)
+    ExternalProject_Add(
+        protobuf
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX          ${PROTOBUF_SOURCES_DIR}
+        UPDATE_COMMAND  ""
+        DEPENDS         zlib
+        GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+        CONFIGURE_COMMAND
+        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+            -Dprotobuf_BUILD_TESTS=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+            -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_INSTALL_LIBDIR=lib
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            -DCMAKE_VERBOSE_MAKEFILE:BOOL=OFF
+            -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+            -DZLIB_ROOT:STRING=${ZLIB_ROOT}
+    )
+
+    LIST(APPEND external_project_dependencies protobuf)
+ENDIF(NOT PROTOBUF_FOUND)
+
+INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index e4c570479f682e951413017b256a8e16dfce625b..f4d0daab06c9fcf17f4af59c25f62b415074a52f 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -1,11 +1,11 @@
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-# 
+#
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
-# 
+#
 # http://www.apache.org/licenses/LICENSE-2.0
-# 
+#
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,192 +13,217 @@
 # limitations under the License.
 
 INCLUDE(ExternalProject)
+INCLUDE(python_module)
+
+FIND_PACKAGE(PythonInterp 2.7)
+IF(WITH_PYTHON)
+    FIND_PACKAGE(PythonLibs 2.7)
+ENDIF(WITH_PYTHON)
+
+SET(py_env "")
+SET(USE_VIRTUALENV_FOR_TEST 1)
+IF(PYTHONINTERP_FOUND)
+    find_python_module(pip REQUIRED)
+    find_python_module(numpy REQUIRED)
+    find_python_module(wheel REQUIRED)
+    find_python_module(google.protobuf REQUIRED)
+    FIND_PACKAGE(NumPy REQUIRED)
+    # Quote the version: unquoted, IF() truth-tests a value like 2.6.1 as an (undefined) variable name, so the guard never fired.
+    IF(NOT "${PY_GOOGLE.PROTOBUF_VERSION}" STREQUAL "" AND "${PY_GOOGLE.PROTOBUF_VERSION}" VERSION_LESS "3.0.0")
+        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, " "please use pip to upgrade protobuf. pip install -U protobuf")
+    ENDIF()
+ELSE(PYTHONINTERP_FOUND)
+    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")  # FATAL_ERROR stops configuration here; the bundled-Python setup that follows in this branch is not reached
+    ##################################### PYTHON ########################################
+    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
+    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
+    SET(_python_DIR ${PYTHON_INSTALL_DIR})
+
+    IF(UNIX)
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSEIF(WIN32)
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Unknown system !")
+    ENDIF()
 
-
-##################################### PYTHON ########################################
-SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python)
-SET(PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/python)
-SET(_python_DIR ${PYTHON_INSTALL_DIR})
-
-IF(UNIX)
-    SET(PYTHON_FOUND ON)
-    SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
-    SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
-    SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
-    SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
-ELSEIF(WIN32)
-    SET(PYTHON_FOUND ON)
-    SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
-    SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
-    SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
-    SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
-ELSE()
-    MESSAGE(FATAL_ERROR "Unknown system !")
-ENDIF()
-
-SET(py_env
-    PATH=${PYTHON_INSTALL_DIR}/bin/:$ENV{PATH}
-    PYTHONHOME=${PYTHON_INSTALL_DIR}
-    PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
-
-INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
-
-IF(APPLE)
-    LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
-        -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
-        )
-ENDIF()
-
-SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
-
-# Force Python build to "Release".
-IF(CMAKE_CONFIGURATION_TYPES)
-    SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})
-    SET(CMAKE_CFG_INTDIR "Release")
+    IF(APPLE)
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
+            )
+    ENDIF()
+
+    SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
+
+    # Force Python build to "Release".
+    IF(CMAKE_CONFIGURATION_TYPES)
+        SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})
+        SET(CMAKE_CFG_INTDIR "Release")
+    ELSE()
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            )
+    ENDIF()
+
+    ExternalProject_Add(python
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
+        PREFIX            ${PYTHON_SOURCES_DIR}
+        UPDATE_COMMAND    ""
+        CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
+        CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
+            -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
+            -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
+            -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
+            -DDOWNLOAD_SOURCES:BOOL=ON
+            -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
+        DEPENDS zlib
+    )
+
+    SET(py_env
+        PATH=${PYTHON_INSTALL_DIR}/bin
+        PYTHONHOME=${PYTHON_INSTALL_DIR}
+        PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
+    ####################################################################################
+
+    ##################################### SETUPTOOLS ###################################
+    SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
+    ExternalProject_Add(setuptools
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SETUPTOOLS_SOURCES_DIR}
+        URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python zlib
+    )
+    #####################################################################################
+
+    ##################################### SIX ###########################################
+    SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
+    ExternalProject_Add(six
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SIX_SOURCES_DIR}
+        URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python setuptools
+    )
+    #####################################################################################
+
+    ##################################### CYTHON ########################################
+    SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
+    ExternalProject_Add(cython
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX                ${CYTHON_SOURCES_DIR}
+        URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
+        GIT_TAG               0.25.2
+        BUILD_IN_SOURCE       1
+        CONFIGURE_COMMAND     ""
+        PATCH_COMMAND         ""
+        UPDATE_COMMAND        ""
+        INSTALL_COMMAND       ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python
+    )
+    ####################################################################################
+
+    ##################################### NUMPY ########################################
+    SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
+    SET(NUMPY_TAG_VERSION "v1.11.3")
+    SET(NUMPY_VERSION "1.11.3")
+
+    SET(EGG_NAME "")
+    SET(PYTHON_NUMPY_INCLUDE_DIR "")
+    IF(WIN32)
+        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
+    ELSE(WIN32)
+        IF(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
+        ELSE(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+        ENDIF(APPLE)
+
+        FOREACH(suffix x86_64 intel fat64 fat32 universal)
+            LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
+        ENDFOREACH()
+    ENDIF(WIN32)
+
+    ExternalProject_Add(numpy
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      https://github.com/numpy/numpy.git
+        GIT_TAG             ${NUMPY_TAG_VERSION}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        PREFIX              ${NUMPY_SOURCES_DIR}
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools cython
+    )
+    ####################################################################################
+
+    ##################################### WHEEL ########################################
+    SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
+    ExternalProject_Add(wheel
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
+        PREFIX              ${WHEEL_SOURCES_DIR}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        BUILD_COMMAND       ""
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools
+    )
+    ####################################################################################
+
+    ################################### PROTOBUF #######################################
+    SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
+    ExternalProject_Add(python-protobuf
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
+        URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
+        PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
+        BUILD_IN_SOURCE       1
+        PATCH_COMMAND         ""
+        CONFIGURE_COMMAND     ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python setuptools six
+    )
+    ####################################################################################
+
+    LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy)
+
+ENDIF(PYTHONINTERP_FOUND)
+
+IF(WITH_PYTHON)
+    INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+    INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
 ELSE()
-    LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
-        -DCMAKE_BUILD_TYPE:STRING=Release
-        )
+    SET(PYTHON_LIBRARIES "")
 ENDIF()
-
-ExternalProject_Add(python
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
-    PREFIX            ${PYTHON_SOURCES_DIR}
-    UPDATE_COMMAND    ""
-    CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
-    CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-    CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-    CMAKE_CACHE_ARGS
-        -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
-        -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
-        -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
-        -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-        -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
-        -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
-        -DDOWNLOAD_SOURCES:BOOL=ON
-        -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
-        ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
-        ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
-    DEPENDS zlib
-)
-####################################################################################
-
-##################################### SETUPTOOLS ###################################
-SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
-ExternalProject_Add(setuptools
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX              ${SETUPTOOLS_SOURCES_DIR}
-    URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
-    BUILD_IN_SOURCE     1
-    PATCH_COMMAND       ""
-    UPDATE_COMMAND      ""
-    CONFIGURE_COMMAND   ""
-    INSTALL_COMMAND     ""
-    BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-    DEPENDS             python zlib
-)
-#####################################################################################
-
-##################################### SIX ###########################################
-SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
-ExternalProject_Add(six
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX              ${SIX_SOURCES_DIR}
-    URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
-    BUILD_IN_SOURCE     1
-    PATCH_COMMAND       ""
-    UPDATE_COMMAND      ""
-    CONFIGURE_COMMAND   ""
-    INSTALL_COMMAND     ""
-    BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-    DEPENDS             python setuptools
-)
-#####################################################################################
-
-##################################### CYTHON ########################################
-SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
-ExternalProject_Add(cython
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${CYTHON_SOURCES_DIR}
-    URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
-    GIT_TAG               0.25.2
-    BUILD_IN_SOURCE       1
-    CONFIGURE_COMMAND     ""
-    PATCH_COMMAND         ""
-    UPDATE_COMMAND        ""
-    INSTALL_COMMAND       ""
-    BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-    DEPENDS               python
-)
-####################################################################################
-
-##################################### NUMPY ########################################
-SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
-SET(NUMPY_TAG_VERSION "v1.11.3")
-SET(NUMPY_VERSION "1.11.3")
-
-SET(EGG_NAME "")
-SET(PYTHON_NUMPY_INCLUDE_DIR "")
-IF(WIN32)
-    SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
-ELSE(WIN32)
-    IF(APPLE)
-        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
-    ELSE(APPLE)
-        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
-        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
-    ENDIF(APPLE)
-
-    FOREACH(suffix x86_64 intel fat64 fat32 universal)
-        LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
-    ENDFOREACH()
-ENDIF(WIN32)
-
-INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
-
-ExternalProject_Add(numpy
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY      https://github.com/numpy/numpy.git
-    GIT_TAG             ${NUMPY_TAG_VERSION}
-    CONFIGURE_COMMAND   ""
-    UPDATE_COMMAND      ""
-    PREFIX              ${NUMPY_SOURCES_DIR}
-    BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
-    INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-    BUILD_IN_SOURCE     1
-    DEPENDS             python setuptools cython
-)
-####################################################################################
-
-##################################### WHEEL ########################################
-SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
-ExternalProject_Add(wheel
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
-    PREFIX              ${WHEEL_SOURCES_DIR}
-    CONFIGURE_COMMAND   ""
-    UPDATE_COMMAND      ""
-    BUILD_COMMAND       ""
-    INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-    BUILD_IN_SOURCE     1
-    DEPENDS             python setuptools
-)
-####################################################################################
-
-################################### PROTOBUF #######################################
-SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
-ExternalProject_Add(python-protobuf
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
-    URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
-    PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
-    BUILD_IN_SOURCE       1
-    PATCH_COMMAND         ""
-    CONFIGURE_COMMAND     ""
-    BUILD_COMMAND         env PATH=${PROTOBUF_INSTALL_DIR}/bin:$ENV{PATH} ${py_env} ${PYTHON_EXECUTABLE} setup.py build
-    INSTALL_COMMAND       env PATH=${PROTOBUF_INSTALL_DIR}/bin:$ENV{PATH} ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-    DEPENDS               python setuptools six
-)
-
-LIST(APPEND external_project_dependencies python setuptools six cython numpy wheel python-protobuf)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
index 40088c65ef7166ddef52956a1a7470ccab8087c9..744c766ee7b067058b2cb4aa7f7b761cbb9778d4 100644
--- a/cmake/external/swig.cmake
+++ b/cmake/external/swig.cmake
@@ -18,8 +18,8 @@ IF(NOT SWIG_FOUND)
     # build swig as an external project
     INCLUDE(ExternalProject)
 
-    SET(SWIG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/swig)
-    SET(SWIG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/swig)
+    SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
+    SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
     SET(SWIG_TARGET_VERSION "3.0.2")
     SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
     SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
@@ -38,14 +38,6 @@ IF(NOT SWIG_FOUND)
         SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
         SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
     ELSE(WIN32)
-        # From PCRE configure
-        ExternalProject_Add(pcre
-            ${EXTERNAL_PROJECT_LOG_ARGS}
-            GIT_REPOSITORY https://github.com/svn2github/pcre.git
-            PREFIX ${SWIG_SOURCES_DIR}/pcre
-            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
-        )
-
         # swig uses bison find it by cmake and pass it down
         FIND_PACKAGE(BISON)
 
@@ -54,16 +46,11 @@ IF(NOT SWIG_FOUND)
             GIT_REPOSITORY      https://github.com/swig/swig.git
             GIT_TAG             rel-3.0.10
             PREFIX              ${SWIG_SOURCES_DIR}
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh
-            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig &&
-            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
-            ./configure
-                --prefix=${SWIG_INSTALL_DIR}
-                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
-            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
-            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
-            UPDATE_COMMAND  ""
-            DEPENDS pcre
+            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
+                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
+            BUILD_COMMAND       cd <SOURCE_DIR> && make
+            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
+            UPDATE_COMMAND      ""
         )
 
         SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_TARGET_VERSION})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index d90768b6f1576e6d469d91d694ae0b9d1c7e8384..293070c3cfcc1196001f64469f3254289b0de792 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -14,8 +14,8 @@
 
 INCLUDE(ExternalProject)
 
-SET(WARPCTC_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/warpctc)
-SET(WARPCTC_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/warpctc)
+SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
+SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
 SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
 
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
@@ -50,9 +50,19 @@ ExternalProject_Add(
     UPDATE_COMMAND  ""
     CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
     CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
     CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
     CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
+    CMAKE_ARGS      -DWITH_TORCH=OFF
+    CMAKE_ARGS      -DCMAKE_DISABLE_FIND_PACKAGE_Torch=ON
+    CMAKE_ARGS      -DBUILD_SHARED=ON
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_BUILD_TYPE:STRING=Release
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR}
 )
 
 LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
index 916f6816aae9938aad95ac527cf07ffbe38f7479..45ca5542b7dc30216b45487782f849b93c5f8fca 100644
--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -14,15 +14,15 @@
 
 INCLUDE(ExternalProject)
 
-SET(ZLIB_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)
-SET(ZLIB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/zlib)
+SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib)
+SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
 SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE)
 SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
 
 IF(WIN32)
   SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
 ELSE(WIN32)
-  set(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
 ENDIF(WIN32)
 
 INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
@@ -34,10 +34,18 @@ ExternalProject_Add(
     GIT_TAG         "v1.2.8"
     PREFIX          ${ZLIB_SOURCES_DIR}
     UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
+    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
     CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
     CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
     CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ZLIB_INSTALL_DIR}
+                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
+                     -DCMAKE_BUILD_TYPE:STRING=Release
 )
 
 LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0983d83b73a32d0615170155759d45001cc6ff54..7a996dea92b13bdac054a987a004a3d54ff02da2 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -2,12 +2,7 @@
 include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
-
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING 
-        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-        FORCE)
-endif()
+include(CheckTypeSize)
 
 function(CheckCompilerCXX11Flag)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
@@ -31,7 +26,7 @@ function(CheckCompilerCXX11Flag)
 endfunction()
 
 CheckCompilerCXX11Flag()
-LIST(APPEND CMAKE_CXX_FLAGS -std=c++11)
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
 
 # safe_set_flag
 #
@@ -89,6 +84,17 @@ if(NOT UINT64_MAX_EXISTS)
   endif()
 endif()
 
+SET(CMAKE_EXTRA_INCLUDE_FILES "pthread.h")  # extra header for the CHECK_TYPE_SIZE probes below
+CHECK_TYPE_SIZE(pthread_spinlock_t SPINLOCK_FOUND)  # result decides PADDLE_USE_PTHREAD_SPINLOCK below
+CHECK_TYPE_SIZE(pthread_barrier_t BARRIER_FOUND)  # result decides PADDLE_USE_PTHREAD_BARRIER below
+if(SPINLOCK_FOUND)
+  add_definitions(-DPADDLE_USE_PTHREAD_SPINLOCK)
+endif(SPINLOCK_FOUND)
+if(BARRIER_FOUND)
+  add_definitions(-DPADDLE_USE_PTHREAD_BARRIER)
+endif(BARRIER_FOUND)
+SET(CMAKE_EXTRA_INCLUDE_FILES "")  # clear the extra header so it does not leak into later checks
+
 # Common flags. the compiler flag used for C/C++ sources whenever release or debug
 # Do not care if this flag is support for gcc.
 set(COMMON_FLAGS
@@ -102,6 +108,7 @@ set(COMMON_FLAGS
     -Wno-unused-parameter
     -Wno-unused-function
     -Wno-error=literal-suffix
+    -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs)
 
 set(GPU_COMMON_FLAGS
@@ -111,6 +118,7 @@ set(GPU_COMMON_FLAGS
     -Wdelete-non-virtual-dtor
     -Wno-unused-parameter
     -Wno-unused-function
+    -Wno-error=sign-compare
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
     -Wno-error=unused-function  # Warnings in Numpy Header.
@@ -189,3 +197,4 @@ if(CUDA_ARCH)
 endif()
 
 set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
+
diff --git a/cmake/make_resource.py b/cmake/make_resource.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9241b0e3e36c2e79c79e46b4f9114b7f6947341
--- /dev/null
+++ b/cmake/make_resource.py
@@ -0,0 +1,11 @@
+import os
+import re
+import sys
+
+res = sys.argv[1]
+out = sys.argv[2]
+var = re.sub(r'[ .-]', '_', os.path.basename(res))
+
+open(out, "w").write("const unsigned char " + var + "[] = {" + ",".join([
+    "0x%02x" % ord(c) for c in open(res).read()
+]) + ",0};\n" + "const unsigned " + var + "_size = sizeof(" + var + ");\n")
diff --git a/cmake/package.cmake b/cmake/package.cmake
index 211593f358eb34cf1a5692697247511893dfeb93..ff49a2d08e8f6004320acfce266339aa301eb9c4 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -1,5 +1,4 @@
 set(CPACK_PACKAGE_NAME paddle)
-set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "")
 set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION})
 set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION})
 set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION})
@@ -10,8 +9,9 @@ set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64)
 set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev <paddle-dev@baidu.com>)
 set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle")
 set(CPACK_PACKAGE_DESCRIPTION "")
-set(CPACK_DEBIAN_PACKAGE_DEPENDS "libatlas3-base, libgflags2, libgoogle-glog0, libprotobuf8, libpython2.7, libstdc++6, python-numpy, python-pip, python-pip-whl, python-protobuf")
+set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
 set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
+set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
 set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst")
 #set(CPACK_GENERATOR "DEB")
 # Start cpack
diff --git a/cmake/python_module.cmake b/cmake/python_module.cmake
index 2eb3441428e8290b665e092f6e4b40e146ea5a52..1412b7f7f20600acf95a4a899f5e6529c3b67a35 100644
--- a/cmake/python_module.cmake
+++ b/cmake/python_module.cmake
@@ -26,5 +26,18 @@ function(find_python_module module)
     if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
         message(FATAL_ERROR "python module ${module} is not found")
     endif()
+
+    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
+        OUTPUT_VARIABLE _${module}_version
+        RESULT_VARIABLE _${module}_status
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(NOT _${module}_status)
+        set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
+            "Version of Python module ${module}")
+    endif(NOT _${module}_status)
+
     set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
+    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
 endfunction(find_python_module)
diff --git a/cmake/simd.cmake b/cmake/simd.cmake
index d380c996dfa95f0caa2b9cd9daa0ac9141e51fe0..46035a908b588861607a25d3a21cf34b7b6fd4b8 100644
--- a/cmake/simd.cmake
+++ b/cmake/simd.cmake
@@ -2,6 +2,7 @@
 # so that PaddlePaddle can unleash the vectorization power of muticore.
 
 INCLUDE(CheckCXXSourceRuns)
+INCLUDE(CheckCXXSourceCompiles)
 
 IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
     set(MMX_FLAG "-mmmx")
@@ -17,6 +18,8 @@ ELSEIF(MSVC)
     SET(AVX2_FLAG "/arch:AVX2")
 ENDIF()
 
+set(CMAKE_REQUIRED_FLAGS_RETAINED ${CMAKE_REQUIRED_FLAGS})
+
 # Check  MMX
 set(CMAKE_REQUIRED_FLAGS ${MMX_FLAG})
 CHECK_CXX_SOURCE_RUNS("
@@ -73,4 +76,5 @@ int main()
     return 0;
 }" AVX2_FOUND)
 
+set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
 mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 788db404ebfb6facbaedf2910186f3b1afe775c1..75a9d8fc25674e1dd0f5b73cd0ccde48204f63aa 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -12,6 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+# Detects the OS and sets appropriate variables.
+# CMAKE_SYSTEM_NAME only gives us a coarse-grained name,
+# but a distribution name such as centos is needed in some
+# cases to tell systems apart for customization.
+#
+# For instance, the protobuf library path is <install_dir>/lib64
+# on CentOS, but <install_dir>/lib on other systems.
+
 IF(WIN32)
     SET(HOST_SYSTEM "win32")
 ELSE(WIN32)
@@ -20,7 +28,13 @@ ELSE(WIN32)
         STRING(REGEX MATCH "[0-9]+.[0-9]+" VERSION "${MACOSX_VERSION}")
         SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
+        IF(NOT DEFINED ENV{MACOSX_DEPLOYMENT_TARGET})
+            # Set cache variable - end user may change this during ccmake or cmake-gui configure.
+            SET(CMAKE_OSX_DEPLOYMENT_TARGET ${MACOS_VERSION} CACHE STRING
+                "Minimum OS X version to target for deployment (at runtime); newer APIs weak linked. Set to empty string for default value.")
+        ENDIF()
     ELSE(APPLE)
+
         IF(EXISTS "/etc/issue")
             FILE(READ "/etc/issue" LINUX_ISSUE)
             IF(LINUX_ISSUE MATCHES "CentOS")
@@ -29,8 +43,24 @@ ELSE(WIN32)
                 SET(HOST_SYSTEM "debian")
             ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
                 SET(HOST_SYSTEM "ubuntu")
+            ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
+                SET(HOST_SYSTEM "redhat")
+            ELSEIF(LINUX_ISSUE MATCHES "Fedora")
+                SET(HOST_SYSTEM "fedora")
             ENDIF()
         ENDIF(EXISTS "/etc/issue")
+
+        IF(EXISTS "/etc/redhat-release")
+            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ENDIF()
+        ENDIF(EXISTS "/etc/redhat-release")
+
+        IF(NOT HOST_SYSTEM)
+            SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
+        ENDIF()
+
     ENDIF(APPLE)
 ENDIF(WIN32)
 
@@ -42,12 +72,18 @@ MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
 MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
 MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
 
+# Expose ANDROID when targeting Android. Quoting the expansion keeps IF()
+# well-formed for an empty value, so no separate DEFINED guard is needed.
+IF("${CMAKE_SYSTEM_NAME}" STREQUAL "Android")
+    SET(ANDROID TRUE)
+ENDIF()
+
 # external dependencies log output
 SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
     LOG_UPDATE      1     # Wrap update in script to log output
     LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
     LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     1     # Wrap install in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
 )
diff --git a/cmake/util.cmake b/cmake/util.cmake
index a19bf2a7998ed7772a66f6a7eb5f9e858b0e75a2..b828eef322bc570c07f5c357353641117a094c16 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -71,21 +71,10 @@ function(link_paddle_exe TARGET_NAME)
         generate_rdma_links()
     endif()
 
-    if(WITH_METRIC)
-        if(WITH_GPU)
-            set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu)
-        else()
-            set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric_cpu)
-        endif()
-    else()
-        set(METRIC_LIBS "")
-    endif()
-
     target_circle_link_libraries(${TARGET_NAME}
         ARCHIVE_START
         paddle_gserver
         paddle_function
-        ${METRIC_LIBS}
         ARCHIVE_END
         paddle_pserver
         paddle_trainer_lib
@@ -95,31 +84,16 @@ function(link_paddle_exe TARGET_NAME)
         paddle_parameter
         paddle_proto
         paddle_cuda
-        ${METRIC_LIBS}
         ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
         ${CMAKE_DL_LIBS}
         ${RDMA_LD_FLAGS}
         ${RDMA_LIBS})
 
-    if(WITH_PYTHON)
-        target_link_libraries(${TARGET_NAME}
-            ${PYTHON_LIBRARIES} util)
-    endif()
-
-    if(WITH_GPU)
-        if(NOT WITH_DSO OR WITH_METRIC)
-            target_link_libraries(${TARGET_NAME}
-                ${CUDNN_LIBRARY}
-                ${CUDA_curand_LIBRARY})
-            CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME})
-        endif()
+    if(ANDROID)
+        target_link_libraries(${TARGET_NAME} log)
+    endif(ANDROID)
 
-        check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME )
-        if(HAVE_CLOCK_GETTIME)
-            target_link_libraries(${TARGET_NAME} rt)
-        endif()
-    endif()
     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
@@ -164,17 +138,19 @@ macro(add_simple_unittest TARGET_NAME)
 endmacro()
 
 # Creates C resources file from files in given resource file
-function(create_resources res_file output)
-    # Create empty output file
-    file(WRITE ${output} "")
-    # Get short filename
-    string(REGEX MATCH "([^/]+)$" filename ${res_file})
-    # Replace filename spaces & extension separator for C compatibility
-    string(REGEX REPLACE "\\.| |-" "_" filename ${filename})
-    # Read hex data from file
-    file(READ ${res_file} filedata HEX)
-    # Convert hex data for C compatibility
-    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
-    # Append data to output file
-    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}0};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+function(create_resources res_file output_file)
+  add_custom_command(  # run with the interpreter CMake found, not `python` on PATH
+    OUTPUT ${output_file}
+    COMMAND ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file}
+    DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py)
+endfunction()
+
+
+# Registers a Python unit test that is launched through run_python_tests.sh,
+# which takes care of setting up the correct running environment.
+function(add_python_test TEST_NAME)
+    add_test(NAME ${TEST_NAME}
+        COMMAND bash ${PROJ_ROOT}/paddle/scripts/run_python_tests.sh
+        ${USE_VIRTUALENV_FOR_TEST} ${PYTHON_EXECUTABLE} ${ARGN}
+        WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
 endfunction()
diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d20540780becf504973a23b50445d4b65dc2ef
--- /dev/null
+++ b/demo/image_classification/api_v2_resnet.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+
+__all__ = ['resnet_cifar10']
+
+
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  active_type=paddle.activation.Relu(),
+                  ch_in=None):
+    tmp = paddle.layer.img_conv(
+        input=input,
+        filter_size=filter_size,
+        num_channels=ch_in,
+        num_filters=ch_out,
+        stride=stride,
+        padding=padding,
+        act=paddle.activation.Linear(),
+        bias_attr=False)
+    return paddle.layer.batch_norm(input=tmp, act=active_type)
+
+
+def shortcut(ipt, n_in, n_out, stride):
+    if n_in != n_out:
+        return conv_bn_layer(ipt, n_out, 1, stride, 0,
+                             paddle.activation.Linear())
+    else:
+        return ipt
+
+
+def basicblock(ipt, ch_out, stride):
+    ch_in = ch_out * 2  # NOTE(review): != ch_out, so shortcut always projects
+    tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
+    tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())
+    short = shortcut(ipt, ch_in, ch_out, stride)
+    return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())
+
+
+def layer_warp(block_func, ipt, features, count, stride):
+    tmp = block_func(ipt, features, stride)
+    for i in range(1, count):
+        tmp = block_func(tmp, features, 1)
+    return tmp
+
+
+def resnet_cifar10(ipt, depth=32):
+    # depth should be one of 20, 32, 44, 56, 110, 1202
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) / 6
+    nStages = {16, 64, 128}
+    conv1 = conv_bn_layer(
+        ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    pool = paddle.layer.img_pool(
+        input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
+    return pool
diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232
--- /dev/null
+++ b/demo/image_classification/api_v2_train.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import paddle.v2 as paddle
+
+from api_v2_vgg import vgg_bn_drop
+
+
+def main():
+    datadim = 3 * 32 * 32
+    classdim = 10
+
+    # Initialize PaddlePaddle: CPU only, single trainer
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(datadim))
+
+    # Add neural network config
+    # option 1. resnet (needs `from api_v2_resnet import resnet_cifar10`)
+    # net = resnet_cifar10(image, depth=32)
+    # option 2. vgg
+    net = vgg_bn_drop(image)
+
+    out = paddle.layer.fc(input=net,
+                          size=classdim,
+                          act=paddle.activation.Softmax())
+
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(classdim))
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create optimizer
+    momentum_optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
+        learning_rate=0.1 / 128.0,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=50000 * 100,
+        learning_rate_schedule='discexp',
+        batch_size=128)
+
+    # Event handler: log every 100th batch, run the test set after each pass
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(
+                reader=paddle.batch(
+                    paddle.dataset.cifar.test10(), batch_size=128),
+                feeding={'image': 0,
+                         'label': 1})
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    # Create trainer (event_handler above refers to it through its closure)
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=momentum_optimizer)
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10(), buf_size=50000),
+            batch_size=128),
+        num_passes=5,
+        event_handler=event_handler,
+        feeding={'image': 0,
+                 'label': 1})
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e0e6b93adde30425f17aa9cd07542275f4fec37
--- /dev/null
+++ b/demo/image_classification/api_v2_vgg.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+
+__all__ = ['vgg_bn_drop']
+
+
+def vgg_bn_drop(input):
+    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
+        return paddle.networks.img_conv_group(
+            input=ipt,
+            num_channels=num_channels,
+            pool_size=2,
+            pool_stride=2,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_type=paddle.pooling.Max())
+
+    conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
+    conv2 = conv_block(conv1, 128, 2, [0.4, 0])
+    conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
+    conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
+    conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
+
+    drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)
+    fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())
+    bn = paddle.layer.batch_norm(
+        input=fc1,
+        act=paddle.activation.Relu(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())
+    return fc2
diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py
index 9a86aafcb2fa4d4354d1dd9443c1b73ddcda980b..49c0ff600c40e0222093ff0a8a2f7e8e38ccba29 100755
--- a/demo/image_classification/prediction.py
+++ b/demo/image_classification/prediction.py
@@ -126,7 +126,7 @@ class ImageClassifier():
         # For oversampling, average predictions across crops.
         # If not, the shape of output[name]: (1, class_number),
         # the mean is also applicable.
-        return output[output_layer].mean(0)
+        return output[output_layer]['value'].mean(0)
 
     def predict(self, image=None, output_layer=None):
         assert isinstance(image, basestring)
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index 6fc11caf1c75192242482c2e85f8167eb9fba4ec..e45bd47ad5925c6674d628a70a7ad7c4d5d5c173 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e ${PIPESTATUS[0]} -n "image_classification_train" >/dev/null 2>&1  # $? would be tee's status
 
 python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ba971b3688ce3dec078998df2c0b183a4e449f8
--- /dev/null
+++ b/demo/introduction/api_train_v2.py
@@ -0,0 +1,58 @@
+import paddle.v2 as paddle
+import paddle.v2.dataset.uci_housing as uci_housing
+
+
+def main():
+    # Initialize PaddlePaddle: CPU only, single trainer
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # network config: linear regression from the 13-dim input x to scalar y
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+    y_predict = paddle.layer.fc(input=x,
+                                param_attr=paddle.attr.Param(name='w'),
+                                size=1,
+                                act=paddle.activation.Linear(),
+                                bias_attr=paddle.attr.Param(name='b'))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer
+    optimizer = paddle.optimizer.Momentum(momentum=0)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+    # event_handler: print cost every 100th batch; test every 10th pass
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+        if isinstance(event, paddle.event.EndPass):
+            if (event.pass_id + 1) % 10 == 0:
+                result = trainer.test(
+                    reader=paddle.batch(
+                        uci_housing.test(), batch_size=2),
+                    feeding={'x': 0,
+                             'y': 1})
+                print "Test %d, %.2f" % (event.pass_id, result.cost)
+
+    # training: 30 passes over shuffled uci_housing data, batch size 2
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                uci_housing.train(), buf_size=500),
+            batch_size=2),
+        feeding={'x': 0,
+                 'y': 1},
+        event_handler=event_handler,
+        num_passes=30)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
index b7bbb90ddd287e3e312a490b53924ae76fb20d2c..2ce6446d7c943ffc9bea8da43d153539f6f9f15f 100755
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -19,3 +19,4 @@ paddle train \
     --save_dir=./output \
     --num_passes=30 \
     2>&1 |tee 'train.log'
+paddle usage -l "train.log" -e ${PIPESTATUS[0]} -n "introduction" >/dev/null 2>&1  # $? would be tee's status
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
index ecafe955f9e5c1062168d5d7b6b4c639d6e72a99..651dfaa4b7b4873810a0b393655541a62d1a311b 100644
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -34,5 +34,5 @@ y_predict = fc_layer(
     size=1,
     act=LinearActivation(),
     bias_attr=ParamAttr(name='b'))
-cost = regression_cost(input=y_predict, label=y)
+cost = mse_cost(input=y_predict, label=y)
 outputs(cost)
diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore
index 8bd9837523ccf98e6e72d5b82934b7b104816217..7e61d5e3a0cabd46d4185454d46610ac2ee2e63f 100644
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
@@ -5,3 +5,6 @@ plot.png
 train.log
 *pyc
 .ipynb_checkpoints
+params.pkl
+params.tar
+params.tar.gz
diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py
index f301da382ff8a5bc16d9c18b956f78566ed4894f..ea1caa7dd9653a2cc2860ace736fe3d25a3767e0 100644
--- a/demo/mnist/api_train.py
+++ b/demo/mnist/api_train.py
@@ -6,33 +6,15 @@ passed to C++ side of Paddle.
 
 The user api could be simpler and carefully designed.
 """
-import py_paddle.swig_paddle as api
-from py_paddle import DataProviderConverter
-import paddle.trainer.PyDataProvider2 as dp
-import numpy as np
 import random
-from mnist_util import read_from_mnist
-from paddle.trainer_config_helpers import *
-
-
-def optimizer_config():
-    settings(
-        learning_rate=1e-4,
-        learning_method=AdamOptimizer(),
-        batch_size=1000,
-        model_average=ModelAverage(average_window=0.5),
-        regularization=L2Regularization(rate=0.5))
 
+import numpy as np
+import paddle.v2 as paddle_v2
+import py_paddle.swig_paddle as api
+from paddle.trainer_config_helpers import *
+from py_paddle import DataProviderConverter
 
-def network_config():
-    imgs = data_layer(name='pixel', size=784)
-    hidden1 = fc_layer(input=imgs, size=200)
-    hidden2 = fc_layer(input=hidden1, size=200)
-    inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
-    cost = classification_cost(
-        input=inference, label=data_layer(
-            name='label', size=10))
-    outputs(cost)
+from mnist_util import read_from_mnist
 
 
 def init_parameter(network):
@@ -75,19 +57,35 @@ def input_order_converter(generator):
 def main():
     api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
 
-    # get enable_types for each optimizer.
-    # enable_types = [value, gradient, momentum, etc]
-    # For each optimizer(SGD, Adam), GradientMachine should enable different
-    # buffers.
-    opt_config_proto = parse_optimizer_config(optimizer_config)
-    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
-    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
-    enable_types = _temp_optimizer_.getParameterTypes()
+    optimizer = paddle_v2.optimizer.Adam(
+        learning_rate=1e-4,
+        batch_size=1000,
+        model_average=ModelAverage(average_window=0.5),
+        regularization=L2Regularization(rate=0.5))
+
+    # Create Local Updater. Local means not run in cluster.
+    # For a cluster training, here we can change to createRemoteUpdater
+    # in future.
+    updater = optimizer.create_local_updater()
+    assert isinstance(updater, api.ParameterUpdater)
+
+    # define network
+    images = paddle_v2.layer.data(
+        name='pixel', type=paddle_v2.data_type.dense_vector(784))
+    label = paddle_v2.layer.data(
+        name='label', type=paddle_v2.data_type.integer_value(10))
+    hidden1 = paddle_v2.layer.fc(input=images, size=200)
+    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
+    inference = paddle_v2.layer.fc(input=hidden2,
+                                   size=10,
+                                   act=paddle_v2.activation.Softmax())
+    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
 
     # Create Simple Gradient Machine.
-    model_config = parse_network_config(network_config)
-    m = api.GradientMachine.createFromConfigProto(
-        model_config, api.CREATE_MODE_NORMAL, enable_types)
+    model_config = paddle_v2.layer.parse_network(cost)
+    m = api.GradientMachine.createFromConfigProto(model_config,
+                                                  api.CREATE_MODE_NORMAL,
+                                                  optimizer.enable_types())
 
     # This type check is not useful. Only enable type hint in IDE.
     # Such as PyCharm
@@ -96,19 +94,12 @@ def main():
     # Initialize Parameter by numpy.
     init_parameter(network=m)
 
-    # Create Local Updater. Local means not run in cluster.
-    # For a cluster training, here we can change to createRemoteUpdater
-    # in future.
-    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
-    assert isinstance(updater, api.ParameterUpdater)
-
     # Initialize ParameterUpdater.
     updater.init(m)
 
     # DataProvider Converter is a utility convert Python Object to Paddle C++
     # Input. The input format is as same as Paddle's DataProvider.
-    converter = DataProviderConverter(
-        input_types=[dp.dense_vector(784), dp.integer_value(10)])
+    converter = DataProviderConverter(input_types=[images.type, label.type])
 
     train_file = './data/raw_data/train'
     test_file = './data/raw_data/t10k'
diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6b95a88042a13a280bcb80f753b3887fcef37296
--- /dev/null
+++ b/demo/mnist/api_train_v2.py
@@ -0,0 +1,137 @@
+import paddle.v2 as paddle
+import gzip
+
+
+def softmax_regression(img):
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def multilayer_perceptron(img):
+    # The first fully-connected layer (128 units, ReLU)
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected layer (64 units, ReLU)
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    # The third fully-connected layer; note that its output size must be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def convolutional_neural_network(img):
+    # First conv + pool stage: 20 5x5 filters on 1 input channel, 2x2 pool
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # Second conv + pool stage: 50 5x5 filters on 20 input channels, 2x2 pool
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # The fully-connected layer (128 units, tanh)
+    fc1 = paddle.layer.fc(input=conv_pool_2,
+                          size=128,
+                          act=paddle.activation.Tanh())
+    # The softmax layer; note that its output size must be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=fc1,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncomment corresponding line.
+    predict = softmax_regression(images)
+    #predict = multilayer_perceptron(images)
+    #predict = convolutional_neural_network(images)
+
+    cost = paddle.layer.classification_cost(input=predict, label=label)
+
+    try:
+        with gzip.open('params.tar.gz', 'r') as f:
+            parameters = paddle.parameters.Parameters.from_tar(f)
+    except IOError:
+        parameters = paddle.parameters.create(cost)
+
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+    lists = []
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+
+                with gzip.open('params.tar.gz', 'w') as f:
+                    parameters.to_tar(f)
+
+        elif isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=128))
+            print "Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics)
+            lists.append((event.pass_id, result.cost,
+                          result.metrics['classification_error_evaluator']))
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=100)
+
+    # find the best pass
+    best = sorted(lists, key=lambda list: float(list[1]))[0]
+    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
+
+    test_creator = paddle.dataset.mnist.test()
+    test_data = []
+    for item in test_creator():
+        test_data.append((item[0], ))
+        if len(test_data) == 100:
+            break
+
+    # output is a softmax layer. It returns probabilities.
+    # Shape should be (100, 10)
+    probs = paddle.infer(
+        output_layer=predict, parameters=parameters, input=test_data)
+    print probs.shape
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh
index da90cd749a02976633d0f0d6e4352d8a85c7cdef..ca2b1ad9eb960685b95b0f294a9b929e1a4acab1 100755
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=100 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e ${PIPESTATUS[0]} -n "mnist_train" >/dev/null 2>&1  # $? would be tee's status
 
 python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py
index 4631816c43ef48839df1863a0a86c3ab00924d3f..6074cc1d3a85e13e3e8d336d81e22104f9d8e7cf 100755
--- a/demo/model_zoo/resnet/classify.py
+++ b/demo/model_zoo/resnet/classify.py
@@ -156,7 +156,7 @@ class ImageClassifier():
             # For oversampling, average predictions across crops.
             # If not, the shape of output[name]: (1, class_number),
             # the mean is also applicable.
-            res[name] = output[name].mean(0)
+            res[name] = output[name]['value'].mean(0)
 
         return res
 
diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh
index aac9b89b14b98ac8e2db7def19e5f06c01682493..a7b1f01064b29cf6abc4cd6b706ee466a6d6da36 100755
--- a/demo/quick_start/cluster/cluster_train.sh
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log"
 pushd "$home_dir"
 cfg=trainer_config.lr.py
 paddle train \
+  --start_pserver=false \
   --config=$cfg \
   --save_dir=${model_dir} \
   --trainer_count=4 \
diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh
index f02e5038e92790c7f1ddcd84a09c6d9a02f84ac4..e47c2dd01fb5c919203964e298018e6dc2bd366e 100755
--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -26,5 +26,7 @@ paddle train \
     --init_model_path=$model \
     --config_args=is_predict=1 \
     --predict_output_dir=. \
+2>&1 | tee 'predict.log'
+paddle usage -l 'predict.log' -e ${PIPESTATUS[0]} -n "quick_start_predict_${cfg}" >/dev/null 2>&1  # $? would be tee's status
 
 mv rank-00000 result.txt
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index e3595fce7519297058e1eeb66487692267ddcfcc..01697fed48054be8ad98a01d4cbb5029e6a1ead0 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -31,3 +31,4 @@ paddle train \
   --show_parameter_stats_period=100 \
   --test_all_data_in_one_period=1 \
   2>&1 | tee 'train.log'
+paddle usage -l "train.log" -e ${PIPESTATUS[0]} -n "quick_start_${cfg}" >/dev/null 2>&1  # $? would be tee's status
diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6a061799e3ac50236a68beedaf700dd6c698a05
--- /dev/null
+++ b/demo/recommendation/api_train_v2.py
@@ -0,0 +1,125 @@
+import paddle.v2 as paddle
+import cPickle
+import copy
+
+
+def main():
+    """Train a MovieLens recommender and print one sample prediction."""
+    paddle.init(use_gpu=False)
+    movie_title_dict = paddle.dataset.movielens.get_movie_title_dict()
+    uid = paddle.layer.data(
+        name='user_id',
+        type=paddle.data_type.integer_value(
+            paddle.dataset.movielens.max_user_id() + 1))
+    usr_emb = paddle.layer.embedding(input=uid, size=32)
+
+    usr_gender_id = paddle.layer.data(
+        name='gender_id', type=paddle.data_type.integer_value(2))
+    usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16)
+
+    usr_age_id = paddle.layer.data(
+        name='age_id',
+        type=paddle.data_type.integer_value(
+            len(paddle.dataset.movielens.age_table)))
+    usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16)
+
+    usr_job_id = paddle.layer.data(
+        name='job_id',
+        type=paddle.data_type.integer_value(
+            paddle.dataset.movielens.max_job_id() + 1))
+    usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16)
+
+    usr_combined_features = paddle.layer.fc(
+        input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb],
+        size=200,
+        act=paddle.activation.Tanh())
+
+    mov_id = paddle.layer.data(
+        name='movie_id',
+        type=paddle.data_type.integer_value(
+            paddle.dataset.movielens.max_movie_id() + 1))
+    mov_emb = paddle.layer.embedding(input=mov_id, size=32)
+
+    mov_categories = paddle.layer.data(
+        name='category_id',
+        type=paddle.data_type.sparse_binary_vector(
+            len(paddle.dataset.movielens.movie_categories())))
+
+    mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32)
+
+    mov_title_id = paddle.layer.data(
+        name='movie_title',
+        type=paddle.data_type.integer_value_sequence(len(movie_title_dict)))
+    mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32)
+    mov_title_conv = paddle.networks.sequence_conv_pool(
+        input=mov_title_emb, hidden_size=32, context_len=3)
+
+    mov_combined_features = paddle.layer.fc(
+        input=[mov_emb, mov_categories_hidden, mov_title_conv],
+        size=200,
+        act=paddle.activation.Tanh())
+
+    inference = paddle.layer.cos_sim(
+        a=usr_combined_features, b=mov_combined_features, size=1, scale=5)
+    cost = paddle.layer.mse_cost(
+        input=inference,
+        label=paddle.layer.data(
+            name='score', type=paddle.data_type.dense_vector(1)))
+
+    parameters = paddle.parameters.create(cost)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=paddle.optimizer.Adam(
+                                     learning_rate=1e-4))
+    feeding = {
+        'user_id': 0,
+        'gender_id': 1,
+        'age_id': 2,
+        'job_id': 3,
+        'movie_id': 4,
+        'category_id': 5,
+        'movie_title': 6,
+        'score': 7
+    }
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d Batch %d Cost %.2f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.movielens.train(), buf_size=8192),
+            batch_size=256),
+        event_handler=event_handler,
+        feeding=feeding,
+        num_passes=1)
+
+    user_id = 234
+    movie_id = 345
+
+    user = paddle.dataset.movielens.user_info()[user_id]
+    movie = paddle.dataset.movielens.movie_info()[movie_id]
+
+    feature = user.value() + movie.value()
+
+    def reader():
+        yield feature
+
+    infer_dict = copy.copy(feeding)
+    del infer_dict['score']
+
+    prediction = paddle.infer(
+        output=inference,
+        parameters=parameters,
+        reader=paddle.batch(reader, batch_size=32),
+        feeding=infer_dict)
+    # cos_sim is scaled by 5; (x + 5) / 2 presumably maps [-5, 5] onto [0, 5].
+    print((prediction + 5) / 2)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh
index e341d1cc7a3267bef9db916719b2e4b1981e31bc..22aef556082ba429e9ca7c6dd3ec72699b9dbcf4 100755
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -22,3 +22,4 @@ paddle train \
     --log_period=100 \
     --dot_period=1 \
     --num_passes=50  2>&1 | tee 'log.txt'
+paddle usage -l log.txt -e "${PIPESTATUS[0]}" -n "recommendation" >/dev/null 2>&1
diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py
index aabcd335253faf69c940024ac8098a54da030463..25f529d7d7c430f179107fb189ade34760ab309d 100755
--- a/demo/recommendation/trainer_config.py
+++ b/demo/recommendation/trainer_config.py
@@ -86,10 +86,7 @@ movie_feature = construct_feature("movie")
 user_feature = construct_feature("user")
 similarity = cos_sim(a=movie_feature, b=user_feature)
 if not is_predict:
-    outputs(
-        regression_cost(
-            input=similarity, label=data_layer(
-                'rating', size=1)))
+    outputs(mse_cost(input=similarity, label=data_layer('rating', size=1)))
 
     define_py_data_sources2(
         'data/train.list',
diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..036cad4b0a32357bb42580ef577a1eba558be8fe
--- /dev/null
+++ b/demo/semantic_role_labeling/api_train_v2.py
@@ -0,0 +1,190 @@
+import sys
+import math
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+
+
+def db_lstm():
+    """Build the stacked-LSTM + CRF SRL network; return (crf_cost, crf_dec)."""
+    word_dict, verb_dict, label_dict = conll05.get_dict()
+    word_dict_len = len(word_dict)
+    label_dict_len = len(label_dict)
+    pred_len = len(verb_dict)
+
+    mark_dict_len = 2
+    word_dim = 32
+    mark_dim = 5
+    hidden_dim = 512
+    depth = 8
+
+    # 8 input features, each a sequence of integer ids (see d_type)
+    def d_type(size):
+        return paddle.data_type.integer_value_sequence(size)
+
+    word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
+    predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
+
+    ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
+    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
+    ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
+    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
+    ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
+    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
+
+    target = paddle.layer.data(name='target', type=d_type(label_dict_len))
+
+    default_std = 1 / math.sqrt(hidden_dim) / 3.0
+    # Shared by the six word inputs; learning_rate=0. presumably freezes 'emb'.
+    emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.)
+    std_0 = paddle.attr.Param(initial_std=0.)
+    std_default = paddle.attr.Param(initial_std=default_std)
+
+    predicate_embedding = paddle.layer.embedding(
+        size=word_dim,
+        input=predicate,
+        param_attr=paddle.attr.Param(
+            name='vemb', initial_std=default_std))
+    mark_embedding = paddle.layer.embedding(
+        size=mark_dim, input=mark, param_attr=std_0)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        paddle.layer.embedding(
+            size=word_dim, input=x, param_attr=emb_para) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0 = paddle.layer.mixed(
+        size=hidden_dim,
+        bias_attr=std_default,
+        input=[
+            paddle.layer.full_matrix_projection(
+                input=emb, param_attr=std_default) for emb in emb_layers
+        ])
+
+    mix_hidden_lr = 1e-3
+    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
+    hidden_para_attr = paddle.attr.Param(
+        initial_std=default_std, learning_rate=mix_hidden_lr)
+
+    lstm_0 = paddle.layer.lstmemory(
+        input=hidden_0,
+        act=paddle.activation.Relu(),
+        gate_act=paddle.activation.Sigmoid(),
+        state_act=paddle.activation.Sigmoid(),
+        bias_attr=std_0,
+        param_attr=lstm_para_attr)
+
+    # stack L-LSTM and R-LSTM with direct edges (reverse when i is odd)
+    input_tmp = [hidden_0, lstm_0]
+    for i in range(1, depth):
+        mix_hidden = paddle.layer.mixed(
+            size=hidden_dim,
+            bias_attr=std_default,
+            input=[
+                paddle.layer.full_matrix_projection(
+                    input=input_tmp[0], param_attr=hidden_para_attr),
+                paddle.layer.full_matrix_projection(
+                    input=input_tmp[1], param_attr=lstm_para_attr)
+            ])
+
+        lstm = paddle.layer.lstmemory(
+            input=mix_hidden,
+            act=paddle.activation.Relu(),
+            gate_act=paddle.activation.Sigmoid(),
+            state_act=paddle.activation.Sigmoid(),
+            reverse=((i % 2) == 1),
+            bias_attr=std_0,
+            param_attr=lstm_para_attr)
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = paddle.layer.mixed(
+        size=label_dict_len,
+        bias_attr=std_default,
+        input=[
+            paddle.layer.full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            paddle.layer.full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ], )
+
+    crf_cost = paddle.layer.crf(size=label_dict_len,
+                                input=feature_out,
+                                label=target,
+                                param_attr=paddle.attr.Param(
+                                    name='crfw',
+                                    initial_std=default_std,
+                                    learning_rate=mix_hidden_lr))
+    # crf_decoding reuses the 'crfw' parameter of the CRF cost layer above.
+    crf_dec = paddle.layer.crf_decoding(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=paddle.attr.Param(name='crfw'))
+
+    return crf_cost, crf_dec
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as param_file:
+        param_file.seek(16)  # step over the 16-byte header
+        return np.fromfile(param_file, dtype=np.float32).reshape(h, w)
+
+
+def main():
+    """Train the db_lstm() network on CPU with 'emb' loaded from a file."""
+    paddle.init(use_gpu=False, trainer_count=1)
+    # define network topology
+    crf_cost, crf_dec = db_lstm()
+
+    # create parameters
+    parameters = paddle.parameters.create([crf_cost, crf_dec])
+
+    # create optimizer
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0,
+        learning_rate=2e-2,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(
+            average_window=0.5, max_average_window=10000), )
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+
+    trainer = paddle.trainer.SGD(cost=crf_cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+    # 44068 x 32 is presumably (word dictionary size, word_dim) - TODO confirm.
+    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
+    # NOTE(review): training reads conll05.test(); confirm this is intended.
+    trn_reader = paddle.batch(
+        paddle.reader.shuffle(
+            conll05.test(), buf_size=8192), batch_size=10)
+
+    feeding = {
+        'word_data': 0,
+        'ctx_n2_data': 1,
+        'ctx_n1_data': 2,
+        'ctx_0_data': 3,
+        'ctx_p1_data': 4,
+        'ctx_p2_data': 5,
+        'verb_data': 6,
+        'mark_data': 7,
+        'target': 8
+    }
+    trainer.train(
+        reader=trn_reader,
+        event_handler=event_handler,
+        num_passes=10000,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 11d9d6a19c1b17ad1b7540ee7a03017f85dd821e..095bbff2ea42627a13d8ebab436f5a05abc09743 100755
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -38,3 +38,4 @@ paddle train \
   --config_args=is_test=1 \
   --test_all_data_in_one_period=1 \
 2>&1 | tee 'test.log'
+paddle usage -l test.log -e "${PIPESTATUS[0]}" -n "semantic_role_labeling_test" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index 9354e72f46dc4dfc46138a04c330933d404c6cb8..eee14010d7b04a1b824f39090fa82fc532085e0d 100755
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -27,3 +27,4 @@ paddle train \
   --load_missing_parameter_strategy=rand \
   --test_all_data_in_one_period=1 \
   2>&1 | tee 'train.log'
+paddle usage -l train.log -e "${PIPESTATUS[0]}" -n "semantic_role_labeling_train" >/dev/null 2>&1
diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py
index 00f72cecacb454a0dd1184fa2098be4543007de7..4b7f5d0e504aef3884a04cbed8c16503a4079772 100755
--- a/demo/sentiment/dataprovider.py
+++ b/demo/sentiment/dataprovider.py
@@ -32,4 +32,6 @@ def process(settings, file_name):
             word_slot = [
                 settings.word_dict[w] for w in words if w in settings.word_dict
             ]
+            if not word_slot:
+                continue
             yield word_slot, label
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
index 8ec490f64691924013200a3d0038d39aa834b038..64c78e0d6b9297e7a321a4f070517593b0bfe332 100755
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -138,7 +138,11 @@ def main():
 
     batch = []
     for line in sys.stdin:
-        batch.append([predict.get_index(line)])
+        words = predict.get_index(line)
+        if words:
+            batch.append([words])
+        else:
+            print('All the words in [%s] are not in the dictionary.' % line)
         if len(batch) == batch_size:
             predict.batch_predict(batch)
             batch = []
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 8af827c3388c8df88a872bd87d121a4f9631c3ff..85c4f3ccfc3ede23fcf701769b9701ecbf57c789 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
              --trainer_count=4 \
              --config_args=is_test=1 \
              2>&1 | tee 'test.log'
+paddle usage -l test.log -e "${PIPESTATUS[0]}" -n "sentiment_test" >/dev/null 2>&1
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
index 5ce8bf4b997d962b9b61593cec0954d76c4874bc..14620f733bf03444e5ba3b3b792dfbed6146ecde 100755
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -27,3 +27,4 @@ paddle train --config=$config \
              --show_parameter_stats_period=100 \
              --test_all_data_in_one_period=1 \
              2>&1 | tee 'train.log'
+paddle usage -l train.log -e "${PIPESTATUS[0]}" -n "sentiment_train" >/dev/null 2>&1
diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c856556bd0cb32f60eba322469b3621c37e1349
--- /dev/null
+++ b/demo/sentiment/train_v2.py
@@ -0,0 +1,159 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle.v2 as paddle
+
+
+def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
+    """Build the text-CNN classifier and return its classification cost."""
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+    conv_3 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=3, hidden_size=hid_dim)
+    conv_4 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=4, hidden_size=hid_dim)
+    output = paddle.layer.fc(input=[conv_3, conv_4],
+                             size=class_dim,
+                             act=paddle.activation.Softmax())
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
+    return paddle.layer.classification_cost(input=output, label=lbl)
+
+
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    """
+    A wrapper for the sentiment classification task.
+    This network stacks LSTM layers that alternate direction,
+    three by default. The configuration follows the paper at
+    the URL below, but uses fewer layers.
+        http://www.aclweb.org/anthology/P15-1109
+
+    input_dim: dimension of the word dictionary.
+    class_dim: number of categories; also the range of the "label" input.
+    emb_dim: dimension of the word embedding.
+    hid_dim: dimension of the hidden layers.
+    stacked_num: number of stacked LSTM layers; must be odd.
+    """
+    assert stacked_num % 2 == 1
+
+    layer_attr = paddle.attr.Extra(drop_rate=0.5)
+    fc_para_attr = paddle.attr.Param(learning_rate=1e-3)
+    lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.)
+    para_attr = [fc_para_attr, lstm_para_attr]
+    bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.)
+    relu = paddle.activation.Relu()
+    linear = paddle.activation.Linear()
+
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+
+    fc1 = paddle.layer.fc(input=emb,
+                          size=hid_dim,
+                          act=linear,
+                          bias_attr=bias_attr)
+    lstm1 = paddle.layer.lstmemory(
+        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+
+    inputs = [fc1, lstm1]
+    for i in range(2, stacked_num + 1):
+        fc = paddle.layer.fc(input=inputs,
+                             size=hid_dim,
+                             act=linear,
+                             param_attr=para_attr,
+                             bias_attr=bias_attr)
+        lstm = paddle.layer.lstmemory(
+            input=fc,
+            reverse=(i % 2) == 0,
+            act=relu,
+            bias_attr=bias_attr,
+            layer_attr=layer_attr)
+        inputs = [fc, lstm]
+
+    fc_last = paddle.layer.pooling(
+        input=inputs[0], pooling_type=paddle.pooling.Max())
+    lstm_last = paddle.layer.pooling(
+        input=inputs[1], pooling_type=paddle.pooling.Max())
+    output = paddle.layer.fc(input=[fc_last, lstm_last],
+                             size=class_dim,
+                             act=paddle.activation.Softmax(),
+                             bias_attr=bias_attr,
+                             param_attr=para_attr)
+
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
+    cost = paddle.layer.classification_cost(input=output, label=lbl)
+    return cost
+
+
+if __name__ == '__main__':
+    # init
+    paddle.init(use_gpu=False)
+
+    # data: IMDB readers in batches of 100; only the training reader shuffles
+    print 'load dictionary...'
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=100)
+    test_reader = paddle.batch(
+        lambda: paddle.dataset.imdb.test(word_dict), batch_size=100)
+
+    feeding = {'word': 0, 'label': 1}
+
+    # network config
+    # Please choose the way to build the network
+    # by uncommenting the corresponding line.
+    cost = convolution_net(dict_dim, class_dim=class_dim)
+    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=2e-3,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
+
+    # Log the cost every 100 batches; run the test reader at the end of a pass.
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=test_reader, feeding=feeding)
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    # create trainer (event_handler above looks it up when an event fires)
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=adam_optimizer)
+
+    trainer.train(
+        reader=train_reader,
+        event_handler=event_handler,
+        feeding=feeding,
+        num_passes=2)
diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py
index 2defecd178262900c03c1eda60b351dc44629d1f..f1cadaa728ac58107e15f77b5994d31da088caf7 100644
--- a/demo/sentiment/trainer_config.py
+++ b/demo/sentiment/trainer_config.py
@@ -29,7 +29,7 @@ settings(
     batch_size=128,
     learning_rate=2e-3,
     learning_method=AdamOptimizer(),
-    average_window=0.5,
+    model_average=ModelAverage(0.5),
     regularization=L2Regularization(8e-4),
     gradient_clipping_threshold=25)
 
diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..3072c375123a2713c655b09fb28001960c9ab64d
--- /dev/null
+++ b/demo/seqToseq/api_train_v2.py
@@ -0,0 +1,214 @@
+import sys
+
+import paddle.v2 as paddle
+
+
+def seqToseq_net(source_dict_dim, target_dict_dim, is_generating=False):
+    """Build the attention GRU encoder-decoder; returns cost or beam_search."""
+    ### Network Architecture
+    word_vector_dim = 512  # dimension of word vector
+    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+    beam_size = 3
+    max_length = 250
+
+    #### Encoder
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    #### Decoder
+    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
+        encoded_proj += paddle.layer.full_matrix_projection(
+            input=encoded_vector)
+
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    with paddle.layer.mixed(
+            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
+        decoder_boot += paddle.layer.full_matrix_projection(
+            input=backward_first)
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+        # Presumably the previous state of the 'gru_decoder' step layer below.
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+        # gru_step takes a 3 * decoder_size input (presumably gates + candidate).
+        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+            decoder_inputs += paddle.layer.full_matrix_projection(
+                input=current_word)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        with paddle.layer.mixed(
+                size=target_dict_dim,
+                bias_attr=True,
+                act=paddle.activation.Softmax()) as out:
+            out += paddle.layer.full_matrix_projection(input=gru_step)
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+
+    if not is_generating:
+        trg_embedding = paddle.layer.embedding(
+            input=paddle.layer.data(
+                name='target_language_word',
+                type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+            size=word_vector_dim,
+            param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+        group_inputs.append(trg_embedding)
+
+        # For a decoder equipped with an attention mechanism, in training,
+        # the target embedding (the ground truth) is the data input,
+        # while the encoded source sequence is accessed as an unbounded memory.
+        # Here, the StaticInput defines a read-only memory
+        # for the recurrent_group.
+        decoder = paddle.layer.recurrent_group(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs)
+
+        lbl = paddle.layer.data(
+            name='target_language_next_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim))
+        cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+        return cost
+    else:
+        # In generation, the decoder predicts the next target word based on
+        # the encoded source sequence and the last generated target word.
+
+        # The encoded source sequence (encoder's output) must be specified by
+        # StaticInput, which is a read-only memory.
+        # The embedding of the last generated word is obtained automatically by
+        # GeneratedInputs, which is initialized by a start mark, such as <s>,
+        # and must be included in generation.
+
+        trg_embedding = paddle.layer.GeneratedInputV2(
+            size=target_dict_dim,
+            embedding_name='_target_language_embedding',
+            embedding_size=word_vector_dim)
+        group_inputs.append(trg_embedding)
+
+        beam_gen = paddle.layer.beam_search(
+            name=decoder_group_name,
+            step=gru_decoder_with_attention,
+            input=group_inputs,
+            bos_id=0,
+            eos_id=1,
+            beam_size=beam_size,
+            max_length=max_length)
+
+        return beam_gen
+
+
+def main():
+    """Train on WMT14, or generate with beam search if is_generating is set."""
+    paddle.init(use_gpu=False, trainer_count=1)
+    is_generating = False
+    # source and target dict dim.
+    dict_size = 30000
+    source_dict_dim = target_dict_dim = dict_size
+
+    # train the network
+    if not is_generating:
+        cost = seqToseq_net(source_dict_dim, target_dict_dim)
+        parameters = paddle.parameters.create(cost)
+
+        # define optimize method and trainer
+        optimizer = paddle.optimizer.Adam(
+            learning_rate=5e-5,
+            regularization=paddle.optimizer.L2Regularization(rate=8e-4))
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+        # define data reader
+        wmt14_reader = paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.wmt14.train(dict_size), buf_size=8192),
+            batch_size=5)
+
+        # define event_handler callback
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 10 == 0:
+                    print "\nPass %d, Batch %d, Cost %f, %s" % (
+                        event.pass_id, event.batch_id, event.cost,
+                        event.metrics)
+                else:
+                    sys.stdout.write('.')
+                    sys.stdout.flush()
+
+        # start to train
+        trainer.train(
+            reader=wmt14_reader, event_handler=event_handler, num_passes=2)
+
+    # translate an English sequence to French
+    else:
+        # use the first 3 samples for generation
+        gen_creator = paddle.dataset.wmt14.gen(dict_size)
+        gen_data = []
+        gen_num = 3
+        for item in gen_creator():
+            gen_data.append((item[0], ))
+            if len(gen_data) == gen_num:
+                break
+
+        beam_gen = seqToseq_net(source_dict_dim, target_dict_dim, is_generating)
+        # get the pretrained model, whose bleu = 26.92
+        parameters = paddle.dataset.wmt14.model()
+        # prob is the prediction probability, and id is the predicted word id.
+        beam_result = paddle.infer(
+            output_layer=beam_gen,
+            parameters=parameters,
+            input=gen_data,
+            field=['prob', 'id'])
+
+        # get the dictionary
+        src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
+
+        # the delimited element of generated sequences is -1,
+        # the first element of each generated sequence is the sequence length
+        seq_list = []
+        seq = []
+        for w in beam_result[1]:
+            if w != -1:
+                seq.append(w)
+            else:
+                seq_list.append(' '.join([trg_dict.get(w) for w in seq[1:]]))
+                seq = []
+
+        prob = beam_result[0]
+        beam_size = 3  # same value as beam_size in seqToseq_net()
+        for i in xrange(gen_num):
+            print "\n*******************************************************\n"
+            print "src:", ' '.join(
+                [src_dict.get(w) for w in gen_data[i][0]]), "\n"
+            for j in xrange(beam_size):
+                print "prob = %f:" % (prob[i][j]), seq_list[i * beam_size + j]
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh
index 33a42f6eff2b0414c466d5f78c89989a6a517eb9..9bb6dbdb1d4c5e35bfb31855e0331f0250a69a20 100755
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -27,3 +27,4 @@ paddle train \
     --log_period=10 \
     --dot_period=5 \
     2>&1 | tee 'paraphrase/train.log'
+paddle usage -l 'paraphrase/train.log' -e "${PIPESTATUS[0]}" -n "seqToseq_paraphrase_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py
index e523a34d5a95120d1f0a583be8bbdbff5678d1ab..3d1f86ec3b7eda4fceaf3a1e406e3d0a1a4a2f60 100644
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf,
                         encoder_size=512,
                         decoder_size=512,
                         beam_size=3,
-                        max_length=250):
+                        max_length=250,
+                        error_clipping=50):
     """
     A wrapper for an attention version of GRU Encoder-Decoder network
     is_generating: whether this config is used for generating
@@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf,
         input=src_word_id,
         size=word_vector_dim,
         param_attr=ParamAttr(name='_source_language_embedding'))
-    src_forward = simple_gru(input=src_embedding, size=encoder_size)
+    src_forward = simple_gru(
+        input=src_embedding,
+        size=encoder_size,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     src_backward = simple_gru(
-        input=src_embedding, size=encoder_size, reverse=True)
+        input=src_embedding,
+        size=encoder_size,
+        reverse=True,
+        naive=True,
+        gru_layer_attr=ExtraLayerAttribute(
+            error_clipping_threshold=error_clipping))
     encoded_vector = concat_layer(input=[src_forward, src_backward])
 
     with mixed_layer(size=decoder_size) as encoded_proj:
@@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf,
             decoder_inputs += full_matrix_projection(input=context)
             decoder_inputs += full_matrix_projection(input=current_word)
 
-        gru_step = gru_step_layer(
+        gru_step = gru_step_naive_layer(
             name='gru_decoder',
             input=decoder_inputs,
             output_mem=decoder_mem,
-            size=decoder_size)
+            size=decoder_size,
+            layer_attr=ExtraLayerAttribute(
+                error_clipping_threshold=error_clipping))
 
         with mixed_layer(
                 size=target_dict_dim, bias_attr=True,
diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh
index a700ae213473dfe7c5b77156de15775b8fe9a9f0..64b78f5e9654e7b206740f92e224e0164108c9f1 100755
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -24,3 +24,5 @@ paddle train \
     --test_pass=12 \
     --trainer_count=1 \
     2>&1 | tee 'translation/gen.log'
+# Bash-only: PIPESTATUS[0] is paddle train's exit status; $? would be tee's.
+paddle usage -l 'translation/gen.log' -e "${PIPESTATUS[0]}" -n "seqToseq_translation_gen" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh
index bdece693e5c407c89bc172c461bac7f9b20560d3..b0ec9854b118cbb9ed39d6bed0cdd845403926a4 100755
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -25,3 +25,5 @@ paddle train \
 --log_period=10 \
 --dot_period=5 \
 2>&1 | tee 'translation/train.log'
+# Bash-only: PIPESTATUS[0] is paddle train's exit status; $? would be tee's.
+paddle usage -l 'translation/train.log' -e "${PIPESTATUS[0]}" -n "seqToseq_translation_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
index 0624b17787aaf90732707e5f4fc6c2195b8f65ee..ea012ba1ae9c790ccefd3dd5f066aa92202128a2 100644
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
@@ -27,7 +27,7 @@ settings(
     learning_method=MomentumOptimizer(),
     batch_size=batch_size,
     regularization=L2Regularization(batch_size * 1e-4),
-    average_window=0.5,
+    model_average=ModelAverage(0.5),
     learning_rate=1e-1,
     learning_rate_decay_a=1e-5,
     learning_rate_decay_b=0.25, )
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py
index b9b41b2433461eb1bfb309659834661c2ae43253..937a34df103663ecf0f0827bbfb9d82823c9b902 100644
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -27,7 +27,7 @@ settings(
     learning_method=MomentumOptimizer(),
     batch_size=batch_size,
     regularization=L2Regularization(batch_size * 1e-5),
-    average_window=0.5,
+    model_average=ModelAverage(0.5),
     learning_rate=2e-3,
     learning_rate_decay_a=5e-7,
     learning_rate_decay_b=0.5, )
diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh
index 9a706b98d8686101ba21b513644bdd791062ec26..37e196c84200dc26ccb523076a81dbc393b1280f 100755
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
@@ -7,4 +7,7 @@ paddle train \
        --dot_period=10 \
        --log_period=1000 \
        --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train.log'
+# Bash-only: PIPESTATUS[0] is paddle train's exit status; $? would be tee's.
+paddle usage -l 'train.log' -e "${PIPESTATUS[0]}" -n "sequence_tagging_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh
index 597b5afea9c63a8e209b69b6a40e74556e27ac31..ad6e2d8ee7f813c69f9dd250c6f7bbb4403a0ed5 100755
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
@@ -7,3 +7,6 @@ paddle train \
        --log_period=10000 \
        --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train_linear.log'
+# Bash-only: PIPESTATUS[0] is paddle train's exit status; $? would be tee's.
+paddle usage -l 'train_linear.log' -e "${PIPESTATUS[0]}" -n "sequence_tagging_train_linear" >/dev/null 2>&1
diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
index cec35dce11d1c146a9e878ebab81abe904d6136c..2dbd5e8805dd97d35c7d58917f8ec6b5033bda03 100755
--- a/demo/traffic_prediction/predict.sh
+++ b/demo/traffic_prediction/predict.sh
@@ -25,6 +25,6 @@ paddle train \
     --config_args=is_predict=1 \
     --predict_output_dir=. 
 
-python gen_result.py > result.txt
+python gen_result.py > result.csv
 
 rm -rf rank-00000
diff --git a/demo/word2vec/api_train_v2.py b/demo/word2vec/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0940f0e56eafa22f8aeb7052c0ddc79d8862917
--- /dev/null
+++ b/demo/word2vec/api_train_v2.py
@@ -0,0 +1,117 @@
+import gzip
+import math
+
+import paddle.v2 as paddle
+
+embsize = 32
+hiddensize = 256
+N = 5
+
+
+def wordemb(inlayer):
+    """Embed one word-id data layer into an `embsize`-wide vector."""
+    # The parameter name is fixed; presumably that makes every call reuse
+    # one embedding table -- TODO confirm against the paddle.attr docs.
+    shared_proj = paddle.attr.Param(
+        name="_proj",
+        initial_std=0.001,
+        learning_rate=1,
+        l2_rate=0,
+        sparse_update=True)
+    return paddle.layer.embedding(
+        input=inlayer, size=embsize, param_attr=shared_proj)
+
+
+def main():
+    """Build the word2vec demo network and train it.
+
+    Four word-id inputs are embedded and concatenated, passed through one
+    sigmoid hidden layer, and a softmax layer is trained to classify a
+    fifth word id. Data comes from `paddle.dataset.imikolov`.
+    """
+    # for local training
+    cluster_train = False
+
+    if cluster_train:
+        paddle.init(
+            use_gpu=False,
+            trainer_count=2,
+            port=7164,
+            ports_num=1,
+            ports_num_for_sparse=1,
+            num_gradient_servers=1)
+    else:
+        paddle.init(use_gpu=False, trainer_count=1)
+
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+
+    def word_input(layer_name):
+        # One integer-valued data layer sized by the dictionary.
+        return paddle.layer.data(
+            name=layer_name, type=paddle.data_type.integer_value(dict_size))
+
+    # Creation order: the four context words, then the word to predict.
+    context_words = [
+        word_input(name)
+        for name in ("firstw", "secondw", "thirdw", "fourthw")
+    ]
+    nextword = word_input("fifthw")
+
+    context_vecs = [wordemb(word) for word in context_words]
+    contextemb = paddle.layer.concat(input=context_vecs)
+
+    sigmoid = paddle.activation.Sigmoid()
+    dropout = paddle.attr.Extra(drop_rate=0.5)
+    hidden_bias = paddle.attr.Param(learning_rate=2)
+    hidden_weights = paddle.attr.Param(
+        initial_std=1. / math.sqrt(embsize * 8), learning_rate=1)
+    hidden1 = paddle.layer.fc(
+        input=contextemb,
+        size=hiddensize,
+        act=sigmoid,
+        layer_attr=dropout,
+        bias_attr=hidden_bias,
+        param_attr=hidden_weights)
+
+    output_bias = paddle.attr.Param(learning_rate=2)
+    predictword = paddle.layer.fc(
+        input=hidden1,
+        size=dict_size,
+        bias_attr=output_bias,
+        act=paddle.activation.Softmax())
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+
+    l2 = paddle.optimizer.L2Regularization(8e-4)
+    adagrad = paddle.optimizer.AdaGrad(learning_rate=3e-3, regularization=l2)
+    trainer = paddle.trainer.SGD(
+        cost, parameters, adagrad, is_local=not cluster_train)
+
+    def event_handler(event):
+        # Only act at the end of every 100th batch.
+        if not isinstance(event, paddle.event.EndIteration):
+            return
+        if event.batch_id % 100 != 0:
+            return
+
+        # Snapshot the parameters, then evaluate on the test split.
+        snapshot = "batch-" + str(event.batch_id) + ".tar.gz"
+        with gzip.open(snapshot, 'w') as f:
+            trainer.save_parameter_to_tar(f)
+        test_batches = paddle.batch(
+            paddle.dataset.imikolov.test(word_dict, N), 32)
+        result = trainer.test(test_batches)
+        print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+            event.pass_id, event.batch_id, event.cost, event.metrics,
+            result.metrics)
+
+    train_batches = paddle.batch(
+        paddle.dataset.imikolov.train(word_dict, N), 32)
+    trainer.train(train_batches, num_passes=30, event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..9be0b370ee5e301aee4a6e31b1cfa905754968e8 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -1,37 +1,9 @@
-API中文手册
-============
-
-DataProvider API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/dataprovider_cn.rst
-    data_provider/pydataprovider2_cn.rst
-
-..  _api_trainer_config:
-
-Model Config API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    trainer_config_helpers/optimizers.rst
-    trainer_config_helpers/data_sources.rst
-    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
-    trainer_config_helpers/poolings.rst
-    trainer_config_helpers/networks.rst
-    trainer_config_helpers/evaluators.rst
-    trainer_config_helpers/attrs.rst
-
-
-Applications API
-----------------
+API
+===
 
 ..  toctree::
     :maxdepth: 1
 
-    predict/swig_py_paddle_cn.rst
+    模型配置 <v2/model_configs.rst>
+    数据访问 <v2/data.rst>
+    训练与应用 <v2/run_logic.rst>
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 10c297a71d6988c002de868e804ed9ee2345fbd7..25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -1,37 +1,9 @@
 API
 ===
 
-DataProvider API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/dataprovider_en.rst
-    data_provider/pydataprovider2_en.rst
-
-..  _api_trainer_config:
-
-Model Config API
-----------------
-
-..  toctree::
-    :maxdepth: 1
-
-    trainer_config_helpers/optimizers.rst
-    trainer_config_helpers/data_sources.rst
-    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
-    trainer_config_helpers/poolings.rst
-    trainer_config_helpers/networks.rst
-    trainer_config_helpers/evaluators.rst
-    trainer_config_helpers/attrs.rst
-
-
-Applications API
-----------------
-
 ..  toctree::
     :maxdepth: 1
 
-    predict/swig_py_paddle_en.rst
+    v2/model_configs.rst
+    v2/data.rst
+    v2/run_logic.rst
diff --git a/doc/api/trainer_config_helpers/activations.rst b/doc/api/trainer_config_helpers/activations.rst
deleted file mode 100644
index 269e6491e7ebe3899c3fb24fca756a393043473b..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/activations.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-===========
-Activations
-===========
-
-BaseActivation
-==============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: BaseActivation
-    :noindex:
-    
-AbsActivation
-===============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: AbsActivation
-    :noindex:
-    
-ExpActivation
-===============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: ExpActivation
-    :noindex:
-    
-IdentityActivation
-==================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: IdentityActivation
-    :noindex:
-    
-LinearActivation
-==================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: LinearActivation
-    :noindex:
-
-LogActivation
-==================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: LogActivation
-    :noindex:
-    
-SquareActivation
-================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: SquareActivation
-    :noindex:
-    
-SigmoidActivation
-=================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: SigmoidActivation
-    :noindex:
-    
-SoftmaxActivation
-=================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: SoftmaxActivation
-    :noindex:
-    
-SequenceSoftmaxActivation
-=========================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: SequenceSoftmaxActivation
-    :noindex:
-    
-ReluActivation
-==============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: ReluActivation
-    :noindex:
-    
-BReluActivation
-===============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: BReluActivation
-    :noindex:
-    
-SoftReluActivation
-==================
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: SoftReluActivation
-    :noindex:
-    
-TanhActivation
-==============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: TanhActivation
-    :noindex:
-    
-STanhActivation
-===============
-
-..  automodule:: paddle.trainer_config_helpers.activations
-    :members: STanhActivation
-    :noindex:
diff --git a/doc/api/trainer_config_helpers/attrs.rst b/doc/api/trainer_config_helpers/attrs.rst
deleted file mode 100644
index ac63127bf7d9db6351365ab7b58f43db12347a8e..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/attrs.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Parameter Attributes
-=======================
-
-..  automodule:: paddle.trainer_config_helpers.attrs
-    :members:
diff --git a/doc/api/trainer_config_helpers/data_sources.rst b/doc/api/trainer_config_helpers/data_sources.rst
deleted file mode 100644
index b9dd4dda01ae59d1260356aff50ddf298d02c87f..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/data_sources.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-..  _api_trainer_config_helpers_data_sources:
-
-DataSources
-===========
-
-..  automodule:: paddle.trainer_config_helpers.data_sources
-    :members:
diff --git a/doc/api/trainer_config_helpers/evaluators.rst b/doc/api/trainer_config_helpers/evaluators.rst
deleted file mode 100644
index 11dc735164284d6ed1d661fab1e7690d263b3a7c..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/evaluators.rst
+++ /dev/null
@@ -1,108 +0,0 @@
-..  _api_trainer_config_helpers_evaluators:
-
-==========
-Evaluators
-==========
-
-Base
-====
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: evaluator_base
-    :noindex:
-
-Classification 
-==============
-
-classification_error_evaluator
-------------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: classification_error_evaluator
-    :noindex:
-
-auc_evaluator
--------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: auc_evaluator
-    :noindex:
-
-ctc_error_evaluator
--------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: ctc_error_evaluator
-    :noindex:
-
-chunk_evaluator
----------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: chunk_evaluator
-    :noindex:
-
-precision_recall_evaluator
---------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  precision_recall_evaluator
-    :noindex:
-
-Rank
-====
-
-pnpair_evaluator
-----------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  pnpair_evaluator
-    :noindex:
-
-Utils
-=====
-
-sum_evaluator
--------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: sum_evaluator
-    :noindex:
-
-column_sum_evaluator
---------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members: column_sum_evaluator
-    :noindex:
-
-Print
-=====
-
-classification_error_printer_evaluator
---------------------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  classification_error_printer_evaluator
-    :noindex:
-
-gradient_printer_evaluator
---------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  gradient_printer_evaluator
-    :noindex:
-
-maxid_printer_evaluator
------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  maxid_printer_evaluator
-    :noindex:
-
-maxframe_printer_evaluator
----------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  maxframe_printer_evaluator
-    :noindex:
-
-seqtext_printer_evaluator
--------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  seqtext_printer_evaluator
-    :noindex:
-
-value_printer_evaluator
------------------------
-..  automodule:: paddle.trainer_config_helpers.evaluators
-    :members:  value_printer_evaluator
-    :noindex:
-
diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
deleted file mode 100644
index 4e429650e545179eca2f947e4af660222ad7cda8..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/layers.rst
+++ /dev/null
@@ -1,469 +0,0 @@
-..  _api_trainer_config_helpers_layers:
-
-======
-Layers
-======
-
-Base
-======
-
-LayerType
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: LayerType
-    :noindex:
-
-LayerOutput
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: LayerOutput
-    :noindex:
-
-Data layer
-===========
-
-..  _api_trainer_config_helpers_layers_data_layer:
-
-data_layer
-----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: data_layer
-    :noindex:
-
-Fully Connected Layers
-======================
-
-..  _api_trainer_config_helpers_layers_fc_layer:
-
-fc_layer
---------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: fc_layer
-    :noindex:
-
-selective_fc_layer
-------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: selective_fc_layer
-    :noindex:
-
-Conv Layers
-===========
-
-conv_operator
--------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: conv_operator
-    :noindex:
-
-conv_projection
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: conv_projection
-    :noindex:
-
-conv_shift_layer
-------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: conv_shift_layer
-    :noindex:
-
-img_conv_layer
---------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: img_conv_layer
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_context_projection:
-
-context_projection 
-------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: context_projection
-    :noindex:
-
-Image Pooling Layer
-===================
-
-img_pool_layer
---------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: img_pool_layer
-    :noindex:   
-
-spp_layer
---------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: spp_layer
-    :noindex:
-
-maxout_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: maxout_layer
-    :noindex:
-
-Norm Layer
-==========
-
-img_cmrnorm_layer
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: img_cmrnorm_layer
-    :noindex:
-
-batch_norm_layer
----------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: batch_norm_layer
-    :noindex:
-
-sum_to_one_norm_layer
----------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: sum_to_one_norm_layer
-    :noindex:
-    
-Recurrent Layers
-================
-
-recurrent_layer
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: recurrent_layer
-    :noindex:
-
-lstmemory
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: lstmemory
-    :noindex:
-
-lstm_step_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: lstm_step_layer
-    :noindex:
-
-grumemory
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: grumemory
-    :noindex:
-
-gru_step_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: gru_step_layer
-    :noindex:
-
-Recurrent Layer Group
-=====================
-
-memory
-------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: memory
-    :noindex:
-
-recurrent_group
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: recurrent_group
-    :noindex:
-    
-beam_search
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: beam_search
-    :noindex:
-    
-get_output_layer
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: get_output_layer
-    :noindex:
-    
-Mixed Layer
-===========
-
-..  _api_trainer_config_helpers_layers_mixed_layer:
-
-mixed_layer
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: mixed_layer
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_embedding_layer:
-
-embedding_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: embedding_layer
-    :noindex:
-
-scaling_projection
-------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: scaling_projection
-    :noindex:
-
-dotmul_projection
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: dotmul_projection
-    :noindex:
-
-dotmul_operator
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: dotmul_operator
-    :noindex:
-
-full_matrix_projection
-----------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: full_matrix_projection
-    :noindex:
-
-identity_projection
--------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: identity_projection
-    :noindex:
-
-
-table_projection
-----------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: table_projection
-    :noindex:
-
-trans_full_matrix_projection
-----------------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: trans_full_matrix_projection
-    :noindex:
-    
-Aggregate Layers
-================
-
-..  _api_trainer_config_helpers_layers_pooling_layer:
-
-pooling_layer
--------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: pooling_layer
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_last_seq:
-
-last_seq
---------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: last_seq
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_first_seq:
-
-first_seq
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: first_seq
-    :noindex:
-
-concat_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: concat_layer
-    :noindex:
-
-Reshaping Layers
-================
-
-block_expand_layer
-------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: block_expand_layer
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_expand_layer:
-
-expand_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: expand_layer
-    :noindex:
-
-repeat_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: repeat_layer
-    :noindex:
-
-Math Layers
-===========
-
-addto_layer
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: addto_layer
-    :noindex:
-
-linear_comb_layer
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: linear_comb_layer
-    :noindex:
-
-interpolation_layer
--------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: interpolation_layer
-    :noindex:
-
-bilinear_interp_layer
-----------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: bilinear_interp_layer
-    :noindex:
-
-power_layer
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: power_layer
-    :noindex:
-
-scaling_layer
--------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: scaling_layer
-    :noindex:
-
-slope_intercept_layer
-----------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: slope_intercept_layer
-    :noindex:
-
-tensor_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: tensor_layer
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_cos_sim:
-
-cos_sim
--------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: cos_sim
-    :noindex:
-
-trans_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: trans_layer
-    :noindex:
-
-Sampling Layers
-===============
-
-maxid_layer
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: maxid_layer
-    :noindex:
-
-sampling_id_layer
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: sampling_id_layer
-    :noindex:
-
-..  _api_trainer_config_helpers_layers_cost_layers:
-
-Cost Layers
-===========
-
-cross_entropy
--------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: cross_entropy
-    :noindex:
-
-cross_entropy_with_selfnorm
----------------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: cross_entropy_with_selfnorm
-    :noindex:
-
-multi_binary_label_cross_entropy
---------------------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: multi_binary_label_cross_entropy
-    :noindex:
-
-huber_cost
-----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: huber_cost
-    :noindex:
-
-lambda_cost
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: lambda_cost
-    :noindex:
-
-rank_cost
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: rank_cost
-    :noindex:
-
-crf_layer
------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: crf_layer
-    :noindex:
-
-crf_decoding_layer
--------------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: crf_decoding_layer
-    :noindex:
-
-ctc_layer
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: ctc_layer
-    :noindex:
-
-nce_layer
------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: nce_layer
-    :noindex:
-
-hsigmoid
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: hsigmoid
-    :noindex:
-
-sum_cost
----------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: sum_cost
-    :noindex:
-
-Check Layer 
-============
-
-eos_layer
-------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: eos_layer
-    :noindex:
diff --git a/doc/api/trainer_config_helpers/optimizers.rst b/doc/api/trainer_config_helpers/optimizers.rst
deleted file mode 100644
index d2f4958c92b8e3b7426945f1af07112ab4071136..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/optimizers.rst
+++ /dev/null
@@ -1,61 +0,0 @@
-..  _api_trainer_config_helpers_optimizers:
-
-==========
-Optimizers
-==========
-
-BaseSGDOptimizer
-================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: BaseSGDOptimizer
-    :noindex:
-
-MomentumOptimizer
-=================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: MomentumOptimizer
-    :noindex:
-
-AdamOptimizer
-=============
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: AdamOptimizer
-    :noindex:
-
-AdamaxOptimizer
-================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: AdamaxOptimizer
-    :noindex:
-
-AdaGradOptimizer
-================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: AdaGradOptimizer
-    :noindex:
-
-DecayedAdaGradOptimizer
-=======================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: DecayedAdaGradOptimizer
-    :noindex:
-
-AdaDeltaOptimizer
-=================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: AdaDeltaOptimizer
-    :noindex:
-
-RMSPropOptimizer
-================
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: RMSPropOptimizer
-    :noindex:
-
-..  _api_trainer_config_helpers_optimizers_settings:
-
-settings
-========
-..  automodule:: paddle.trainer_config_helpers.optimizers
-    :members: settings
-    :noindex:
diff --git a/doc/api/trainer_config_helpers/poolings.rst b/doc/api/trainer_config_helpers/poolings.rst
deleted file mode 100644
index 66566809d26f59263597b5286c5b27e0bbc9415a..0000000000000000000000000000000000000000
--- a/doc/api/trainer_config_helpers/poolings.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-========
-Poolings
-========
-
-BasePoolingType
-===============
-..  automodule:: paddle.trainer_config_helpers.poolings
-    :members: BasePoolingType
-    :noindex:
-
-AvgPooling
-==========
-..  automodule:: paddle.trainer_config_helpers.poolings
-    :members: AvgPooling
-    :noindex:
-
-MaxPooling
-==========
-..  automodule:: paddle.trainer_config_helpers.poolings
-    :members: MaxPooling
-    :noindex:
-
-SumPooling
-==========
-..  automodule:: paddle.trainer_config_helpers.poolings
-    :members: SumPooling
-    :noindex:
-
-SquareRootNPooling
-==================
-..  automodule:: paddle.trainer_config_helpers.poolings
-    :members: SquareRootNPooling
-    :noindex:
diff --git a/doc/api/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst
similarity index 100%
rename from doc/api/data_provider/dataprovider_cn.rst
rename to doc/api/v1/data_provider/dataprovider_cn.rst
diff --git a/doc/api/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst
similarity index 100%
rename from doc/api/data_provider/dataprovider_en.rst
rename to doc/api/v1/data_provider/dataprovider_en.rst
diff --git a/doc/api/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst
similarity index 100%
rename from doc/api/data_provider/pydataprovider2_cn.rst
rename to doc/api/v1/data_provider/pydataprovider2_cn.rst
diff --git a/doc/api/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst
similarity index 99%
rename from doc/api/data_provider/pydataprovider2_en.rst
rename to doc/api/v1/data_provider/pydataprovider2_en.rst
index 30357be32538db4423ad0eaf899138256c84edc7..e8fb6292779790765154502bff319ea10ab1e70b 100644
--- a/doc/api/data_provider/pydataprovider2_en.rst
+++ b/doc/api/v1/data_provider/pydataprovider2_en.rst
@@ -178,7 +178,7 @@ input_types
 +++++++++++
 
 PaddlePaddle has four data types, and three sequence types.
-The four data types are: 
+The four data types are:
 
 * :code:`dense_vector`: dense float vector.
 * :code:`sparse_binary_vector`: sparse binary vector, most of the value is 0, and
@@ -231,7 +231,7 @@ Its parameters lists as follows:
     * :code:`is_train` is a bool parameter that indicates the DataProvider is used in
       training or testing.
     * :code:`file_list` is the list of all files.
-      
+
   * User-defined parameters args can be set in training configuration.
 
 Note, PaddlePaddle reserves the right to add pre-defined parameter, so please
diff --git a/doc/api/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py
similarity index 100%
rename from doc/api/data_provider/src/mnist_config.py
rename to doc/api/v1/data_provider/src/mnist_config.py
diff --git a/doc/api/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py
similarity index 100%
rename from doc/api/data_provider/src/mnist_provider.dict.py
rename to doc/api/v1/data_provider/src/mnist_provider.dict.py
diff --git a/doc/api/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt
similarity index 100%
rename from doc/api/data_provider/src/mnist_train.txt
rename to doc/api/v1/data_provider/src/mnist_train.txt
diff --git a/doc/api/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py
similarity index 100%
rename from doc/api/data_provider/src/sentimental_config.py
rename to doc/api/v1/data_provider/src/sentimental_config.py
diff --git a/doc/api/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py
similarity index 100%
rename from doc/api/data_provider/src/sentimental_provider.py
rename to doc/api/v1/data_provider/src/sentimental_provider.py
diff --git a/doc/api/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt
similarity index 100%
rename from doc/api/data_provider/src/sentimental_train.txt
rename to doc/api/v1/data_provider/src/sentimental_train.txt
diff --git a/doc/api/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list
similarity index 100%
rename from doc/api/data_provider/src/train.list
rename to doc/api/v1/data_provider/src/train.list
diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3718cd73a2003b8ef6c406a9bd51dc68e76402dc
--- /dev/null
+++ b/doc/api/v1/index_cn.rst
@@ -0,0 +1,37 @@
+API中文手册
+============
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_cn.rst
+    data_provider/pydataprovider2_cn.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_cn.rst
diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10c297a71d6988c002de868e804ed9ee2345fbd7
--- /dev/null
+++ b/doc/api/v1/index_en.rst
@@ -0,0 +1,37 @@
+API
+===
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_en.rst
+    data_provider/pydataprovider2_en.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_en.rst
diff --git a/doc/api/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py
similarity index 100%
rename from doc/api/predict/src/predict_sample.py
rename to doc/api/v1/predict/src/predict_sample.py
diff --git a/doc/api/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst
similarity index 100%
rename from doc/api/predict/swig_py_paddle_cn.rst
rename to doc/api/v1/predict/swig_py_paddle_cn.rst
diff --git a/doc/api/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst
similarity index 100%
rename from doc/api/predict/swig_py_paddle_en.rst
rename to doc/api/v1/predict/swig_py_paddle_en.rst
diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst
new file mode 100644
index 0000000000000000000000000000000000000000..eca3ce03bcdc599edca802d8dfca48d4f28275a2
--- /dev/null
+++ b/doc/api/v2/config/activation.rst
@@ -0,0 +1,101 @@
+===========
+Activation
+===========
+
+Abs
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Abs
+    :noindex:
+    
+Exp
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Exp
+    :noindex:
+    
+Identity
+========
+
+..  automodule:: paddle.v2.activation
+    :members: Identity
+    :noindex:
+    
+Linear
+======
+
+..  automodule:: paddle.v2.activation
+    :members: Linear
+    :noindex:
+
+Log
+===
+
+..  automodule:: paddle.v2.activation
+    :members: Log
+    :noindex:
+    
+Square
+======
+
+..  automodule:: paddle.v2.activation
+    :members: Square
+    :noindex:
+    
+Sigmoid
+=======
+
+..  automodule:: paddle.v2.activation
+    :members: Sigmoid
+    :noindex:
+    
+Softmax
+=======
+
+..  automodule:: paddle.v2.activation
+    :members: Softmax
+    :noindex:
+    
+SequenceSoftmax
+===============
+
+..  automodule:: paddle.v2.activation
+    :members: SequenceSoftmax
+    :noindex:
+    
+Relu
+====
+
+..  automodule:: paddle.v2.activation
+    :members: Relu
+    :noindex:
+    
+BRelu
+=====
+
+..  automodule:: paddle.v2.activation
+    :members: BRelu
+    :noindex:
+    
+SoftRelu
+========
+
+..  automodule:: paddle.v2.activation
+    :members: SoftRelu
+    :noindex:
+    
+Tanh
+====
+
+..  automodule:: paddle.v2.activation
+    :members: Tanh
+    :noindex:
+    
+STanh
+=====
+
+..  automodule:: paddle.v2.activation
+    :members: STanh
+    :noindex:
diff --git a/doc/api/v2/config/attr.rst b/doc/api/v2/config/attr.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a93f41b86779200d8bac651614f4d61f4895875f
--- /dev/null
+++ b/doc/api/v2/config/attr.rst
@@ -0,0 +1,6 @@
+Parameter Attribute
+===================
+
+..  automodule:: paddle.v2.attr
+    :members:
+    :noindex:
diff --git a/doc/api/v2/config/evaluators.rst b/doc/api/v2/config/evaluators.rst
new file mode 100644
index 0000000000000000000000000000000000000000..39db51fa4abc370855ca3f2778b47464f33b6fce
--- /dev/null
+++ b/doc/api/v2/config/evaluators.rst
@@ -0,0 +1,101 @@
+..  _api_v2:
+
+==========
+Evaluators
+==========
+
+Classification 
+==============
+
+classification_error
+--------------------
+..  automodule:: paddle.v2.evaluator
+    :members: classification_error
+    :noindex:
+
+auc
+---
+..  automodule:: paddle.v2.evaluator
+    :members: auc
+    :noindex:
+
+ctc_error
+---------
+..  automodule:: paddle.v2.evaluator
+    :members: ctc_error
+    :noindex:
+
+chunk
+-----
+..  automodule:: paddle.v2.evaluator
+    :members: chunk
+    :noindex:
+
+precision_recall
+----------------
+..  automodule:: paddle.v2.evaluator
+    :members:  precision_recall
+    :noindex:
+
+Rank
+====
+
+pnpair
+------
+..  automodule:: paddle.v2.evaluator
+    :members:  pnpair
+    :noindex:
+
+Utils
+=====
+
+sum
+---
+..  automodule:: paddle.v2.evaluator
+    :members: sum
+    :noindex:
+
+column_sum
+----------
+..  automodule:: paddle.v2.evaluator
+    :members: column_sum
+    :noindex:
+
+Print
+=====
+
+classification_error_printer
+----------------------------
+..  automodule:: paddle.v2.evaluator
+    :members:  classification_error_printer
+    :noindex:
+
+gradient_printer
+----------------
+..  automodule:: paddle.v2.evaluator
+    :members:  gradient_printer
+    :noindex:
+
+maxid_printer
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  maxid_printer
+    :noindex:
+
+maxframe_printer
+----------------
+..  automodule:: paddle.v2.evaluator
+    :members:  maxframe_printer
+    :noindex:
+
+seqtext_printer
+---------------
+..  automodule:: paddle.v2.evaluator
+    :members:  seqtext_printer
+    :noindex:
+
+value_printer
+-------------
+..  automodule:: paddle.v2.evaluator
+    :members:  value_printer
+    :noindex:
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..154cfe24432f3e43ed724a45273b4a582b45f73d
--- /dev/null
+++ b/doc/api/v2/config/layer.rst
@@ -0,0 +1,433 @@
+..  _api_v2.layer:
+
+======
+Layers
+======
+
+Data layer
+===========
+
+..  _api_v2.layer_data:
+
+data
+----
+..  autoclass:: paddle.v2.layer.data
+    :noindex:
+
+Fully Connected Layers
+======================
+
+..  _api_v2.layer_fc:
+
+fc
+--
+..  autoclass:: paddle.v2.layer.fc
+    :noindex:
+
+selective_fc
+------------
+..  autoclass:: paddle.v2.layer.selective_fc
+    :noindex:
+
+Conv Layers
+===========
+
+conv_operator
+-------------
+..  autoclass:: paddle.v2.layer.conv_operator
+    :noindex:
+
+conv_projection
+---------------
+..  autoclass:: paddle.v2.layer.conv_projection
+    :noindex:
+
+conv_shift
+----------
+..  autoclass:: paddle.v2.layer.conv_shift
+    :noindex:
+
+img_conv
+--------
+..  autoclass:: paddle.v2.layer.img_conv
+    :noindex:
+
+..  _api_v2.layer_context_projection:
+
+context_projection 
+------------------
+..  autoclass:: paddle.v2.layer.context_projection
+    :noindex:
+
+Image Pooling Layer
+===================
+
+img_pool
+--------
+..  autoclass:: paddle.v2.layer.img_pool
+    :noindex:   
+
+spp
+---
+..  autoclass:: paddle.v2.layer.spp
+    :noindex:
+
+maxout
+------
+..  autoclass:: paddle.v2.layer.maxout
+    :noindex:
+
+Norm Layer
+==========
+
+img_cmrnorm
+-----------
+..  autoclass:: paddle.v2.layer.img_cmrnorm
+    :noindex:
+
+batch_norm
+----------
+..  autoclass:: paddle.v2.layer.batch_norm
+    :noindex:
+
+sum_to_one_norm
+---------------
+..  autoclass:: paddle.v2.layer.sum_to_one_norm
+    :noindex:
+    
+cross_channel_norm
+------------------
+..  autoclass:: paddle.v2.layer.cross_channel_norm
+    :noindex:
+    
+Recurrent Layers
+================
+
+recurrent
+---------
+..  autoclass:: paddle.v2.layer.recurrent
+    :noindex:
+
+lstmemory
+---------
+..  autoclass:: paddle.v2.layer.lstmemory
+    :noindex:
+
+grumemory
+---------
+..  autoclass:: paddle.v2.layer.grumemory
+    :noindex:
+
+Recurrent Layer Group
+=====================
+
+memory
+------
+..  autoclass:: paddle.v2.layer.memory
+    :noindex:
+
+recurrent_group
+---------------
+..  autoclass:: paddle.v2.layer.recurrent_group
+    :noindex:
+    
+lstm_step
+---------
+..  autoclass:: paddle.v2.layer.lstm_step
+    :noindex:
+
+gru_step
+--------
+..  autoclass:: paddle.v2.layer.gru_step
+    :noindex:
+
+beam_search
+------------
+..  autoclass:: paddle.v2.layer.beam_search
+    :noindex:
+    
+get_output
+----------
+..  autoclass:: paddle.v2.layer.get_output
+    :noindex:
+    
+Mixed Layer
+===========
+
+..  _api_v2.layer_mixed:
+
+mixed
+-----
+..  autoclass:: paddle.v2.layer.mixed
+    :noindex:
+
+..  _api_v2.layer_embedding:
+
+embedding
+---------
+..  autoclass:: paddle.v2.layer.embedding
+    :noindex:
+
+scaling_projection
+------------------
+..  autoclass:: paddle.v2.layer.scaling_projection
+    :noindex:
+
+dotmul_projection
+-----------------
+..  autoclass:: paddle.v2.layer.dotmul_projection
+    :noindex:
+
+dotmul_operator
+---------------
+..  autoclass:: paddle.v2.layer.dotmul_operator
+    :noindex:
+
+full_matrix_projection
+----------------------
+..  autoclass:: paddle.v2.layer.full_matrix_projection
+    :noindex:
+
+identity_projection
+-------------------
+..  autoclass:: paddle.v2.layer.identity_projection
+    :noindex:
+
+
+table_projection
+----------------
+..  autoclass:: paddle.v2.layer.table_projection
+    :noindex:
+
+trans_full_matrix_projection
+----------------------------
+..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
+    :noindex:
+    
+Aggregate Layers
+================
+
+..  _api_v2.layer_pooling:
+
+pooling
+-------
+..  autoclass:: paddle.v2.layer.pooling
+    :noindex:
+
+..  _api_v2.layer_last_seq:
+
+last_seq
+--------
+..  autoclass:: paddle.v2.layer.last_seq
+    :noindex:
+
+..  _api_v2.layer_first_seq:
+
+first_seq
+---------
+..  autoclass:: paddle.v2.layer.first_seq
+    :noindex:
+
+concat
+------
+..  autoclass:: paddle.v2.layer.concat
+    :noindex:
+
+seq_concat
+----------
+..  autoclass:: paddle.v2.layer.seq_concat
+    :noindex:
+
+Reshaping Layers
+================
+
+block_expand
+------------
+..  autoclass:: paddle.v2.layer.block_expand
+    :noindex:
+
+..  _api_v2.layer_expand:
+
+expand
+------
+..  autoclass:: paddle.v2.layer.expand
+    :noindex:
+
+repeat
+------
+..  autoclass:: paddle.v2.layer.repeat
+    :noindex:
+
+rotate
+------
+..  autoclass:: paddle.v2.layer.rotate
+    :noindex:
+
+seq_reshape
+-----------
+..  autoclass:: paddle.v2.layer.seq_reshape
+    :noindex:
+
+Math Layers
+===========
+
+addto
+-----
+..  autoclass:: paddle.v2.layer.addto
+    :noindex:
+
+linear_comb
+-----------
+..  autoclass:: paddle.v2.layer.linear_comb
+    :noindex:
+
+interpolation
+-------------
+..  autoclass:: paddle.v2.layer.interpolation
+    :noindex:
+
+bilinear_interp
+---------------
+..  autoclass:: paddle.v2.layer.bilinear_interp
+    :noindex:
+
+power
+-----
+..  autoclass:: paddle.v2.layer.power
+    :noindex:
+
+scaling
+-------
+..  autoclass:: paddle.v2.layer.scaling
+    :noindex:
+
+slope_intercept
+---------------
+..  autoclass:: paddle.v2.layer.slope_intercept
+    :noindex:
+
+tensor
+------
+..  autoclass:: paddle.v2.layer.tensor
+    :noindex:
+
+..  _api_v2.layer_cos_sim:
+
+cos_sim
+-------
+..  autoclass:: paddle.v2.layer.cos_sim
+    :noindex:
+
+trans
+-----
+..  autoclass:: paddle.v2.layer.trans
+    :noindex:
+
+Sampling Layers
+===============
+
+max_id
+------
+..  autoclass:: paddle.v2.layer.max_id
+    :noindex:
+
+sampling_id
+-----------
+..  autoclass:: paddle.v2.layer.sampling_id
+    :noindex:
+
+Slicing and Joining Layers
+==========================
+
+pad
+----
+..  autoclass:: paddle.v2.layer.pad
+    :noindex:
+
+..  _api_v2.layer_costs:
+
+Cost Layers
+===========
+
+cross_entropy_cost
+------------------
+..  autoclass:: paddle.v2.layer.cross_entropy_cost
+    :noindex:
+
+cross_entropy_with_selfnorm_cost
+--------------------------------
+..  autoclass:: paddle.v2.layer.cross_entropy_with_selfnorm_cost
+    :noindex:
+
+multi_binary_label_cross_entropy_cost
+-------------------------------------
+..  autoclass:: paddle.v2.layer.multi_binary_label_cross_entropy_cost
+    :noindex:
+
+huber_cost
+----------
+..  autoclass:: paddle.v2.layer.huber_cost
+    :noindex:
+
+lambda_cost
+-----------
+..  autoclass:: paddle.v2.layer.lambda_cost
+    :noindex:
+
+mse_cost
+--------
+..  autoclass:: paddle.v2.layer.mse_cost
+    :noindex:
+
+rank_cost
+---------
+..  autoclass:: paddle.v2.layer.rank_cost
+    :noindex:
+
+sum_cost
+---------
+..  autoclass:: paddle.v2.layer.sum_cost
+    :noindex:
+
+crf
+---
+..  autoclass:: paddle.v2.layer.crf
+    :noindex:
+
+crf_decoding
+------------
+..  autoclass:: paddle.v2.layer.crf_decoding
+    :noindex:
+
+ctc
+---
+..  autoclass:: paddle.v2.layer.ctc
+    :noindex:
+
+warp_ctc
+--------
+..  autoclass:: paddle.v2.layer.warp_ctc
+    :noindex:
+
+nce
+---
+..  autoclass:: paddle.v2.layer.nce
+    :noindex:
+
+hsigmoid
+---------
+..  autoclass:: paddle.v2.layer.hsigmoid
+    :noindex:
+
+smooth_l1_cost
+--------------
+..  autoclass:: paddle.v2.layer.smooth_l1_cost
+    :noindex:
+
+Check Layer 
+============
+
+eos
+---
+..  autoclass:: paddle.v2.layer.eos
+    :noindex:
diff --git a/doc/api/trainer_config_helpers/networks.rst b/doc/api/v2/config/networks.rst
similarity index 57%
rename from doc/api/trainer_config_helpers/networks.rst
rename to doc/api/v2/config/networks.rst
index edb53acbf0c31532aa34bda044066fed72eaa426..b2a617fff134035c04eeabbbaf6d9cbe2a525f1c 100644
--- a/doc/api/trainer_config_helpers/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -2,14 +2,14 @@
 Networks
 ========
 
-The networks module contains pieces of neural network that combine multiple layers.
+The v2.networks module contains pieces of neural network that combine multiple layers.
 
 NLP
 ===
 
 sequence_conv_pool
 ------------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: sequence_conv_pool
     :noindex:
 
@@ -17,7 +17,7 @@ sequence_conv_pool
 
 text_conv_pool
 --------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: text_conv_pool
     :noindex:
 
@@ -26,13 +26,13 @@ Images
 
 img_conv_bn_pool
 ----------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: img_conv_bn_pool
     :noindex:
 
 img_conv_group
 --------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: img_conv_group
     :noindex:
 
@@ -40,13 +40,19 @@ img_conv_group
 
 simple_img_conv_pool
 --------------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: simple_img_conv_pool
     :noindex:
 
+small_vgg
+---------
+..  automodule:: paddle.v2.networks
+    :members: small_vgg
+    :noindex:
+
 vgg_16_network
 ---------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: vgg_16_network
     :noindex:
 
@@ -58,25 +64,25 @@ LSTM
 
 lstmemory_unit
 ``````````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: lstmemory_unit
     :noindex:
 
 lstmemory_group
 ```````````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: lstmemory_group
     :noindex:
 
 simple_lstm
 ```````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: simple_lstm
     :noindex:
 
 bidirectional_lstm
 ``````````````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: bidirectional_lstm
     :noindex:
 
@@ -85,25 +91,37 @@ GRU
 
 gru_unit
 ````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: gru_unit
     :noindex:
 
 gru_group
 `````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: gru_group
     :noindex:
 
 simple_gru
 ``````````
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: simple_gru
     :noindex:
 
+simple_gru2
+```````````
+..  automodule:: paddle.v2.networks
+    :members: simple_gru2
+    :noindex:
+
+bidirectional_gru
+``````````````````
+..  automodule:: paddle.v2.networks
+    :members: bidirectional_gru
+    :noindex:
+
 simple_attention
 ----------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: simple_attention
     :noindex:
 
@@ -112,12 +130,6 @@ Miscs
 
 dropout_layer
 --------------
-..  automodule:: paddle.trainer_config_helpers.networks
+..  automodule:: paddle.v2.networks
     :members: dropout_layer
     :noindex:
-
-outputs
--------
-..  automodule:: paddle.trainer_config_helpers.networks
-    :members: outputs
-    :noindex:
diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst
new file mode 100644
index 0000000000000000000000000000000000000000..b32373fdef52a7aa9d64b12cda3f76cb2abf351b
--- /dev/null
+++ b/doc/api/v2/config/optimizer.rst
@@ -0,0 +1,45 @@
+==========
+Optimizer
+==========
+
+Momentum
+========
+..  automodule:: paddle.v2.optimizer
+    :members: Momentum
+    :noindex:
+
+Adam
+====
+..  automodule:: paddle.v2.optimizer
+    :members: Adam
+    :noindex:
+
+Adamax
+======
+..  automodule:: paddle.v2.optimizer
+    :members: Adamax
+    :noindex:
+
+AdaGrad
+=======
+..  automodule:: paddle.v2.optimizer
+    :members: AdaGrad
+    :noindex:
+
+DecayedAdaGrad
+==============
+..  automodule:: paddle.v2.optimizer
+    :members: DecayedAdaGrad
+    :noindex:
+
+AdaDelta
+========
+..  automodule:: paddle.v2.optimizer
+    :members: AdaDelta
+    :noindex:
+
+RMSProp
+=======
+..  automodule:: paddle.v2.optimizer
+    :members: RMSProp
+    :noindex:
diff --git a/doc/api/v2/config/pooling.rst b/doc/api/v2/config/pooling.rst
new file mode 100644
index 0000000000000000000000000000000000000000..d26b365c9284632210a1532853e39feedc70758b
--- /dev/null
+++ b/doc/api/v2/config/pooling.rst
@@ -0,0 +1,46 @@
+=======
+Pooling
+=======
+
+BasePool
+========
+..  automodule:: paddle.v2.pooling
+    :members: BasePool
+    :noindex:
+
+Avg
+===
+..  automodule:: paddle.v2.pooling
+    :members: Avg
+    :noindex:
+
+Max
+===
+..  automodule:: paddle.v2.pooling
+    :members: Max
+    :noindex:
+
+Sum
+===
+..  automodule:: paddle.v2.pooling
+    :members: Sum
+    :noindex:
+
+SquareRootN
+===========
+..  automodule:: paddle.v2.pooling
+    :members: SquareRootN
+    :noindex:
+
+CudnnAvg
+========
+..  automodule:: paddle.v2.pooling
+    :members: CudnnAvg
+    :noindex:
+
+CudnnMax
+========
+..  automodule:: paddle.v2.pooling
+    :members: CudnnMax
+    :noindex:
+
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fef87c4fbdb452771ecdb361c6eeae5b32bcee14
--- /dev/null
+++ b/doc/api/v2/data.rst
@@ -0,0 +1,113 @@
+==================================
+Data Reader Interface and DataSets
+==================================
+
+
+DataTypes
+=========
+
+..  automodule:: paddle.v2.data_type
+    :members:
+    :noindex:
+
+DataFeeder
+==========
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+    :noindex:
+
+Reader
+======
+
+..  automodule:: paddle.v2.reader
+    :members:
+    :noindex:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+    :noindex:
+
+minibatch
+=========
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+    :noindex:
+
+Dataset
+=======
+
+..  automodule:: paddle.v2.dataset
+    :members:
+    :noindex:
+
+mnist
++++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+    :noindex:
+
+cifar
++++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+    :noindex:
+
+conll05
++++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members: get_dict,get_embedding,test
+    :noindex:
+
+imdb
+++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+    :noindex:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+    :noindex:
+
+movielens
++++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+    :noindex:
+
+..  autoclass:: paddle.v2.dataset.movielens.MovieInfo
+    :noindex:
+    
+..  autoclass:: paddle.v2.dataset.movielens.UserInfo
+    :noindex:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+    :noindex:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+    :noindex:
+
+wmt14
++++++
+
+..  automodule:: paddle.v2.dataset.wmt14
+    :members:
+    :noindex:
+
diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..992b559cbd87244612521d4c96f84f997d6c4196
--- /dev/null
+++ b/doc/api/v2/model_configs.rst
@@ -0,0 +1,13 @@
+Model Configuration
+===================
+
+..  toctree::
+    :maxdepth: 1
+
+    config/activation.rst
+    config/layer.rst
+    config/evaluators.rst
+    config/optimizer.rst
+    config/pooling.rst
+    config/networks.rst
+    config/attr.rst
diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5c97651f6536d89d2b5926d4b2907a547aa86b55
--- /dev/null
+++ b/doc/api/v2/run_logic.rst
@@ -0,0 +1,31 @@
+======================
+Training and Inference
+======================
+
+Parameters
+==========
+
+..  automodule:: paddle.v2.parameters
+    :members: Parameters
+    :noindex:
+
+Trainer
+=======
+
+..  automodule:: paddle.v2.trainer
+    :members: SGD
+    :noindex:
+
+Event
+=====
+
+..  automodule:: paddle.v2.event
+    :members:
+    :noindex:
+
+Inference
+=========
+
+..  autofunction:: paddle.v2.infer
+    :noindex:
+    
\ No newline at end of file
diff --git a/doc/design/api.md b/doc/design/api.md
new file mode 100644
index 0000000000000000000000000000000000000000..8185d2af0ea264a2e7b4e28b9ed05279e4a22014
--- /dev/null
+++ b/doc/design/api.md
@@ -0,0 +1,262 @@
+# PaddlePaddle Design Doc
+
+## Ingredients
+
+Our design principle is to start from the essence: how could we
+allow users to express and solve their problems as neural networks?
+Some essential concepts that our API has to provide include:
+
+1. A *topology* is an expression of *layers*.
+
+1. A layer could be any kind of computation, including *cost*.
+
+1. Some layers have parameters, some don't. Most costs don't have
+   parameters.
+
+1. In some topologies, layers share parameters.  For
+   example,
+   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
+
+1. At programming time, users specify topologies and possible sharing
+   of parameters.  PaddlePaddle can figure out and create parameters
+   required (and possibly shared) by one or more topologies.
+
+
+## Starting from Examples
+
+As a summarization
+of
+[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
+
+
+### Example 1. Sharing Parameters between Layers
+
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example.  For your convenience, I copy-and-paste the model's
+topology as follows:
+
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
+
+The following program trains the topology including the cost, and then
+uses the sub-network in the trained topology in inference:
+
+```python
+def f(in):
+    e = paddle.layer.embedding(in, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets), they share parameters because all
+# corresponding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+               paddle.layer.cross_entropy(fA, fQ),
+               paddle.layer.cross_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fQ, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The semantic-vector of testA: ", paddle.infer(fA, parameters, testA)
+```
+
+
+### Example 2. Sharing Parameters between "Models"
+
+We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
+this example.  In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
+
+<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
+
+```python
+def G(in):
+    # over-simplified example as G has only one layer:
+    return paddle.layer.fc(in, parameter_name="G")
+
+def D(in):
+    # again, over-simplified:
+    return paddle.layer.fc(in, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D.  Note
+# that d1 shares parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
+
+
+### Summarization
+
+
+The above two programs reveal some important design concerns:
+
+1. Users describe a topology as an expression of layers.  Every layer
+   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
+   specifying the parameter name, users can specify the sharing of
+   parameters between layers and even between topologies.
+
+1. `paddle.parameters.create` figures out parameters required by one
+   or more topologies from parameter names of layers.  It creates these
+   parameters and returns a `ParameterSet` object, which is in essence
+   a map from *parameter names* to *parameters*.
+
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   require both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
+
+   1. This prevents users from forgetting to call
+      `paddle.parameters.create`.
+   1. `paddle.train` needs to know which parameter set to update.
+   1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `paddle.infer`.
+
+1. By specifying the `immutable_parameters` parameter of
+   `paddle.train`, we can forbid the update of these parameters.
+
+
+## Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files.  Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`.  The former is less flexible but also less error-prone.  We
+decide to provide the flexibility to users to define their readers.
+
+
+There are some open questions here:
+
+1. **Should a reader return a Python dictionary?**
+
+1. **How to map multiple outputs from a reader to multiple data layers?**
+
+1. **How to easily compose some existing readers to read more data and
+   feed a topology with more data layers?**
+
+
+## Training
+
+The recommended way to train a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`.  Equivalently, we can do
+
+```python
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
+```
+
+### Updater
+
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`.  This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
+
+### Event Handler
+
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handles some events:
+
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
+
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
+
+An example as follows:
+
+```python
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
+```
+
+If we are writing a PaddlePaddle program in and for IPython/Jupyter,
+we can use matplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
+
+If a user wants to do distributed training on a cluster, s/he should
+call `paddle.dist_train` and provide access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows him to
+access a Kubernetes cluster, s/he should be able to call
+
+```python
+paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
+                  reader=read,
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code of `paddle.dist_train` is as follows:
+
+```python
+def dist_train(topology, parameters, trainer, reader, ...):
+    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank < 15:
+            parameter_server()
+        else:
+            trainer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on the user's personal computer, and it should work as a
+*launcher*.  Otherwise, it knows that it's running on the cluster and
+needs to figure out its role as either the master, or a trainer, or a
+parameter server.
diff --git a/doc/design/cluster_train/README.md b/doc/design/cluster_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..74961f80050c6b2723889b51416a2e8048174b00
--- /dev/null
+++ b/doc/design/cluster_train/README.md
@@ -0,0 +1,183 @@
+# Design Doc: Distributed Training
+
+## Objective
+
+In [these slides](https://www.slideshare.net/cxwangyi/paddlepaddle-a-complete-solution-for-businesses), we explained that we'd like PaddlePaddle running on general-purpose clusters like those managed by Kubernetes, so as to address demands for AI from both Internet and non-Internet industries.
+
+This poses technical challenges to PaddlePaddle:
+
+1. Support fault-recovery.
+1. Support both offline and online training.
+1. [Serverless computing](https://en.wikipedia.org/wiki/Serverless_computing) of distributed training.
+
+
+## Training Job
+
+A training job will be created once a user asks Paddle cloud to train a model. The training job is made up of different processes that collaboratively consume data and produce a trained model. There are three kinds of processes:
+
+1. the *master server process*, which dispatches tasks to
+1. one or more *trainer processes*, which run distributed training and synchronize gradients/models via
+1. one or more *parameter server processes*, where each holds a shard of the global model, and receives the uploaded gradients from every *trainer process*, so they can run the optimize functions to update their parameters.
+
+Their relation is illustrated in the following graph:
+
+<img src="src/paddle-model-sharding.png"/>
+
+By coordinating these processes, PaddlePaddle supports both Synchronous Stochastic Gradient Descent (sync SGD) and Asynchronous Stochastic Gradient Descent (async SGD) to train user-defined neural network topologies.
+
+When training with sync SGD, parameter servers wait for all trainers to finish their gradient updates and then send the updated parameters to the trainers; training cannot proceed until the trainers have received the updated parameters. This creates a synchronization point between trainers. When training with async SGD, each trainer uploads gradients and downloads new parameters individually, without synchronizing with other trainers. Using async SGD will be faster in terms of time per pass, but will have more noise in the gradient since trainers are likely to have a stale model.
+
+### Master Server Process
+
+The master server process will:
+
+- Partition a dataset into [tasks](#task) and dispatch tasks to trainers.
+- Keep track of training progress on the dataset with [task queue](#task-queue). A training job will iterate on the dataset for a full pass until it goes into next pass.
+
+
+#### Task
+
+A task is a data shard to be trained. The total number of tasks will be much bigger than the total number of trainers. The number of data instances inside a task will be much bigger than the mini-batch size.
+
+#### Task Queue
+
+The master server has three task queues to track training progress. As illustrated in the graph below, Job A and Job B both have one master server. Each master server process has three task queues.
+
+<img src="src/paddle-task-queues.png"/>
+
+- The todo queue holds tasks to be dispatched. When a job starts, the master server fills in the todo queue with all tasks.
+- The pending queue holds tasks that are currently being trained by trainers.
+- The done queue holds tasks that are already trained.
+
+The life cycle of a single task is illustrated below:
+
+<img src="src/paddle-task-states.png"/>
+
+1. When a new pass of training starts, all tasks will be placed in the todo queue.
+1. The master server will dispatch a few tasks to each trainer at a time, put them in the pending queue and wait for completion.
+1. The trainer will work on its tasks and tell the master server once a task is completed. The master server will dispatch a new task to that trainer.
+1. If a task times out, the master server will move it back to the todo queue. The timeout count will increase by one. If the timeout count is above a threshold, the task is likely to cause a trainer to crash, so it will be discarded.
+1. The master server will move completed tasks to the done queue. When the todo queue is empty, the master server will start a new pass by moving all tasks in the done queue to the todo queue and resetting the timeout counter of all tasks to zero.
+
+### Trainer Process
+
+The trainer process will:
+
+- Receive tasks from the master.
+- Work on the tasks: calculate and upload gradient to parameter servers, and update local model by downloading new parameters from parameter servers.
+
+### Parameter Server Process
+
+Parameter server processes hold the parameters collaboratively. The parameters are partitioned on different parameter servers.
+
+The parameter server will:
+
+- Receive gradient from the trainers, update its parameters, and give the trainers the latest parameters.
+- Periodically save its parameters to distributed file system by overwriting the previous save.
+
+### Optimization Algorithms
+
+The communication pattern between the trainers and the parameter servers depends on the category of optimization algorithm:
+
+- Synchronous Stochastic Gradient Descent (sync-SGD)
+
+	Parameter server will wait for all trainers to finish the n-th mini-batch calculation and send their gradients before broadcasting new parameters to every trainer. Every trainer will wait for the new parameters before starting the (n+1)-th mini-batch.
+
+- Asynchronous Stochastic Gradient Descent (async-SGD)
+
+	There will be no synchronization between different trainers, and parameter server updates its parameter as soon as it receives new gradient:
+
+	- Each trainer uploads its accumulated gradient every n mini-batches.
+	- Every m mini-batches, the trainer downloads new parameters from parameter server.
+	- n and m do not have to be equal.
+
+## Fault Tolerant
+
+The training job will pause if the master server process is dead, or any of the parameter server processes is dead. They will be restarted by [Kubernetes](https://kubernetes.io/) and recover in a few minutes. Please refer to [fault recovery](#fault-recovery).
+
+The training job will continue to make progress if there is at least one training process running. The strategy depends on the type of optimization algorithm:
+
+- sync-SGD
+
+	TODO
+
+- async-SGD
+
+	Since async-SGD does not require synchronization between mini-batches, the system will by definition make progress if at least one trainer is running.
+
+## Fault Recovery
+
+PaddlePaddle uses [etcd](https://github.com/coreos/etcd) to keep track of the states of processes. Because etcd is a distributed reliable key-value store, the restarted process can recover its states from etcd. The model parameters are periodically saved into distributed file system, so a restarted parameter server can recover its parameters from the saved file.
+
+Now we will introduce how each process recovers from a failure, the graph below shows how etcd is used:
+
+<img src="src/paddle-etcd.png"/>
+
+### Master Server Process
+
+When the master is started by the Kubernetes, it executes the following steps at startup:
+
+1. Grabs a unique *master* lock in etcd, which prevents concurrent master instantiations.
+1. Recovers the task queues from etcd if they already exist, otherwise, the master will create them.
+1. Watches the trainer prefix keys `/trainer/` on etcd to find the live trainers.
+1. Starts dispatching the tasks to the trainers, and updates task queue using an etcd transaction to ensure lock is held during the update.
+
+When the master server process is dead for any reason, Kubernetes will restart it. It will be online again with all states recovered from etcd in a few minutes.
+
+### Trainer Process
+
+When the trainer is started by the Kubernetes, it executes the following steps at startup:
+
+1. Watches the available parameter server prefix keys `/ps/` on etcd and waits until the count of parameter servers reaches the desired count.
+1. Generates a unique ID, and sets key `/trainer/<unique ID>` with its contact address as value. The key will be deleted when the lease expires, so the master will be aware of the trainer being online and offline.
+1. Waits for tasks from the master to start training.
+
+If the trainer's etcd lease expires, it will try to set key `/trainer/<unique ID>` again so that the master server can discover the trainer again.
+
+When a trainer fails, Kubernetes would try to restart it. The recovered trainer would fetch tasks from the TODO queue and go on training.
+
+### Parameter Server Process
+
+When the parameter server is started by Kubernetes, it executes the following steps at startup:
+
+1. Read desired total number of parameter servers from etcd `/ps_desired`
+1. Search through etcd keys `/ps/<index>` (`/ps/0`, `/ps/1`, ...) to find the first non-existent key whose index is smaller than the total number of parameter servers. Set the key using a transaction to avoid concurrent writes. The parameter server's index is inferred from the key name.
+
+	The desired number of parameter servers is 3:
+
+	<img src="src/paddle-ps-0.png"/>
+
+	The third parameter server joined:
+
+	<img src="src/paddle-ps-1.png"/>
+
+1. The parameter server can load parameters if there are already saved parameters in the save path (inferred from its index).
+1. Now the parameter server is ready for the trainers' requests.
+
+If the parameter server's etcd lease expires, the parameter server will kill itself.
+
+
+## Parameter Server Checkpointing
+See [here](./checkpointing.md)
+
+## Storing and dispatching training data
+See [here](./data_dispatch.md)
+
+
+## Dynamic Scaling
+
+### Trainer Scaling
+
+TODO
+
+### Parameter Server Scaling
+
+Not planned for v1.
+
+## Training Dataset Format
+
+TODO
+
+## User Interface
+
+TODO
diff --git a/doc/design/cluster_train/checkpointing.md b/doc/design/cluster_train/checkpointing.md
new file mode 100644
index 0000000000000000000000000000000000000000..c87ef2c7d2636208866d05456d5d44316d0bb200
--- /dev/null
+++ b/doc/design/cluster_train/checkpointing.md
@@ -0,0 +1,44 @@
+## 模型参数检查点（Checkpointing）
+模型数据检查点的实现，可以有效的避免parameter server的单点或多点同时故障。模型参数检查点通过定期向磁盘上保存一份存储在parameter server内存中的模型数据的完整镜像，来保证训练过程可以从中间状态重新启动。在一个不可中断并缺少备份的训练任务中，可以通过阶段性的保存每个parameter server的数据快照（snapshot）到 ***分布式存储服务*** 达到容灾的目的，比如每隔10分钟保存最新的快照，并删除更早的快照。在出现单点故障时，只需要恢复这台节点，或者将这台节点迁移到另一个节点并启动即可恢复训练任务。
+
+<img src="src/checkpointing.png" width="500"/>
+
+### 快照保存的设计如下：
+
+说明：
+
+* parameter server在集群中启动后，自动挂载分布式存储目录，并把快照保存到这个目录下。
+* ***注：每个parameter server的检查点各自独立保存，暂时不考虑多个parameter server同步的保存一个特定时间点的全局检查点，因为这样做也没法保证消除随机性。***
+
+检查点保存程序流程：
+
+1. 如果满足条件"每隔10分钟"时，parameter server会获取parameters内存的`read_lock`，启动一个新的线程开始保存检查点。如果已经正在执行保存检查点的线程，则忽略。由于对parameters的更新需要获取parameters内存的`write_lock`，所以在写入快照的过程中，parameter server会暂停参数更新并等待。
+2. parameter server生成一个UUID，向指定的目录中一个新的文件（文件名为此UUID）写入快照数据。在快照写入完成后，计算这个文件的MD5 sum。然后在etcd的`/checkpoints/[pserver_id]`中写入json内容：`{"uuid": [UUID], "md5": "MD5 sum", "timestamp": xxxx}`。
+3. 删除磁盘目录中不是当前uuid的快照文件。
+4. 释放对parameters内存的锁定，停止保存检查点的线程。
+
+这里需要用户额外注意，在您的实际环境中，训练任务的运行可能会占满trainer和parameter server之间的网络带宽，如果parameter server此时还需要通过网络访问分布式存储以保存快照，可能会造成网络拥塞，而出现阶段性的运行停滞。
+
+### 从快照恢复
+
+在parameter server第一次启动或任意时间parameter server故障后被Kubernetes重新启动，则需要回滚到上一个检查点：
+
+  1. 从etcd中读取节点：`/checkpoints/[pserver_id]`获取最新的检查点的文件uuid
+  1. 从磁盘文件中加载uuid文件名的检查点快照文件，并加载其中的参数
+  1. 如果上面两步出现错误，则使用启动参数定义的初始化方法初始化参数
+  1. 开始提供服务
+
+## TODO List
+### 推测执行/加速执行（TODO）
+在异构集群中，如果存在某些trainer执行速度过慢会影响整体集群的速度（如图中Trainer 1），此时master将负责启动一个新的Trainer（Accelerate Trainer 2），使用同样的训练数据block。哪个trainer先完成block的训练，则把另一个慢速的kill掉。
+
+### 动态扩容/缩容
+目前只考虑动态扩容trainer数量，可以减小系统复杂性。
+
+## 术语
+* model: 指深度学习训练之后得到的所有参数，使用这个神经网络可以完成对新数据的预测
+* parameters: 神经网络中的参数，包括权重w和偏置b。一个神经网络的模型由大量的参数组成
+* shard: 分片，通常指将一个整体拆分成多份的其中的一份。
+* model shard: 将一个神经网络参数拆分成多份，每个shard分别存储在其中一台parameter server之上
+* parameter block: 多个parameter block构成一个model shard
+* 单点故障: 任意时刻只可能同时有一台服务器故障。由于集群中同时存在两台机器故障的概率极低（（平均故障率*平均故障修复时间）^2）只对特殊在线系统考虑两台以上同时故障的容灾。
diff --git a/doc/design/cluster_train/data_dispatch.md b/doc/design/cluster_train/data_dispatch.md
new file mode 100644
index 0000000000000000000000000000000000000000..1f5d22ff5e6abcb576d16cbe7391da1967a1ab8e
--- /dev/null
+++ b/doc/design/cluster_train/data_dispatch.md
@@ -0,0 +1,160 @@
+## 训练数据的存储和分发
+
+### 概念解释
+
+### 流程介绍
+生产环境中的训练数据集通常体积很大，并被存储在诸如Hadoop HDFS，Ceph，AWS S3之类的分布式存储之上。这些分布式存储服务通常会把数据切割成多个分片分布式的存储在多个节点之上。这样就可以在云端执行多种数据类计算任务，包括：
+
+* 数据预处理任务
+* Paddle训练任务
+* 在线模型预测服务
+<div style="align: center">
+<img src="src/paddle-cloud-in-data-center.png" width="800"/>
+</div>
+
+在上图中显示了在一个实际生产环境中的应用（人脸识别）的数据流图。生产环境的日志数据会通过实时流的方式（Kafka）和离线数据的方式（HDFS）存储，并在集群中运行多个分布式数据处理任务，比如流式数据处理（online data process），离线批处理（offline data process）完成数据的预处理，提供给paddle作为训练数据。用户也可以上传labeled data到分布式存储补充训练数据。在paddle之上运行的深度学习训练输出的模型会提供给在线人脸识别的应用使用。
+
+### 训练数据存储
+我们选择[CephFS](http://docs.ceph.com/docs/master/cephfs/)作为存储系统。
+
+- 无论是从[PFSClient](../file_manager/README.md)的角度，还是从[Pod](https://kubernetes.io/docs/concepts/workloads/pods/pod/)中运行任务的角度，统一用`/pfs/$DATACENTER/home/$USER`来访问用户自己的数据。  
+- `/pfs/$DATACENTER/common`下存放公共数据集合
+	- 做只读挂载 
+
+<div style="align: center">
+<img src="src/file_storage.png" width="700" align=center/>
+</div>
+
+### 文件预处理
+
+
+在开始训练之前, 数据集需要预先被转换成PaddlePaddle分布式训练使用的存储格式[RecordIO](https://github.com/PaddlePaddle/Paddle/issues/1947)。我们提供两个转换方式：
+
+1. 用户在本地转换好再上传
+1. 用户上传数据后，在机群上运行转换程序
+
+转换生成的文件名会是以下格式：
+
+```text
+name_prefix-aaaaa-of-bbbbb
+```
+
+"aaaaa"和"bbbbb"都是五位的数字，每一个文件是数据集的一个shard，"aaaaa"代表shard的index，"bbbbb"代表所有shard中最大的index。
+
+比如ImageNet这个数据集可能被分成1000个shard，它们的文件名是：
+```text
+imagenet-00000-of-00999
+imagenet-00001-of-00999
+...
+imagenet-00999-of-00999
+```
+
+#### 转换库
+
+无论是在本地或是云端转换，我们都提供Python的转换库，接口是：
+```python
+def convert(output_path, reader, num_shards, name_prefix)
+```
+
+- `output_path`: directory in which output files will be saved.
+- `reader`: a [data reader](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#data-reader-interface), from which the convert program will read data instances.
+- `num_shards`: the number of shards that the dataset will be partitioned into.
+- `name_prefix`: the name prefix of generated files.
+
+`reader`每次输出一个data instance，这个instance可以是单个值，或者用tuple表示的多个值：
+
+```python
+yield 1 # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28) # 单个值
+yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+```
+
+每个值的类型可以是整形、浮点型数据、字符串，或者由它们组成的list，以及numpy.ndarray。如果是其它类型，会被Pickle序列化成字符串。
+
+### 示例程序
+
+#### 使用转换库
+
+以下`reader_creator`生成的`reader`每次输出一个data instance，每个data instance包含两个值：numpy.ndarray类型的值和整型的值：
+```python
+def reader_creator():
+	def reader():
+		for i in range(1000):
+			yield numpy.random.uniform(-1, 1, size=28*28), 0 # 多个值
+	return reader
+```
+
+把`reader_creator`生成的`reader`传入`convert`函数即可完成转换：
+```python
+convert("./", reader_creator(), 100, "random_images")
+```
+
+以上命令会在当前目录下生成100个文件：
+```text
+random_images-00000-of-00099
+random_images-00001-of-00099
+...
+random_images-00099-of-00099
+```
+
+#### 进行训练
+
+
+PaddlePaddle提供专用的[data reader creator](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/reader/README.md#python-data-reader-design-doc)，生成给定`RecordIO`文件对应的data reader。**无论在本地还是在云端，reader的使用方式都是一致的**：
+
+```python
+# ...
+reader = paddle.reader.creator.RecordIO("/pfs/datacenter_name/home/user_name/random_images-*-of-*")
+batch_reader = paddle.batch(reader, 128)
+trainer.train(batch_reader, ...)
+```
+
+以上代码的reader输出的data instance与生成数据集时，reader输出的data instance是一模一样的。
+
+### 上传训练文件
+
+使用下面命令，可以把本地的数据上传到存储集群中。
+
+```bash  
+paddle pfs cp filename /pfs/$DATACENTER/home/$USER/folder/
+```
+
+比如，把之前示例中转换完毕的random_images数据集上传到云端的`/home/`可以用以下指令：
+
+```bash  
+paddle pfs cp random_images-*-of-* /pfs/$DATACENTER/home/$USER/folder/
+```
+
+需要把`$DATACENTER`的配置写到配置文件中，例如
+
+```
+# config file
+[datacenter_1]
+username=user
+usercert=user.pem
+userkey=user-key.pem
+endpoint=datacenter1.paddlepaddle.org
+
+[datacenter_2]
+username=user
+usercert=user.pem
+userkey=user-key.pem
+endpoint=datacenter2.paddlepaddle.org
+```
+## TODO
+### 文件访问的权限
+控制用户权限  
+
+- 用户可以把自己的数据分享给别人
+
+### 文件访问方式
+不用mount的方式来访问数据，而是直接用API的接口远程访问
+
+例如：  
+
+```
+f = open('/pfs/datacenter_name/home/user_name/test1.dat')
+```
+
+
+### 支持用户自定义的数据预处理job
diff --git a/doc/design/cluster_train/master_server.md b/doc/design/cluster_train/master_server.md
new file mode 100644
index 0000000000000000000000000000000000000000..bb8307652587b4dc56cd668a3a5e64722734d194
--- /dev/null
+++ b/doc/design/cluster_train/master_server.md
@@ -0,0 +1,91 @@
+# Design Doc: Master Server
+
+For an overview of master server's role, please refer to [distributed training design doc](./README.md). In this design doc we will discuss the master server in more details. The master will be implemented in [Go](https://golang.org/).
+
+## Dataset
+
+<img src="src/dataset.png"/>
+
+A dataset is a list of files in *RecordIO* format. A RecordIO file consists of chunks, and each chunk consists of some records.
+
+## Task Queue
+
+As mentioned in [distributed training design doc](./README.md), a *task* is a data shard that the master server assigns to the trainer process to train on. A task consists of one or multiple *blocks* from one or multiple files. The master server maintains *task queues* to track the training progress.
+
+### Task Queue Creation
+
+1. Each trainer will make an RPC call (using Go's [rpc](https://golang.org/pkg/net/rpc/) package) to the master server, telling it the RecordIO files representing the dataset specified by the user. Since every trainer will tell the master server the same dataset, only the first RPC call will be honored.
+
+	The RPC interface is:
+	```go
+	func (m *RPCServer) ReportDataset(Paths []string, dummy *int) error {
+	}
+	```
+1. The master server will scan through each RecordIO file to generate the *block index* and know how many blocks each file has. A block can be referenced by the file path and the index of the block within the file. The block index is an in-memory data structure that enables fast access to each block, and the index of the block within the file is an integer starting from 0, representing the n-th block within the file.
+
+	The definition of the block is:
+	```go
+	type Block struct {
+		Idx   int // index of the block within the file
+		Path  string
+		Index recordio.Index // block index
+	}
+	```
+1. Blocks are grouped into tasks, and tasks are filled into the todo queue. The pending queue and the done queue are initialized with no element.
+
+	The definition of the task is:
+	```go
+	type Task struct {
+		Index  int
+		Blocks []Block
+	}
+	```
+
+	The elements in the task queues are of type `TaskEntry`, containing a timeout counter (described in [task retry logic](#task-retry-logic)), and a task:
+	```go
+	type TaskEntry struct {
+		NumTimeout int
+		Task       Task
+	}
+	```
+
+	The definition of task queues is:
+	```go
+	type TaskQueues struct {
+		Todo    []TaskEntry
+		Pending map[int]TaskEntry // map from task index to task entry
+		Done    []TaskEntry
+	}
+	```
+
+### Task Queue Persistence
+
+The task queues need to be persisted on [etcd](https://github.com/coreos/etcd) for fault recovery. Since the task queues only change once a task is completed or timed out, which is not very frequent, we can afford to synchronize with etcd every time the task queues change.
+
+We will serialize the task queues data structure with [gob encoding](https://golang.org/pkg/encoding/gob/), compress with gzip, and save into etcd synchronously under key `/task_queues`.
+
+### Task Dispatch
+
+The trainer will make an RPC call to master to get a new task when:
+
+- the trainer first starts, or
+- the trainer finishes a task.
+
+The RPC interface is:
+```go
+func (m *RPCServer) GetTask(finished *Task, result *Task) error {
+}
+```
+Argument `finished` will be `nil` when the trainer is just started.
+
+During the RPC call the master will do the following:
+
+- Make a copy of the task queues, and update the copy reflecting the finished tasks and the new pending tasks.
+- Synchronize the copy of task queues with etcd using a transaction conditioned on holding the master lock.
+- Replace the task queues with the copy and report to the trainer with the new tasks if succeeded, or discard the copy and report the error to the trainer if failed.
+
+### Task Retry Logic
+
+When a task is dispatched to the trainer, the master will schedule a function for execution after the timeout duration (based on the moving average of task completion time). If the task entry is still in the pending queue, its timeout counter will increase by one, and the task will be moved to the todo queue. If the timeout counter is above the threshold, the master will log the error and discard the task.
+
+Please note that since a timed out task could be completed after it has been dispatched for retry, it is possible for a task to be processed multiple times. We do not try to prevent it from happening since it's fine to train on the same task multiple times due to the stochastic nature of the stochastic gradient descent algorithm.
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md
new file mode 100644
index 0000000000000000000000000000000000000000..392bab25e9de6bf5aa7cc1b0ad345ef12f1d9e5d
--- /dev/null
+++ b/doc/design/cluster_train/pserver_client.md
@@ -0,0 +1,157 @@
+# Design Doc: The Client Library of Parameter Server
+
+For an overview of trainer's role, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter server's client library, which will manage communication with parameter servers. The library will be implemented in [Go](https://golang.org/) and made available as a static or dynamic library with a C header file.
+
+## Parameter Partition
+
+Each parameter will be partitioned into parameter blocks to make the parameters evenly distributed on parameter servers. The partition is done automatically by the client library. The *sparse parameter* requires slightly different treatment:
+
+### Sparse Parameter
+
+The sparse parameter is a parameter that is updated sparsely. The name is somewhat misleading, it does not have a sparse representation, it has the same representation as a dense vector.
+
+Because a sparse parameter is updated sparsely, the trainer will have to partition the sparse parameter. Because the parameter server will merge all sparse parameter shards into the same file when saving the parameter, the shards need a special naming convention:
+
+If a sparse parameter is partitioned into n shards, they should be named as:
+
+```text
+name:sparse-0
+name:sparse-1
+...
+name:sparse-(n-1)
+```
+
+The library is unaware of the partition, and treats each parameter independently. Only when saving parameters, the parameter servers will merge the sparse parameters according to the naming convention.
+
+## Model Optimization Using Gradients
+
+There are two ways to perform model optimization using gradients:
+
+- On Client
+
+  The client does multiple steps of forward and backward update. In each step, the gradients are calculated and a new model is generated. After some steps, the client will calculate the difference between the newest model and the old model at step 0. The difference will be updated to parameter servers. Parameter servers will just update parameters using the difference without any optimization using gradients (such as Adam and L1 regularization).
+
+- On Parameter Server
+
+  The client will send accumulated gradients to parameter servers, the parameter server will do the optimization using gradients.
+
+## L1 and L2 Regularization
+
+PaddlePaddle allows L1 or L2 regularizations to be specified per parameter, so when the trainer initializes the parameter it needs to include a parameter configuration when L1 or L2 regularization is necessary.
+
+## Parameter Initialization
+
+The parameters on parameter servers need to be initialized. To provide maximum flexibility, the trainer will initialize the parameters. Only one trainer will do the initialization, the other trainers will wait for the completion of initialization and get the parameters from the parameter servers.
+
+### Trainer Selection
+
+To select the trainer for initialization, every trainer will try to get a distributed lock, whoever owns the lock will do the initialization. As illustrated below:
+
+<img src="./src/init_lock.png">
+
+### Trainer Selection Process
+
+The trainer selection process is encapsulated in the C API function:
+```c
+int paddle_begin_init_params(paddle_pserver_client* client, const char* config_proto);
+```
+The selected trainer's call to `paddle_begin_init_params` will return with 1, and the other trainers' call to `paddle_begin_init_params` will block until initialization is done, and return 0. As illustrated below:
+
+<img src="./src/pserver_init.png">
+
+## C Interface
+
+```c
+typedef enum {
+  PADDLE_ELEMENT_TYPE_INT32   = 0,
+  PADDLE_ELEMENT_TYPE_UINT32  = 1,
+  PADDLE_ELEMENT_TYPE_INT64   = 2,
+  PADDLE_ELEMENT_TYPE_UINT64  = 3,
+  PADDLE_ELEMENT_TYPE_FLOAT32 = 4,
+  PADDLE_ELEMENT_TYPE_FLOAT64 = 5,
+} paddle_element_type;
+
+typedef struct {
+  char*               name;
+  paddle_element_type element_type;
+  void*               content;
+  int                 content_len;
+} paddle_parameter, paddle_gradient;
+
+typedef struct paddle_pserver_client paddle_pserver_client;
+
+paddle_pserver_client* paddle_new_pserver_client();
+void paddle_pserver_client_release(paddle_pserver_client* client);
+
+/**
+ * @brief paddle_begin_init_params begins to initialize parameters on
+ * parameter servers.
+ *
+ * paddle_begin_init_params will be called from multiple trainers,
+ * only one trainer will be selected to initialize the parameters on
+ * parameter servers. Other trainers will be blocked until the
+ * initialization is done, and they need to get the initialized
+ * parameters from parameter servers using @paddle_get_params.
+ *
+ * @param pserver_config_proto serialized parameter server configuration in
+ * Protocol Buffers format.
+ * @return 1 if the trainer is selected to initialize parameter
+ * servers, otherwise 0.
+ */
+int paddle_begin_init_params(paddle_pserver_client* client, const char* pserver_config_proto);
+
+/**
+ * @brief paddle_init_param initializes the parameter on parameter
+ * servers.
+ *
+ * @param params the parameter to initialize.
+ * @param param_config_proto the configuration for the parameter.
+ * @return 0 if successful, otherwise -1. On failure, the trainer
+ * needs to restart the entire initialization process (starting from
+ * @paddle_begin_init_params). Or simply exit the program and wait for
+ * the cluster management system to restart the trainer.
+ */
+int paddle_init_param(paddle_pserver_client* client, paddle_parameter params, const char* param_config_proto);
+
+/**
+ * @brief paddle_finish_init_params tells parameter servers client has
+ * sent all parameters to parameter servers as initialization.
+ *
+ * @return 0 if successful, otherwise -1. On failure, the trainer
+ * needs to restart the entire initialization process (starting from
+ * @paddle_begin_init_params). Or simply exit the program and wait for
+ * the cluster management system to restart the trainer.
+ */
+int paddle_finish_init_params(paddle_pserver_client* client);
+
+/**
+ * @brief paddle_send_grads sends gradients to parameter servers for
+ * updating parameters.
+ *
+ * @param grads the array of gradients to send.
+ * @param len the length of the gradient array.
+ * @param learning_rate the learning rate for the gradients.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+
+/**
+ * @brief paddle_get_params gets parameters from parameter servers.
+ *
+ * @param names the array of names of the parameters to get.
+ * @param dst the destination array of parameters to save to.
+ * @param len the length of the names array and the paddle_parameter
+ * array.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+
+/**
+ * @brief paddle_save_model tells parameter servers to save the
+ * parameters to the given path
+ *
+ * @param path the path to save parameters.
+ * @return 0 if successful, otherwise -1.
+ */
+int paddle_save_model(paddle_pserver_client* client, const char* path);
+```
diff --git a/doc/design/cluster_train/src/checkpointing.png b/doc/design/cluster_train/src/checkpointing.png
new file mode 100644
index 0000000000000000000000000000000000000000..c221e8474f90f37e31416cbb19c9452207a0d14c
Binary files /dev/null and b/doc/design/cluster_train/src/checkpointing.png differ
diff --git a/doc/design/cluster_train/src/data_dispatch.png b/doc/design/cluster_train/src/data_dispatch.png
new file mode 100644
index 0000000000000000000000000000000000000000..5bdcc24d6a6d193cb014f8c38b362451fded5e54
Binary files /dev/null and b/doc/design/cluster_train/src/data_dispatch.png differ
diff --git a/doc/design/cluster_train/src/dataset.graffle b/doc/design/cluster_train/src/dataset.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..c10a423ed16a23229a9ee33d11bfc82bb59646c8
Binary files /dev/null and b/doc/design/cluster_train/src/dataset.graffle differ
diff --git a/doc/design/cluster_train/src/dataset.png b/doc/design/cluster_train/src/dataset.png
new file mode 100644
index 0000000000000000000000000000000000000000..2fb7f1cce3b6dd21489392557826e95a9f207c34
Binary files /dev/null and b/doc/design/cluster_train/src/dataset.png differ
diff --git a/doc/design/cluster_train/src/file_storage.graffle b/doc/design/cluster_train/src/file_storage.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..50a17e70fa255495337c529a3bf12a5c0024a5be
Binary files /dev/null and b/doc/design/cluster_train/src/file_storage.graffle differ
diff --git a/doc/design/cluster_train/src/file_storage.png b/doc/design/cluster_train/src/file_storage.png
new file mode 100644
index 0000000000000000000000000000000000000000..fccb4e3e7e738224c7f1584326bd5f351ce799aa
Binary files /dev/null and b/doc/design/cluster_train/src/file_storage.png differ
diff --git a/doc/design/cluster_train/src/init_lock.graffle b/doc/design/cluster_train/src/init_lock.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fa9149f21b1311eed48ef72ec55e556559d0fc94
Binary files /dev/null and b/doc/design/cluster_train/src/init_lock.graffle differ
diff --git a/doc/design/cluster_train/src/init_lock.png b/doc/design/cluster_train/src/init_lock.png
new file mode 100644
index 0000000000000000000000000000000000000000..92404ee6d6c0f9a7727952bae3c869ba338ecd7f
Binary files /dev/null and b/doc/design/cluster_train/src/init_lock.png differ
diff --git a/doc/design/cluster_train/src/paddle-cloud-in-data-center.png b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png
new file mode 100644
index 0000000000000000000000000000000000000000..da5d1a77562480ad1d886f5f21dbd84001d3d508
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-cloud-in-data-center.png differ
diff --git a/doc/design/cluster_train/src/paddle-etcd.graffle b/doc/design/cluster_train/src/paddle-etcd.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..56681ae5bbe11849116d621b066a6317e003e4ca
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-etcd.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-etcd.png b/doc/design/cluster_train/src/paddle-etcd.png
new file mode 100644
index 0000000000000000000000000000000000000000..4f9c9762b3a8c089dd5e9b2c07cb9dfc78296a21
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-etcd.png differ
diff --git a/doc/design/cluster_train/src/paddle-model-sharding.graffle b/doc/design/cluster_train/src/paddle-model-sharding.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..fba30f0ca2b47f0d202a432821d95e55aac37ec8
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-model-sharding.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-model-sharding.png b/doc/design/cluster_train/src/paddle-model-sharding.png
new file mode 100644
index 0000000000000000000000000000000000000000..8c3f6724ef46c6527e63a4cd8cb0b50fe0167124
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-model-sharding.png differ
diff --git a/doc/design/cluster_train/src/paddle-ps-0.png b/doc/design/cluster_train/src/paddle-ps-0.png
new file mode 100644
index 0000000000000000000000000000000000000000..47ef32806f182cab003da77f1556823b3f6d1721
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-ps-0.png differ
diff --git a/doc/design/cluster_train/src/paddle-ps-1.png b/doc/design/cluster_train/src/paddle-ps-1.png
new file mode 100644
index 0000000000000000000000000000000000000000..f3125db73096c52bac6e7c60e1675552857c0774
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-ps-1.png differ
diff --git a/doc/design/cluster_train/src/paddle-ps.graffle b/doc/design/cluster_train/src/paddle-ps.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..0e536ffdd91cd696008b4c01bad3cb53edebdc16
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-ps.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-task-queues.graffle b/doc/design/cluster_train/src/paddle-task-queues.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4263ed8bfd2ef0e55058828bf23f2fac3595e5fd
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-task-queues.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-task-queues.png b/doc/design/cluster_train/src/paddle-task-queues.png
new file mode 100644
index 0000000000000000000000000000000000000000..5f980266795776752cebd0c346b85c4a75a47780
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-task-queues.png differ
diff --git a/doc/design/cluster_train/src/paddle-task-states.graffle b/doc/design/cluster_train/src/paddle-task-states.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..cf1a0b9246d9386a949d2dbb8c32fe84f72eea83
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-task-states.graffle differ
diff --git a/doc/design/cluster_train/src/paddle-task-states.png b/doc/design/cluster_train/src/paddle-task-states.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae43cb66c071aee9eb90d875e2373b29af9c3e0
Binary files /dev/null and b/doc/design/cluster_train/src/paddle-task-states.png differ
diff --git a/doc/design/cluster_train/src/pserver_init.graffle b/doc/design/cluster_train/src/pserver_init.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..730d3a561ffdc19e723b3cf6612471440951826a
Binary files /dev/null and b/doc/design/cluster_train/src/pserver_init.graffle differ
diff --git a/doc/design/cluster_train/src/pserver_init.png b/doc/design/cluster_train/src/pserver_init.png
new file mode 100644
index 0000000000000000000000000000000000000000..4d502226d82ba271c50ae1bec5efaaaac4cc4434
Binary files /dev/null and b/doc/design/cluster_train/src/pserver_init.png differ
diff --git a/doc/design/cluster_train/src/submit-job.graffle b/doc/design/cluster_train/src/submit-job.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..677cdfb6d9a32168bf71729eb841fa1ca0dd31d6
Binary files /dev/null and b/doc/design/cluster_train/src/submit-job.graffle differ
diff --git a/doc/design/cluster_train/src/submit-job.png b/doc/design/cluster_train/src/submit-job.png
new file mode 100644
index 0000000000000000000000000000000000000000..3046a460a7ba708079e88a560debaa215a694680
Binary files /dev/null and b/doc/design/cluster_train/src/submit-job.png differ
diff --git a/doc/design/cluster_train/src/trainer.graffle b/doc/design/cluster_train/src/trainer.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..42384a3f059966e22e22f5fa4295cc9ead5cef83
Binary files /dev/null and b/doc/design/cluster_train/src/trainer.graffle differ
diff --git a/doc/design/cluster_train/src/trainer.png b/doc/design/cluster_train/src/trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6537d3d56589ca9f19a77a50a970e4b5275e6ce0
Binary files /dev/null and b/doc/design/cluster_train/src/trainer.png differ
diff --git a/doc/design/cluster_train/submit-job.md b/doc/design/cluster_train/submit-job.md
new file mode 100644
index 0000000000000000000000000000000000000000..8377d5489dc64bd2fdc5bb4f7bc737e7b489000d
--- /dev/null
+++ b/doc/design/cluster_train/submit-job.md
@@ -0,0 +1,127 @@
+# Submit a Distributed Training Job
+
+The user can submit a distributed training job with Python code, rather than with a command-line interface.
+
+## Runtime Environment On Kubernetes
+
+For a distributed training job, there are two Docker images, called the *runtime Docker image* and the *base Docker image*. The runtime Docker image is the Docker image that gets scheduled by Kubernetes to run during training. The base Docker image is for building the runtime Docker image.
+
+### Base Docker Image
+
+Usually, the base Docker image is the PaddlePaddle product Docker image, which includes the paddle binary files and the Python package. Of course, users can specify any image name hosted on any Docker registry to which they have access.
+
+### Runtime Docker Image
+
+The trainer package that the user uploads and some Python dependencies are packaged into a runtime Docker image based on the base Docker image.
+
+- Handle Python Dependencies
+
+  You need to provide a `requirements.txt` file in your `trainer-package` folder. Example:
+
+  ```txt
+  pillow
+  protobuf==3.1.0
+  ```
+  See more [details](https://pip.readthedocs.io/en/1.1/requirements.html) about requirements files. An example project looks like:
+  ```bash
+    paddle_example
+      |-quick_start
+        |-trainer.py
+        |-dataset.py
+        |-requirements.txt
+  ```
+
+## Submit Distributed Training Job With Python Code
+<img src="./src/submit-job.png" width="800">
+
+- `paddle.job.dist_train()` will call the Job Server API `/v1/package` to upload the trainer package and save it on CephFS, and then call `/v1/trainer/job` to submit the PaddlePaddle distributed job.
+- `/v1/trainer/job` will start a building job for preparing the runtime Docker image. When the building job is finished, Job Server will submit the PaddlePaddle distributed job to Kubernetes.
+- *NOTE*: For the first version, we will not prepare the runtime Docker image; instead, the package is uploaded to Paddle Cloud, and Paddle Cloud will mount the package in a temporary folder into the base Docker image. We will not support custom Python dependencies in the first version either.
+
+You can call `paddle.job.dist_train` and provide distributed training configuration as the parameters:
+```python
+paddle.job.dist_train(
+  trainer=dist_trainer(),
+  paddle_job=PaddleJob(
+    job_name = "paddle-cloud",
+    entry_point = "python %s"%__file__,
+    trainer_package = "/example/word2vec",
+    image = "yancey1989/paddle-job",
+    trainers = 10,
+    pservers = 3,
+    trainer_cpu = 1,
+    trainer_gpu = 1,
+    trainer_mem = "10G",
+    pserver_cpu = 1,
+    pserver_mem = "2G"
+  ))
+```
+
+The parameter `trainer` of `paddle.job.dist_train` is a function and you can implement it as follows:
+```python
+def dist_trainer():
+  def trainer_creator():
+    trainer = paddle.v2.trainer.SGD(...)
+    trainer.train(...)
+  return trainer_creator
+```
+
+The pseudo code of `paddle.job.dist_train` is as follows:
+```python
+def dist_train(trainer, paddle_job):
+  # if the code is running on cloud, set RUNNING_ON_CLOUD=YES
+  if os.getenv("RUNNING_ON_CLOUD", "NO") == "NO":
+    #submit the paddle job
+    paddle_job.submit()
+  else:
+    #start the training
+    trainer()
+```
+### PaddleJob Parameters
+parameter | type | explanation
+ --- | --- | ---
+job_name | str | the unique name for the training job
+entry_point | str | entry point for startup trainer process
+trainer_package | str | path of the trainer package file, which the user must have access to
+image|str|the [base image](#base-docker-image) for building the [runtime image](#runtime-docker-image)
+pservers|int| Parameter Server process count
+trainers|int| Trainer process count
+pserver_cpu|int| CPU count for each Parameter Server process
+pserver_mem|str| memory allocated for each Parameter Server process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_cpu|int| CPU count for each Trainer process
+trainer_mem|str| memory allocated for each Trainer process, a plain integer using one of these suffixes: E, P, T, G, M, K
+trainer_gpu|int| GPU count for each Trainer process, if you only want CPU, do not set this parameter
+
+### Deploy Parameter Server, Trainer and Master Process
+  - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet.
+  - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job.
+  - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet.
+
+## Job Server
+
+- RESTful API
+
+  Job server provides RESTful HTTP API for receiving the trainer package and displaying
+  PaddlePaddle job-related information.
+  - `POST   /v1/package` receive the trainer package and save them on CephFS
+  - `POST   /v1/trainer/job` submit a trainer job
+  - `GET    /v1/jobs/` list all jobs
+  - `GET    /v1/jobs/<job-name>` the status of a job
+  - `DELETE /v1/jobs/<job-name>` delete a job
+  - `GET    /v1/version` job server version
+
+- Build Runtime Docker Image on Kubernetes
+
+  `paddle.job.dist_train` will upload the trainer package to Job Server, save them on the distributed filesystem, and then start up a job for building the runtime Docker image that gets scheduled by Kubernetes to run during training.
+
+  There are some benefits of building the runtime Docker image on the Job Server:
+  - On Paddle Cloud, users run the trainer code in a Jupyter Notebook, which is a Kubernetes Pod. If we wanted to execute `docker build` in the Pod, we would have to mount the host's `docker.sock` into the Pod, so the user's code would connect to the host's Docker Engine directly, which is not safe.
+  - Users only need to upload the training package files; they do not need to install Docker Engine or a Docker registry as dependencies.
+  - If we want to switch to another image type, such as rkt, users do not need to care about it.
+
+- Deploy Parameter Server, Trainer and Master Processes
+
+  `POST /v1/trainer/job` receives the distributed training parameters, and deploys the job as follows:
+  - Deploy PaddlePaddle Parameter Server processes, it's a Kubernetes ReplicaSet.
+  - Deploy PaddlePaddle Trainer processes, it's a Kubernetes Job.
+  - Deploy PaddlePaddle Master processes, it's a Kubernetes ReplicaSet.
diff --git a/doc/design/images/replica.png b/doc/design/images/replica.png
new file mode 100644
index 0000000000000000000000000000000000000000..ef59e56b01d792a059279e6bb9a29f3db6a59a41
Binary files /dev/null and b/doc/design/images/replica.png differ
diff --git a/doc/design/images/two_phase_commit.png b/doc/design/images/two_phase_commit.png
new file mode 100644
index 0000000000000000000000000000000000000000..ef6f7317bd440cc7d9fe08fcbbf2b7a542f99049
Binary files /dev/null and b/doc/design/images/two_phase_commit.png differ
diff --git a/doc/design/multi_language_interface/00.why_plain_c.md b/doc/design/multi_language_interface/00.why_plain_c.md
new file mode 100644
index 0000000000000000000000000000000000000000..a1443093342c5a3ed698fb6b52a751dfc7cb5319
--- /dev/null
+++ b/doc/design/multi_language_interface/00.why_plain_c.md
@@ -0,0 +1,118 @@
+# Paddle多语言接口实现
+## 背景
+
+Paddle需要一个多语言接口，这个接口需要做到:
+
+* 有标准的，良好的文档
+    * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档，golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
+* 不同语言的接口适应不同语言的特性
+    * 例如Java与Python的错误处理是直接扔出来Exception，而对于golang错误处理应该使用返回值。
+
+## 基本要求
+
+Paddle的多语言接口实现包括以下几个方面:
+
+* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器，也不使用其他动态库。
+* 这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号。
+* 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)。
+* 不使用SWIG这种代码生成器，而是手写多语言绑定。
+
+
+## 原因
+
+### 使用动态库来分发Paddle
+
+* Paddle的链接方式比较复杂
+    * 如果用户要把Paddle的静态库（libpaddle.a）链接到自己的程序里，得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数，来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
+* 编译型语言，例如C/C++使用静态库和动态库难度差不多。但是解释性语言，例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni)，只能调用Paddle的动态库，否则得把Paddle静态库链接到解释器里。
+    * 解释性语言实际运行的二进制是解释器本身，如果调用静态库只能将静态库与解释器链接。例如对于Java来说，便是将静态库加入JVM中。这对于通常的Java的开发者来说，是不常见的做法。
+
+### 动态库中不嵌入任何其他语言的解释器
+
+* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
+* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析，数据读取均交由其他语言完成
+
+现阶段Paddle有一个问题是，Paddle内嵌的Python解释器和外部使用的Python如果版本不同，会直接报错退出。
+
+### Paddle动态库中，不引用其他动态库
+
+* 即这个动态库是不依赖于其他任何文件的，可以在任何机器上执行的。
+
+###  这个动态库使用C99标准的头文件导出一些函数，不使用/导出C++符号
+
+* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范，不同版本的编译器之间，对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库)，需要有稳定的导出符号。
+* C语言有导出符号的标准，并且在常见的平台上，都有标准的ABI调用约定。
+* 大多数语言都支持使用C语言API
+* 使用C99而不使用C89，是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
+* 使用C99而不使用C11的原因是，[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性，且C99相对于C11使用更加广泛。
+
+### 不导出Paddle内部的结构体、类，仅仅使用`void*`指针作为类型的句柄(handler)
+
+* Paddle内部的类为C++书写，直接导出到C的接口比较困难。
+* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
+
+在C的头文件 `paddle_matrix.h` 中:
+
+```C
+typedef void* paddle_matrix;
+typedef int paddle_error;
+
+extern "C"
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t* width,
+                                     uint64_t* height);
+```
+而在CPP里面实现这个C的接口，文件 `paddle_matrix.cpp`
+
+```cpp
+#include "paddle/math/matrix.h"
+extern "C"
+paddle_error paddle_matrix_get_shape(paddle_matrix matrix,
+                                     uint64_t *width,
+                                     uint64_t *height) {
+  auto m = (paddle::capi::CMatrix*)(matrix);
+  *width = m->width();
+  *height = m->height();
+}
+```
+
+其中`paddle/capi/CMatrix.hpp`文件内容为:
+
+```cpp
+namespace paddle {
+namespace capi {
+
+class CMatrix {
+  std::shared_ptr<paddle::Matrix> mat;
+};
+
+}  // namespace capi
+}  // namespace paddle
+```
+
+### 不使用SWIG这种代码生成器，而是手写多语言绑定
+
+* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。他的目标是使用C/C++写代码，SWIG直接读取C/C++的头文件，生成各种语言的绑定代码。
+    * 对于多语言接口，SWIG需要写一个interface文件。这个文件具有独特的语法，学习成本高。且增加一个第三方语言，就需要对这个第三方语言增加一些定义。有的时候，interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
+    * SWIG暴露的接口保留了C++的接口样式，很难保证多语言代码风格的一致性。(函数命名，错误处理)
+        * 因为SWIG在第三方语言中暴露的函数名，类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG我们需要将在interface文件里，将大量的`SomeCppClass`重命名成`some_python_class`，或者`SomeGoTypes`。
+        * 对于不同语言，错误处理的方式也不尽相同。例如对于Java或者Python，最常见的错误处理方式是Exception，而对于Golang，错误处理方式是返回值。而SWIG只能简单的暴露C++接口，无法做到对于各种语言错误处理方式的适配。
+    * 对于大多数语言，直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
+    * SWIG支持的语言或者解释器有局限。例如对于Python，使用SWIG只支持CPython解释器，而不支持PyPy解释器。
+
+
+## 原因列表
+
+| 结论 | 对比 | 原因 |
+|---| --- | --- |
+| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库，Paddle静态库链接复杂 |
+| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器，会导致不同版本Python在一个进程里的bug |
+| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
+| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI，C99是目前C最广泛的使用标准，且C99支持bool类型和定长整数(uint64_t等)类型 |
+| 使用void*作为类句柄 | 不显式地写每个类具体包含什么| 实现简单，并且让接口脱离实现细节 |
+| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置，社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
+
+
+## 实现
+
+参考[Inference implementation](01.inference_implementation.md)
diff --git a/doc/design/multi_language_interface/01.inference_implementation.md b/doc/design/multi_language_interface/01.inference_implementation.md
new file mode 100644
index 0000000000000000000000000000000000000000..9820284523246a062581f322616d196f575c9d29
--- /dev/null
+++ b/doc/design/multi_language_interface/01.inference_implementation.md
@@ -0,0 +1,131 @@
+# C-API 模型推断实现文档
+
+本文档描述Paddle C-API的实现细节。Paddle C-API是多语言API的基础部分。Paddle需要暴露的API很多。先实现模型推断的API，通过模型推断API的实现作为一个样例，来进行讨论。至于为什么需要C-API，请参考[Why Plain C](./00.why_plain_c.md)。
+
+## Table of Contents
+   * [C-API 模型推断实现文档](#c-api-模型推断实现文档)
+      * [暴露接口原则](#暴露接口原则)
+      * [目录结构](#目录结构)
+      * [实现方式](#实现方式)
+         * [capi.h](#capih)
+         * [具体某种类型的头文件](#具体某种类型的头文件)
+         * [capi_private.h](#capi_privateh)
+         * [具体某种类型的实现文件](#具体某种类型的实现文件)
+         * [libpaddle_capi_shared.{so, dylib}](#libpaddle_capi_sharedso-dylib)
+         * [libpaddle_capi_whole.a](#libpaddle_capi_wholea)
+         * [examples](#examples)
+      * [编译选项](#编译选项)
+
+
+## 暴露接口原则
+
+1. 所有的接口均为C接口。即使用`extern "C"`
+2. 除构造某种类型的函数(`paddle_matrix_create`等)，其他函数均返回`paddle_error`。且调用时不能抛出异常或出现运行时错误。
+3. 所有类型名为`paddle_类型名`，所有与类型相关的函数，函数名为`paddle_类型名_函数名`
+4. 如果某一个Paddle Core概念(GradientMachine/Matrix)需要被暴露到其他语言，那么
+	* 为了暴露的接口尽量简单。只暴露概念的接口，而不暴露概念的实现。即暴露`GradientMachine`或者`Matrix`但不暴露`RecurrentGradientMachine`和`CpuSparseMatrix`。
+	* 暴露这个概念必要函数。`必要`是指，即完成某一个任务的最少函数。
+5. 不在`capi`接口层做过多封装。
+	* 如果某一个Paddle概念必须要暴露，但是又过于琐碎。不在`capi`这一层进行封装，而是直接修改Paddle Core。让Paddle核心中，这一概念不再琐碎。
+
+
+## 目录结构
+
+```text
+Paddle
+  `-- paddle
+        `-- capi
+              `-- examples  # The example project for C-API.
+              `-- tests  # unittests for C-API
+              `-- capi.h  # C-API header file.
+              `-- capi_private.h  # The shared header file between implementation sources.
+              `-- matrix.{h, cpp}
+              `-- gradient_machine.{h, cpp}
+              `-- ...
+```
+
+
+Paddle的C-API目录结构如上图表所示。这个目录中除了`capi_private.h`之外的所有头文件，均会被安装到include/paddle路径下。C-API生成的二进制文件会被安装到`lib`目录下。即，安装后的目录结构为
+
+```text
+`-- include
+      `-- paddle
+             `-- capi.h
+             `-- matrix.h
+             `-- gradient_machine.h
+             `-- ...
+`-- lib
+     `-- libpaddle_capi_shared.{so, dylib}  # In mac, dynamic library's file name extension is `dylib`
+     `-- libpaddle_capi_whole.a  # static library for all symbols of Paddle.
+```
+
+## 实现方式
+
+下面分别介绍某一类文件的实现方式。
+
+### capi.h
+
+`capi.h`是用户使用C-API时所唯一需要引入的头文件。在`capi.h`中，引入了类型的头文件，`matrix.h`, `gradient_machine.h`。在引入其他类型的头文件时，使用相对路径的引用方式。即`#include "matrix.h"`
+
+### 具体某种类型的头文件
+
+具体某种类型的头文件，即例如`matrix.h`，`gradient_machine.h`等。在这些头文件中，包含了某种类型的类型定义和暴露的全部函数。
+
+这个头文件不假设其他文件的引用顺序，即使用户直接引用某种类型的头文件，也不应该报错(虽然不鼓励这样)。如果某一个类型需要引用另一个类型，例如`gradient_machine`需要引用`matrix`，则直接引入另一种类型的头文件，即`#include "matrix.h"`。
+
+### capi_private.h
+
+`capi_private.h`是各个实现中共享的头文件，它主要包含了实际暴露的类型结构。在用户使用C-API时，Paddle的类型全部退化成`void *`，即`typedef void* paddle_matrix`。但，对于每种C-API暴露的类型，均是在`capi_private.h`中实现的结构体。
+
+```cpp
+struct CMatrix {
+   int type = MatrixType;
+   std::shared_ptr<paddle::Matrix> mat;
+};
+```
+
+通常，这个结构体包含两个项目。
+
+* `type`是一个类型的标志。对于每种类型，type字段均不尽相同。这样，即使C-API接受的类型全是`void *`，我们也可以确定每一个参数的类型。
+
+  ```cpp
+  void some_c_api_function(void* some_instance) {
+     int* type = (int *) some_instance;
+     switch (*type) {
+       case MatrixType:
+         CMatrix* mat = (CMatrix *) some_instance;
+         ...
+       ...
+     }
+  }
+  ```
+* 这个结构体中的另一个项目是，Paddle Core中这一类型接口的智能指针(shared_ptr)。
+	* 使用智能指针的原因是: 用户可以安全的释放某个C-API的实例，而不必在意Paddle Core是否还在使用这个实例。
+	* 例如，用户通过C-API获得了神经网络的参数实例。当用户使用完这个参数后，直接删除这个参数即可。即便Paddle Core中的模型还在使用这个参数，这个参数也不会一并删除。
+
+### 具体某种类型的实现文件
+
+具体某种类型的实现文件，即`matrix.cpp`, `gradient_machine.cpp`等文件。在这些文件中，使用C++ 11实现了C-API的接口，并且使用`extern "C"`导出这些接口。在实现过程中，对输入参数的安全性进行了必要的判断，并将C-API接口的参数转发给`Paddle Core`。
+
+### libpaddle\_capi_shared.{so, dylib}
+
+`libpaddle_capi_shared`是C-API导出的动态库。这个动态库的连接参数与Paddle的其他二进制(例如`paddle_trainer`)类似。用户可以直接使用这个动态库来引入Paddle C-API。具体使用方法为`-lpaddle_capi_shared`。
+
+### libpaddle\_capi_whole.a
+
+`libpaddle_capi_whole`是C-API导出的静态库。这个静态库包含了Paddle的全部符号。他是将`libpaddle_gserver.a`, `libpaddle_math.a`, `libpaddle_capi.a`等全部静态库中的目标文件全部打包后产生的文件。具体使用方法为`--whole-archive -lpaddle_capi_whole --no-whole-archive`。
+
+
+### examples
+
+在样例中，使用`C99`开发了模型预测的样例代码。具体请参考[example/README.md](../../../paddle/capi/examples/README.md)。
+
+## 编译选项
+
+C-API的编译选项默认关闭，打开这个编译选项，需要在cmake的时候，设置
+
+```bash
+cmake ${YOUR_SOURCE_ROOT} -DWITH_C_API=ON -DWITH_PYTHON=OFF -DWITH_SWIG_PY=OFF
+```
+
+编译C-API的时候推荐Paddle不嵌入Python解释器，也不生成`SWIG`接口，具体原因参考[Why Plain C](./00.why_plain_c.md)。
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f21f7af520df5171798326818ecb97c3bcd14a12
--- /dev/null
+++ b/doc/design/reader/README.md
@@ -0,0 +1,202 @@
+# Python Data Reader Design Doc
+
+At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+
+and provide a function which converts a reader to a batch reader, as well as frequently used reader creators and reader decorators.
+
+## Data Reader Interface
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates an iterable (anything that can be used in `for x in iterable`):
+
+```
+iterable = data_reader()
+```
+
+Each element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Each item should be of a [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int).
+
+An example implementation for single item data reader creator:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+```
+
+An example implementation for multiple item data reader creator:
+```python
+def reader_creator_random_image_and_label(width, height, label):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+```
+
+## Batch Reader Interface
+
+*batch reader* can be any function with no parameter that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+
+Here are valid outputs:
+```python
+# a mini batch of three data items. Each data item consists of three columns of data, each of which is 1.
+[(1, 1, 1),
+(2, 2, 2),
+(3, 3, 3)]
+
+# a mini batch of three data items, each data item is a list (single column).
+[([1,1,1],),
+([2,2,2],),
+([3,3,3],)]
+```
+
+Please note that each item inside the list must be a tuple, below is an invalid output:
+```python
+ # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
+ # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
+[[1,1,1],
+[2,2,2],
+[3,3,3]]
+```
+
+It's easy to convert from reader to batch reader:
+```python
+mnist_train = paddle.dataset.mnist.train()
+mnist_train_batch_reader = paddle.batch(mnist_train, 128)
+```
+
+Also easy to create custom batch reader:
+```python
+def custom_batch_reader():
+    while True:
+        batch = []
+        for i in xrange(128):
+            batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended.
+        yield batch
+
+mnist_random_image_batch_reader = custom_batch_reader
+```
+
+## Usage
+
+The batch reader, the mapping from the item(s) read to data layers, the batch size and the total number of passes will be passed into `paddle.train`:
+
+```python
+# two data layers are created:
+image_layer = paddle.layer.data("image", ...)
+label_layer = paddle.layer.data("label", ...)
+
+# ...
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
+```
+
+## Data Reader Decorator
+
+*Data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+
+Since we have a strict interface for data readers (no parameter, return a single data item), data readers can be used flexibly via data reader decorators. Following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take time and training cannot proceed without data, it is generally a good idea to prefetch data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+def reader_creator_bool(t):
+    def reader():
+        while True:
+            yield t
+    return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
+# And we don't care about the second item at this time.
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffles them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
+```
+
+## Q & A
+
+### Why does a reader return only a single entry, but not a mini batch?
+
+Always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returned not a single entry but 3 entries, training code would be more complex because it would need to handle cases like batch size 2).
+
+We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+
+### Why do we need a batch reader? Isn't having train take a reader and batch_size as arguments sufficient?
+
+In most cases, having train take a reader and batch_size as arguments would be sufficient. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically.
+
+### Why use a dictionary but not a list to provide mapping?
+
+We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because the user can easily reuse an item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip an item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create custom data reader creator
+
+```python
+def image_reader_creator(image_path, label_path, n):
+    def reader():
+        f = open(image_path)
+        l = open(label_path)
+        images = numpy.fromfile(
+            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+        images = images / 255.0 * 2.0 - 1.0
+        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+        for i in xrange(n):
+            yield images[i, :], labels[i] # a single entry of data is created each time
+        f.close()
+        l.close()
+    return reader
+
+# image_reader_creator creates a reader
+reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
+```
+
+### How is `paddle.train` implemented
+
+An example implementation of paddle.train could be:
+
+```python
+def train(batch_reader, mapping, batch_size, total_pass):
+    for pass_idx in range(total_pass):
+        for mini_batch in batch_reader(): # this loop will never end in online learning.
+            do_forward_backward(mini_batch, mapping)
+```
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
new file mode 100644
index 0000000000000000000000000000000000000000..3692a5248a355cfcfd1cfd0911d43d65166921b1
--- /dev/null
+++ b/doc/design/releasing_process.md
@@ -0,0 +1,58 @@
+# Paddle发行规范
+
+Paddle使用git-flow branching model做分支管理，使用[Semantic Versioning](http://semver.org/)标准表示Paddle版本号。
+
+Paddle每次发新的版本，遵循以下流程:
+
+1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
+2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+3. 对这个版本的提交，做如下几个操作:
+	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
+	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
+	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
+		* 如果失败，记录下所有失败的例子，在这个`release/版本号`分支中，修复所有bug后，Patch号加一，返回第二步
+4. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+6. 协同完成Release Note的书写
+
+
+需要注意的是:
+
+* `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试Paddle的行为。
+* 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+
+# Paddle 分支规范
+
+Paddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
+
+* Paddle的主版本库遵循[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范。其中:
+	* `master`分支为稳定(stable branch)版本分支。每一个`master`分支的版本都是经过单元测试和回归测试的版本。
+	* `develop`分支为开发(develop branch)版本分支。每一个`develop`分支的版本都经过单元测试，但并没有经过回归测试。
+	* `release/版本号`分支为每一次Release时建立的临时分支。在这个阶段的代码正在经历回归测试。
+
+* 其他用户的fork版本库并不需要严格遵守[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，但所有fork的版本库的所有分支都相当于特性分支。
+	* 建议，开发者fork的版本库使用`develop`分支同步主版本库的`develop`分支
+	* 建议，开发者fork的版本库中，再基于`develop`版本fork出自己的功能分支。
+	* 当功能分支开发完毕后，向Paddle的主版本库提交`Pull Request`，进而进行代码评审。
+		* 在评审过程中，开发者修改自己的代码，可以继续在自己的功能分支提交代码。 
+
+* BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
+
+# Paddle回归测试列表
+
+本列表说明Paddle发版之前需要测试的功能点。
+
+## Paddle Book中所有章节
+
+Paddle每次发版本首先要保证Paddle Book中所有章节功能的正确性。功能的正确性包括验证Paddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。
+
+| | 新手入门章节 | 识别数字 | 图像分类 | 词向量 | 情感分析 | 语义角色标注 | 机器翻译 | 个性化推荐 |
+| --- | --- | --- | --- | --- | --- | --- | --- | --- |
+| API.V2 + Docker + GPU  |  |  |  |  |  |  |  |  |
+| API.V2 + Docker + CPU  |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Docker + CPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| API.V2 + Ubuntu + CPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + GPU |  |  |  |  |  |  |  |  |
+| `paddle_trainer` + Ubuntu + CPU |  |  |  |  |  |  |  |  |
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index 7d425a05d46131d84ba895d0fefc3a592a9a36e1..c14160d55ec8fdb9fc552da33f3a3dac13c1a764 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -232,7 +232,19 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
 
-10. A protocol message was rejected because it was too big
+11. CMake源码编译，Paddle版本号为0.0.0
+--------------------------------------
+
+如果运行 :code:`paddle version`, 出现 :code:`PaddlePaddle 0.0.0`；或者运行 :code:`cmake ..`，出现
+
+..  code-block:: bash
+
+    CMake Warning at cmake/version.cmake:20 (message):
+      Cannot add paddle version from git tag
+          
+那么用户需要拉取所有的远程分支到本机，命令为 :code:`git fetch upstream`，然后重新cmake即可。
+
+12. A protocol message was rejected because it was too big
 ----------------------------------------------------------
 
 如果在训练NLP相关模型时，出现以下错误：
@@ -270,7 +282,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
 
-11. 如何指定GPU设备
+13. 如何指定GPU设备
 -------------------
 
 例如机器上有4块GPU，编号从0开始，指定使用2、3号GPU：
@@ -287,21 +299,15 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
         paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
 
-12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
----------------------------------------------------
-
-目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
-:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
 
-..  code-block:: bash
-
-    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
-
-来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
+14. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+------------------------------------------------------------------------
 
-..  code-block:: bash
+Paddle二进制在运行时捕获了浮点数异常，只要出现浮点数异常(即训练过程中出现NaN或者Inf)，立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
+主要原因包括以下几个方面:
 
-    git submodule init
-    git submodule update
+* 训练过程中参数或者梯度的尺度过大，导致在参数累加、乘除等运算时发生浮点数溢出。
+* 模型一直不收敛，发散到了一个数值特别大的地方。
+* 训练数据有问题，导致参数收敛到了一些奇异的情况。或者输入数据尺度过大，有些特征的取值达到数百万，这时进行矩阵乘法运算就可能导致浮点数溢出。
 
-来获得所有第三方模块。
+主要的解决办法是减小学习率或者对数据进行归一化处理。
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
index d01cdaaeb75ec7d02480eb9162cabaad2a947db9..428f58830e0b10c024f31238b7404c6df193eecd 100644
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -55,7 +55,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     # 线性计算网络层: ȳ = wx + b
     ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
     # 计算误差函数，即  ȳ 和真实 y 之间的距离
-    cost = regression_cost(input= ȳ, label=y)
+    cost = mse_cost(input= ȳ, label=y)
     outputs(cost)
 
 
@@ -69,7 +69,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     
     - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
     - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-    - **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+    - **回归误差代价层**：回归误差代价层 `mse_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
 
 定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
 
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
index c10b897d4292d0c2b062b5c8e23466505afa408a..6775da20c2f51000f305b095d40abd27b8fa6c0e 100644
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi
         x = data_layer(name='x', size=1)
         y = data_layer(name='y', size=1)
         y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-        cost = regression_cost(input=y_predict, label=y)
+        cost = mse_cost(input=y_predict, label=y)
         outputs(cost)
 
 Some of the most fundamental usages of PaddlePaddle are demonstrated:
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index aaa07d49d3148266db27670a98c2b27db4dc0a8f..69f4501f370dcc9d603ec54a63d68568d66e832e 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
+* [4. Build on CentOS 7](#centos)
+
 
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -11,32 +13,22 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
-git submodule update --init --recursive
-```
-
-If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
-
-If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
 ```
-git submodule update --remote
-```
-
 ## <span id="requirements">Requirements</span>
 
 To compile the source code, your computer must be equipped with the following dependencies.
 
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
-- **CMake**: version >= 2.8
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
+- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
-- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
-- **Python**: only python 2.7 is supported currently
+- **Python**: only Python 2.7 is supported
 
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
 
 ### Options
 
-PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. 
+PaddlePaddle supports some build options. 
 
 <html>
 <table> 
@@ -47,12 +39,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 </tr>
 </thead>
 <tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
+<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
+<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
+<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
+<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
+<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
+<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
+<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
+<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
+<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
+<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
+<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
+<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
+<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
+<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
 </tbody>
 </table>
 </html>
@@ -64,18 +65,16 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 
 As a simple example, consider the following:  
 
-1. **Python Dependencies(optional)**
+1. **BLAS Dependencies(optional)**
   
-    To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows:
+    CMake will search for BLAS libraries on the system. If none is found, OpenBLAS will be downloaded, built and installed automatically.
+    To use a preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
-    # install swig on ubuntu
-    sudo apt-get install swig
-    # install swig on Mac OS X
-    brew install swig
-
-    # active swig in cmake
-    cmake .. -DWITH_SWIG_PY=ON
+    # specify MKL
+    cmake .. -DMKL_ROOT=<mkl_path>
+    # or specify OpenBLAS
+    cmake .. -DOPENBLAS_ROOT=<openblas_path>
     ```
 
 2. **Doc Dependencies(optional)**
@@ -99,24 +98,21 @@ As a simple example, consider the following:
 
 ### Install Dependencies
 
-- **CPU Dependencies**
+- **Paddle Dependencies**
 
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
-    # optional
-    sudo apt-get install libgoogle-glog-dev
-    sudo apt-get install libgflags-dev
-    sudo apt-get install libgtest-dev
-    sudo pip install wheel
-    pushd /usr/src/gtest
-    cmake .
-    make
-    sudo cp *.a /usr/lib
-    popd
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
+    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
+    sudo pip install 'protobuf==3.1.0.post1'
+
+    # install cmake 3.4
+    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
+        cd .. && rm -rf cmake-3.4.1
     ```
-  
+
 - **GPU Dependencies (optional)**
 
     To build GPU version, you will need the following installed:
@@ -149,51 +145,78 @@ As usual, the best option is to create build folder under paddle project directo
 
 ```bash
 mkdir build && cd build
-cmake ..
+``` 
+
+Finally, you can build and install PaddlePaddle:
+
+```bash
+# you can add build option here, such as:    
+cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
+# install PaddlePaddle Python modules.
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+## <span id="centos">Build on CentOS 7</span>
 
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
+### Install Dependencies
 
-As a simple example, consider the following:
+- **CPU Dependencies**
 
-- **Only CPU with swig**
+    ```bash
+    # necessary
+    sudo yum update
+    sudo yum install -y epel-release
+    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
+    sudo pip install wheel numpy
+    sudo pip install 'protobuf>=3.0.0'
+    ```
+  
+- **GPU Dependencies (optional)**
 
-  ```bash
-  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
-  ```
-- **GPU with swig**
+    To build GPU version, you will need the following installed:
 
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
-  ```
+        1. a CUDA-capable GPU
+        2. A supported version of Linux with a gcc compiler and toolchain
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    After downloading cuDNN library, issue the following commands:
+
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
 
-- **GPU with doc and swig**
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+
+### Build and Install
+
+As usual, the best option is to create build folder under paddle project directory.
 
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
-  ``` 
+```bash
+mkdir build && cd build
+``` 
 
-Finally, you can build PaddlePaddle:
+Finally, you can build and install PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
-```
-
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-
-```bash
-# you can run
+# install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-# or just run 
-sudo paddle version
 ```
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
index 3a52c8723bbccd70dd89e8913092d92813925f90..be0c1ffa451b2901ec06621dd4d886f800b4562e 100644
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -40,4 +40,4 @@ PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。
 
     cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
 
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
+注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 35234e0eb3ece3cb20d62841c1d75e60b485b9ea..da2d4234658b6ea4730346e721437cc1633c4362 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,158 +1,183 @@
-安装PaddlePaddle的Docker镜像
-============================
+PaddlePaddle的Docker容器使用方式
+================================
 
-PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Docker镜像是我们目前唯一官方支持的部署和运行方式。
+PaddlePaddle目前唯一官方支持的运行方式是Docker容器，因为Docker能在所有主要操作系统（包括Linux，Mac OS X和Windows）上运行。请注意，您需要更改 `Docker设置 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ 才能充分利用Mac OS X和Windows上的硬件资源。
 
-下述内容将分为如下几个类别描述。
 
-* PaddlePaddle提供的Docker镜像版本
-* 下载和运行Docker镜像
-* 注意事项
+PaddlePaddle发布的Docker镜像使用说明
+------------------------------------
 
-PaddlePaddle提供的Docker镜像版本
---------------------------------
+我们把PaddlePaddle的编译环境打包成一个镜像，称为开发镜像，里面涵盖了
+PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜
+像，称为生产镜像，里面涵盖了PaddlePaddle运行所需的所有环境。每次
+PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。生
+产镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在
+`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_ 提供最新
+的Docker镜像，可以在"tags"标签下找到最新的Paddle镜像版本。为了方便在国
+内的开发者下载Docker镜像，我们提供了国内的镜像服务器供大家使用。如果您
+在国内，请把文档里命令中的paddlepaddle/paddle替换成
+docker.paddlepaddle.org/paddle。
 
-我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddle-dev/paddle` ，tag分别为
+1. 开发镜像：:code:`paddlepaddle/paddle:<version>-dev`
 
-+-----------------+------------------+------------------------+-----------------------+
-|                 |   normal         |           devel        |          demo         |
-+=================+==================+========================+=======================+
-|       CPU       | cpu-latest       | cpu-devel-latest       | cpu-demo-latest       |
-+-----------------+------------------+------------------------+-----------------------+
-|       GPU       | gpu-latest       | gpu-devel-latest       | gpu-demo-latest       |
-+-----------------+------------------+------------------------+-----------------------+
-| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest |
-+-----------------+------------------+------------------------+-----------------------+
-| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest |
-+-----------------+------------------+------------------------+-----------------------+
+   这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境，完成开发，编译，发布，
+   文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具，所以如果需要自行配置开发环境需要考虑版本的因素。
+   开发镜像包含了以下工具：
+   
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+   很多开发者会使用远程的安装有GPU的服务器工作，用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec` 进入开发镜像并开始工作，
+   也可以在开发镜像中启动一个SSHD服务，方便开发者直接登录到镜像中进行开发:
 
-其中，横向包括三个版本，normal，devel和demo。
+   以交互容器方式运行开发镜像：
 
-* Normal: 正常的Docker image，只包括paddle的二进制
-* Devel: 包括Paddle的二进制、编译环境和源代码
-* Demo: 包括Paddle运行demo所需要的依赖
+   .. code-block:: bash
 
-纵向包括四个版本，他们是。
+      docker run -it --rm paddlepaddle/paddle:<version>-dev /bin/bash
 
-* CPU: CPU版本。需要支持AVX指令集的CPU
-* GPU: GPU版本。需要支持AVX指令集的CPU
-* CPU WITHOUT AVX: CPU版本，不支持AVX指令集的CPU也可以运行
-* GPU WITHOUT AVX: GPU版本，不需要AVX指令集的CPU也可以运行。
+   或者，可以以后台进程方式运行容器：
 
-用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU是否支持 :code:`AVX` 指令集\:
+   .. code-block:: bash
 
-..  code-block:: bash
+      docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:<version>-dev
 
-    if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
+   然后用密码 :code:`root` SSH进入容器：
 
-如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddle-dev/paddle:cpu-devel-latest` 来引用这个image。
+   .. code-block:: bash
 
-PaddlePaddle提供的镜像并不包含任何命令运行，想要运行PaddlePaddle，您需要进入镜像运行PaddlePaddle
-程序或者自定义一个含有启动脚本的image。具体请参考注意事项中的 :code:`使用ssh访问PaddlePaddle镜像`
+      ssh -p 2202 root@localhost
 
-下载和运行Docker镜像
---------------------
+   SSH方式的一个优点是我们可以从多个终端进入容器。比如，一个终端运行vi，另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上，并在笔记本上通过SSH与其连接。
 
-为了运行PaddlePaddle的docker镜像，您需要在机器中安装好Docker。安装Docker需要您的机器
-至少具有3.10以上的linux kernel。安装方法请参考
-`Docker的官方文档 <https://docs.docker.com/engine/installation/>`_ 。如果您使用
-mac osx或者是windows机器，请参考 
-`mac osx的安装文档 <https://docs.docker.com/engine/installation/mac/>`_ 和
-`windows 的安装文档 <https://docs.docker.com/engine/installation/windows/>`_ 。
+2. 生产镜像：根据CPU、GPU和非AVX区分了如下4个镜像：
 
-您可以使用 :code:`docker pull` 命令预先下载镜像，也可以直接执行 
-:code:`docker run` 命令运行镜像。执行方法如下:
+   - GPU/AVX：:code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX：:code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX：:code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX：:code:`paddlepaddle/paddle:<version>-noavx`
 
-..  code-block:: bash
-    
-    $ docker run -it paddledev/paddle:cpu-latest
+   纯CPU镜像以及GPU镜像都会用到AVX指令集，但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX：
 
-即可启动和进入PaddlePaddle的container。如果运行GPU版本的PaddlePaddle，则需要先将
-cuda相关的Driver和设备映射进container中，脚本类似于
+   .. code-block:: bash
 
-..  code-block:: bash
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-    $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
+   如果输出是No，就需要选择使用no-AVX的镜像
 
-进入Docker container后，运行 :code:`paddle version` 即可打印出PaddlePaddle的版本和构建
-信息。安装完成的PaddlePaddle主体包括三个部分， :code:`paddle` 脚本， python的
-:code:`paddle` 包和 :code:`py_paddle` 包。其中\:
+   以上方法在GPU镜像里也能用，只是请不要忘记提前在物理机上安装GPU最新驱动。
+   为了保证GPU驱动能够在镜像里面正常运行，我们推荐使用 `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_ 来运行镜像。
 
-* :code:`paddle` 脚本和 :code:`paddle` 的python包是PaddlePaddle的训练主要程序。使用 
-  :code:`paddle` 脚本可以启动PaddlePaddle的训练进程和pserver。而 :code:`paddle` 脚本
-  中的二进制使用了 :code:`paddle` 的python包来做配置文件解析等工作。
-* python包 :code:`py_paddle` 是一个swig封装的PaddlePaddle包，用来做预测和简单的定制化
-  训练。
+   .. code-block:: bash
 
-注意事项
---------
+      nvidia-docker run -it --rm paddledev/paddle:0.10.0rc1-gpu /bin/bash
 
-性能问题
-++++++++
+   注意: 如果使用nvidia-docker存在问题，你也可以尝试下面这种更老的方法，但是我们并不推荐这种方法：
 
-由于Docker是基于容器的轻量化虚拟方案，所以在CPU的运算性能上并不会有严重的影响。
-而GPU的驱动和设备全部映射到了容器内，所以GPU在运算性能上也不会有严重的影响。
+   .. code-block:: bash
 
-但是如果使用了高性能的网卡，例如RDMA网卡(RoCE 40GbE 或者 IB 56GbE)，或者高性能的
-以太网卡 (10GbE)。推荐使用将本地网卡，即 "--net=host" 来进行训练。而不使用docker
-的网桥来进行网络通信。
+      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+      docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:<version>-gpu
 
-远程访问问题和二次开发
-++++++++++++++++++++++
+3. 运行以及发布您的AI程序
 
-由于PaddlePaddle的Docker镜像并不包含任何预定义的运行命令。所以如果想要在后台启用ssh
-远程访问，则需要进行一定的二次开发，将ssh装入系统内并开启远程访问。二次开发可以
-使用Dockerfile构建一个全新的docker image。需要参考 
-`Dockerfile的文档 <https://docs.docker.com/engine/reference/builder/>`_ 和
-`Dockerfile的最佳实践 <https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/>`_ 
-两个文档。
+   假设您已经完成了一个AI训练的python程序 :code:`a.py`，这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行：
 
-简单的含有ssh的Dockerfile如下：
+   .. code-block:: bash
 
-..  code-block:: bash
+      docker run -it -v $PWD:/work paddle /work/a.py
 
-    FROM paddledev/paddle:cpu-latest
+   如果要使用GPU，请运行：
 
-    MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
+   .. code-block:: bash
 
-    RUN apt-get update
-    RUN apt-get install -y openssh-server
-    RUN mkdir /var/run/sshd
-    RUN echo 'root:root' | chpasswd
+      nvidia-docker run -it -v $PWD:/work paddle /work/a.py
 
-    RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-    RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 
-    EXPOSE 22
+   这里假设 ``a.py`` 的所有依赖都已包含在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像，可以编写 ``Dockerfile`` ，使用 ``FROM paddledev/paddle:<version>``
+   创建和发布自己的AI程序镜像。
 
-    CMD    ["/usr/sbin/sshd", "-D"]
+运行PaddlePaddle Book
+---------------------
 
+Jupyter Notebook是一个开源的web程序，大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。
 
-使用该Dockerfile构建出镜像，然后运行这个container即可。相关命令为\:
+PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
+如果您想要更深入了解deep learning，PaddlePaddle Book一定是您最好的选择。
 
-..  code-block:: bash
+我们提供可以直接运行PaddlePaddle Book的Docker镜像，直接运行：
 
-    # cd到含有Dockerfile的路径中
-    $ docker build . -t paddle_ssh
-    # 运行这个container，将宿主机的8022端口映射到container的22端口上
-    $ docker run -d -p 8022:22  --name paddle_ssh_machine paddle_ssh
+.. code-block:: bash
 
-执行如下命令即可以关闭这个container，并且删除container中的数据\:
+    docker run -p 8888:8888 paddlepaddle/book
 
-..  code-block:: bash
-    
-    # 关闭container
-    $ docker stop paddle_ssh_machine
-    # 删除container
-    $ docker rm paddle_ssh_machine
+然后在浏览器中输入以下网址：
 
-如果想要在外部机器访问这个container，即可以使用ssh访问宿主机的8022端口。用户名为
-root，密码也是root。命令为\:
+.. code-block:: text
 
-..  code-block:: bash
+    http://localhost:8888/
 
-    $ ssh -p 8022 root@YOUR_HOST_MACHINE
+就这么简单，享受您的旅程！
 
-至此，您就可以远程的使用PaddlePaddle啦。
+通过Docker容器开发PaddlePaddle
+------------------------------
+
+开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux，Mac OS X和Windows。
+
+1. 制作PaddlePaddle开发镜像
+
+   PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如何生成这个开发镜像。
+   生成Docker镜像的方式有两个，一个是直接把一个容器转换成镜像，另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷，适合自己实验，可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚，其他人很容易看懂镜像生成过程，持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成开发镜像只需要运行：
+
+   .. code-block:: bash
+      
+      git clone https://github.com/PaddlePaddle/Paddle.git
+      cd Paddle
+      docker build -t paddle:dev .
+
+   docker build这个命令的-t指定了生成的镜像的名字，这里我们用paddle:dev。到此，PaddlePaddle开发镜像就被构建完毕了。
+
+2. 制作PaddlePaddle生产镜像
+
+   生产镜像的生成分为两步，第一步是运行：
+
+   .. code-block:: bash
+      
+      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+
+   以上命令会编译PaddlePaddle，生成运行程序，以及生成创建生产镜像的Dockerfile。所有生成的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU，“WITH_AVX”控制生成的生产镜像是否支持AVX，“WITH_TEST”控制是否生成单元测试。
+
+   第二步是运行：
+
+   .. code-block:: bash
+      
+      docker build -t paddle:prod -f build/Dockerfile ./build
+
+   以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置，最终生成名为paddle:prod的生产镜像。
+
+3. 运行单元测试
+
+   运行以下指令：
+
+   .. code-block:: bash
+      
+      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+
+文档
+----
+
+Paddle的Docker开发镜像带有一个通过 `woboq code browser
+<https://github.com/woboq/woboq_codebrowser>`_ 生成的HTML版本的C++源代码，便于用户浏览C++源码。
+
+只要在Docker里启动PaddlePaddle的时候给它一个名字，就可以再运行另一个Nginx Docker镜像来服务HTML代码：
+
+.. code-block:: bash
+
+   docker run -d --name paddle-cpu-doc paddle:<version>-dev
+   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+
+接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 34279a29b2e4c84aa5039f2e5ab2c6ed9a06da2f..03df497506099d2fb758bd0ab437d2c082f2b537 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -8,163 +8,244 @@ Please be aware that you will need to change `Dockers settings
 <https://github.com/PaddlePaddle/Paddle/issues/627>`_ to make full use
 of your hardware resource on Mac OS X and Windows.
 
+Working With Docker
+-------------------
 
-Development Using Docker
-------------------------
+Docker is simple as long as we understand a few basic concepts:
 
-Developers can work on PaddlePaddle using Docker.  This allows
-developers to work on different platforms -- Linux, Mac OS X, and
-Windows -- in a consistent way.
+- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle Docker image includes pre-built PaddlePaddle, Python, and many Python packages. We can run a Docker image directly rather than installing all this software. We can type
 
-The general development workflow with Docker and Bazel is as follows:
+  .. code-block:: bash
 
-1. Get the source code of Paddle:
+     docker images
 
-   .. code-block:: bash
+  to list all images in the system. We can also run
 
-      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+  .. code-block:: bash
+		  
+     docker pull paddlepaddle/paddle:0.10.0rc2
 
-   
-   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
+  to download a Docker image, paddlepaddle/paddle in this example,
+  from Dockerhub.com.
 
-   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
-   empty, please use the following command to get the submodule.
+- *container*: considering a Docker image a program, a container is a
+  "process" that runs the image. Indeed, a container is exactly an
+  operating system process, but with a virtualized filesystem, network
+  port space, and other virtualized environment. We can type
 
-   .. code-block:: bash
+  .. code-block:: bash
 
-      git submodule update --init --recursive
+     docker run paddlepaddle/paddle:0.10.0rc2
 
+  to start a container to run a Docker image, paddlepaddle/paddle in this example.
 
-2. Build a development Docker image :code:`paddle:dev` from the source
-   code.  This image contains all the development tools and
-   dependencies of PaddlePaddle.
+- *volume*: By default, a Docker container has an isolated file system
+  namespace, so we cannot see the files in the host file system. By
+  using a *volume*, files on the host become visible inside the
+  container. The following command mounts the current directory into
+  /data inside a container started from the debian image and runs the
+  command :code:`ls /data`.
 
-   .. code-block:: bash
+  .. code-block:: bash
 
-      cd paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+     docker run --rm -v $(pwd):/data debian ls /data
 
-   Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such case, we can specify an apt-source mirror server that is geologically nearer to us. In the following example, we specified an apt-source server that responds fast in China.You can specify the UBUNTU MIRROR with :code:`--build-arg UBUNTU_MIRROR` like the example below.
+Usage of CPU-only and GPU Images
+----------------------------------
 
-   .. code-block:: bash
+We package PaddlePaddle's compile environment into a Docker image
+called the development image; it contains all the compiling tools that
+PaddlePaddle needs. We also package the compiled PaddlePaddle program
+into a Docker image called the production image; it contains the
+runtime environment that PaddlePaddle needs. For each version of
+PaddlePaddle, we release both of them. The production image comes in a
+CPU-only version, a CUDA GPU version, and no-AVX variants of each.
 
-      docker build \
-       --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
-       -t paddle:dev \
-       -f paddle/scripts/docker/Dockerfile .
+We put the docker images on `dockerhub.com
+<https://hub.docker.com/r/paddledev/paddle/>`_. You can find the
+latest versions under "tags" tab at dockerhub.com. If you are in
+China, you can use our Docker image registry mirror to speed up the
+download process. To use it, please replace all paddlepaddle/paddle in
+the commands with docker.paddlepaddle.org/paddle.
 
+1. Production image. This image has multiple variants:
 
-3. Run the image as a container and mounting local source code
-   directory into the container.  This allows us to change the code on
-   the host and build it within the container.
+   - GPU/AVX: :code:`paddlepaddle/paddle:<version>-gpu`
+   - GPU/no-AVX: :code:`paddlepaddle/paddle:<version>-gpu-noavx`
+   - CPU/AVX: :code:`paddlepaddle/paddle:<version>`
+   - CPU/no-AVX: :code:`paddlepaddle/paddle:<version>-noavx`
 
-   .. code-block:: bash
+   Please be aware that the CPU-only and the GPU images both use the
+   AVX instruction set, but old computers produced before 2008 do not
+   support AVX.  The following command checks if your Linux computer
+   supports AVX:
 
-      docker run       \
-       -d              \
-       --name paddle   \
-       -p 2022:22      \
-       -v $PWD:/paddle \
-       paddle:dev
+   .. code-block:: bash
 
-   where :code:`-d` makes the container running in background,
-   :code:`--name paddle` allows us to run a nginx container to serve
-   documents in this container, :code:`-p 2022:22` allows us to SSH
-   into this container, :code:`-v $PWD:/paddle` shares the source code
-   on the host with the container.
+      if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
 
-4. SSH into the container:
+   
+   To run the CPU-only image as an interactive container:
 
    .. code-block:: bash
 
-      ssh root@localhost -p 2022
+      docker run -it --rm paddlepaddle/paddle:0.10.0rc2 /bin/bash
+
+   The above method works with the GPU image too -- the recommended way
+   is to use `nvidia-docker <https://github.com/NVIDIA/nvidia-docker>`_.
 
-5. We can edit the source code in the container or on this host.  Then
-   we can build using cmake
+   Please install nvidia-docker first following this `tutorial
+   <https://github.com/NVIDIA/nvidia-docker#quick-start>`_.
+
+   Now you can run a GPU image:
 
    .. code-block:: bash
 
-      cd /paddle # where paddle source code has been mounted into the container
-      mkdir -p build
-      cd build
-      cmake -DWITH_TESTING=ON ..
-      make -j `nproc`
-      CTEST_OUTPUT_ON_FAILURE=1 ctest
+      nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+
+2. Development image :code:`paddlepaddle/paddle:<version>-dev`
 
+   This image packs the related development tools and runtime
+   environment. Users and developers can use this image instead of
+   their own local computer for development, building, releasing,
+   document writing, etc. Because different versions of PaddlePaddle
+   may depend on different versions of libraries and tools, you must
+   pay attention to these versions if you want to set up a local
+   environment yourself.  The development image contains:
+   
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+     
+   Many developers use servers with GPUs, they can use ssh to login to
+   the server and run :code:`docker exec` to enter the docker
+   container and start their work.  Also they can start a development
+   docker image with SSHD service, so they can login to the container
+   and start work.
 
-CPU-only and GPU Images
------------------------
 
-For each version of PaddlePaddle, we release 2 Docker images, a
-CPU-only one and a CUDA GPU one.  We do so by configuring
-`dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
-automatically runs the following commands:
+Train Model Using Python API
+----------------------------
 
-.. code-block:: bash
+Our official docker image provides a runtime for PaddlePaddle
+programs. The typical workflow will be as follows:
+
+Create a directory as workspace:
 
-   docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
-   docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
+.. code-block:: bash
 
+   mkdir ~/workspace
 
-To run the CPU-only image as an interactive container:
+Edit a PaddlePaddle python program using your favourite editor:
 
 .. code-block:: bash
 
-    docker run -it --rm paddledev/paddle:cpu-latest /bin/bash
+   emacs ~/workspace/example.py
 
-or, we can run it as a daemon container
+Run the program using docker:
 
 .. code-block:: bash
 
-    docker run -d -p 2202:22 paddledev/paddle:cpu-latest
+   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 python /workspace/example.py
 
-and SSH to this container using password :code:`root`:
+Or if you are using GPU for training:
 
 .. code-block:: bash
 
-    ssh -p 2202 root@localhost
+   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu python /workspace/example.py
+
+Above commands will start a docker container by running :code:`python
+/workspace/example.py`. It will stop once :code:`python
+/workspace/example.py` finishes.
 
-An advantage of using SSH is that we can connect to PaddlePaddle from
-more than one terminals.  For example, one terminal running vi and
-another one running Python interpreter.  Another advantage is that we
-can run the PaddlePaddle container on a remote server and SSH to it
-from a laptop.
+Another way is to tell docker to start a :code:`/bin/bash` session and
+run PaddlePaddle program interactively:
 
+.. code-block:: bash
 
-Above methods work with the GPU image too -- just please don't forget
-to install CUDA driver and let Docker knows about it:
+   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2 /bin/bash
+   # now we are inside docker container
+   cd /workspace
+   python example.py
+
+Running with GPU is identical:
 
 .. code-block:: bash
 
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
-    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
-    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
+   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0rc2-gpu /bin/bash
+   # now we are inside docker container
+   cd /workspace
+   python example.py
 
 
-Non-AVX Images
---------------
+Develop PaddlePaddle or Train Model Using C++ API
+---------------------------------------------------
 
-Please be aware that the CPU-only and the GPU images both use the AVX
-instruction set, but old computers produced before 2008 do not support
-AVX.  The following command checks if your Linux computer supports
-AVX:
+We will be using PaddlePaddle development image since it contains all
+compiling tools and dependencies.
 
-.. code-block:: bash
+1. Build PaddlePaddle develop image
+
+   Use following command to build PaddlePaddle develop image:
+
+   .. code-block:: bash
 
-   if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi
+      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
+      docker build -t paddle:dev .
 
+2. Build PaddlePaddle production image
 
-If it doesn't, we will need to build non-AVX images manually from
-source code:
+   There are two steps for building production image, the first step is to run:
+
+   .. code-block:: bash
+
+      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+
+   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+
+   The second step is to run:
+
+   .. code-block:: bash
+
+      docker build -t paddle:prod -f build/Dockerfile ./build
+
+   The above command will generate the production image by copying the compiled PaddlePaddle program into the image.
+
+3. Run unit test
+
+   Following command will run unit test:
+
+   .. code-block:: bash
+      
+      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+
+PaddlePaddle Book
+------------------
+
+The Jupyter Notebook is an open-source web application that allows
+you to create and share documents that contain live code, equations,
+visualizations and explanatory text in a single browser.
+
+PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
+We already exposed port 8888 for this book. If you want to
+dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
+
+We provide a packaged book image, simply issue the command:
 
 .. code-block:: bash
 
-   cd ~
-   git clone https://github.com/PaddlePaddle/Paddle.git
-   cd Paddle
-   git submodule update --init --recursive
-   docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
-   docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+    docker run -p 8888:8888 paddlepaddle/book
+
+Then, copy and paste the following address into your local browser:
+
+.. code-block:: text
+
+    http://localhost:8888/
+
+That's all. Enjoy your journey!
 
 
 Documentation
@@ -181,7 +262,7 @@ container:
 
 .. code-block:: bash
 
-   docker run -d --name paddle-cpu-doc paddle:cpu
+   docker run -d --name paddle-cpu-doc paddle:<version>
    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
 
diff --git a/doc/getstarted/build_and_install/ubuntu_install_cn.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
index d02d9c63bbfb50954d7b75f2c685ce167a3b7146..9e39ccb00f5d5655c30148900a3d76a22aacfc01 100644
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -46,7 +46,6 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
         with_double: OFF
         with_python: ON
         with_rdma: OFF
-        with_metric_learning:
         with_timer: OFF
         with_predict_sdk:
 
diff --git a/doc/getstarted/concepts/src/train.py b/doc/getstarted/concepts/src/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..679d0a931a7d650108ea89a04080a55d2976f72e
--- /dev/null
+++ b/doc/getstarted/concepts/src/train.py
@@ -0,0 +1,52 @@
+import paddle.v2 as paddle
+import numpy as np
+
+# init paddle
+paddle.init(use_gpu=False)
+
+# network config
+x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+cost = paddle.layer.mse_cost(input=y_predict, label=y)
+
+# create parameters
+parameters = paddle.parameters.create(cost)
+# create optimizer
+optimizer = paddle.optimizer.Momentum(momentum=0)
+# create trainer
+trainer = paddle.trainer.SGD(cost=cost,
+                             parameters=parameters,
+                             update_equation=optimizer)
+
+
+# event_handler to print training info
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        if event.batch_id % 1 == 0:
+            print "Pass %d, Batch %d, Cost %f" % (event.pass_id, event.batch_id,
+                                                  event.cost)
+
+
+# define training dataset reader
+def train_reader():
+    train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
+    train_y = np.array([-2, -3, -7, -7])
+
+    def reader():
+        for i in xrange(train_y.shape[0]):
+            yield train_x[i], train_y[i]
+
+    return reader
+
+
+# define feeding map
+feeding = {'x': 0, 'y': 1}
+
+# training
+trainer.train(
+    reader=paddle.batch(
+        train_reader(), batch_size=1),
+    feeding=feeding,
+    event_handler=event_handler,
+    num_passes=100)
diff --git a/doc/getstarted/concepts/use_concepts_cn.rst b/doc/getstarted/concepts/use_concepts_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e63ca11102c8ce457afcc3c262fa5f159361c01d
--- /dev/null
+++ b/doc/getstarted/concepts/use_concepts_cn.rst
@@ -0,0 +1,150 @@
+############
+基本使用概念
+############
+
+PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API，可以轻松地完成神经网络配置，模型训练等任务。
+这里将介绍PaddlePaddle的基本使用概念，并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
+在使用该文档之前，请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
+
+
+配置网络
+============
+
+加载PaddlePaddle
+----------------------
+
+在进行网络配置之前，首先需要加载相应的Python库，并进行初始化操作。
+
+..  code-block:: python
+
+    import paddle.v2 as paddle
+    import numpy as np
+    paddle.init(use_gpu=False)
+
+
+搭建神经网络
+-----------------------
+
+搭建神经网络就像使用积木搭建宝塔一样。在PaddlePaddle中，layer是我们的积木，而神经网络是我们要搭建的宝塔。我们使用不同的layer进行组合，来搭建神经网络。
+宝塔的底端需要坚实的基座来支撑，同样，神经网络也需要一些特定的layer作为输入接口，来完成网络的训练。
+
+例如，我们可以定义如下layer来描述神经网络的输入：
+
+..  code-block:: python
+
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(2))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+
+其中x表示输入数据是一个维度为2的稠密向量，y表示输入数据是一个维度为1的稠密向量。
+
+PaddlePaddle支持不同类型的输入数据，主要包括四种类型，和三种序列模式。
+
+四种数据类型：
+
+* dense_vector：稠密的浮点数向量。
+* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
+* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
+* integer_value：整数标签。
+
+三种序列模式：
+
+* SequenceType.NO_SEQUENCE：不是一条序列
+* SequenceType.SEQUENCE：是一条时间序列
+* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
+
+不同的数据类型和序列模式返回的格式不同，列表如下：
+
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
++======================+=====================+===================================+================================================+
+| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+
+其中，f代表一个浮点数，i代表一个整数。
+
+注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
+
+- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
+- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
+
+
+在定义输入layer之后，我们可以使用其他layer进行组合。在组合时，需要指定layer的输入来源。
+
+例如，我们可以定义如下的layer组合：
+
+..  code-block:: python
+
+    y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
+
+其中，x与y为之前描述的输入层；而y_predict是接收x作为输入，接上一个全连接层；cost接收y_predict与y作为输入，接上均方误差层。
+
+最后一层cost中记录了神经网络的所有拓扑结构，通过组合不同的layer，我们即可完成神经网络的搭建。
+
+
+训练模型
+============
+
+在完成神经网络的搭建之后，我们首先需要根据神经网络结构来创建所需要优化的parameters，并创建optimizer。
+之后，我们可以创建trainer来对网络进行训练。
+
+..  code-block:: python
+
+    parameters = paddle.parameters.create(cost)
+    optimizer = paddle.optimizer.Momentum(momentum=0)
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+其中，trainer接收三个参数，包括神经网络拓扑结构、神经网络参数以及迭代方程。
+
+在搭建神经网络的过程中，我们仅仅对神经网络的输入进行了描述。而trainer需要读取训练数据进行训练，PaddlePaddle中通过reader来加载数据。
+
+..  code-block:: python
+
+    # define training dataset reader
+    def train_reader():
+        train_x = np.array([[1, 1], [1, 2], [3, 4], [5, 2]])
+        train_y = np.array([-2, -3, -7, -7])
+        def reader():
+            for i in xrange(train_y.shape[0]):
+                yield train_x[i], train_y[i]
+        return reader
+
+最终我们可以调用trainer的train方法启动训练：
+
+..  code-block:: python
+
+    # define feeding map
+    feeding = {'x': 0, 'y': 1}
+
+    # event_handler to print training info
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+    # training
+    trainer.train(
+        reader=paddle.batch(train_reader(), batch_size=1),
+        feeding=feeding,
+        event_handler=event_handler,
+        num_passes=100)
+
+关于PaddlePaddle的更多使用方法请参考 `进阶指南 <../../howto/index_cn.html>`_。
+
+线性回归完整示例
+================
+
+下面给出在三维空间中使用线性回归拟合一条直线的例子：
+
+..  literalinclude:: src/train.py
+    :linenos:
+
+有关线性回归的实际应用，可以参考PaddlePaddle book的 `第一章节 <http://book.paddlepaddle.org/index.html>`_。
\ No newline at end of file
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index c6a4d3121c5857cd434acecb389d68f4d4c7a532..0cb27f802c40ef123fdc9c6799aad3b2a5f554c0 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -2,7 +2,9 @@
 ============
 
 ..  toctree::
-  :maxdepth: 2
+  :maxdepth: 1
 
   build_and_install/index_cn.rst
-  basic_usage/index_cn.rst
+  concepts/use_concepts_cn.rst
+
+- `深度学习入门课程 <http://book.paddlepaddle.org/>`_
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index 55d95d8015e56ddae3363d19315db0fad841caad..9f771e93e8b63eb98e31ec12667bd1aa007af20e 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -2,7 +2,8 @@ GET STARTED
 ============
 
 ..  toctree::
-  :maxdepth: 2
+  :maxdepth: 1
 
   build_and_install/index_en.rst
-  basic_usage/index_en.rst
+
+- `Deep Learning 101 <http://book.paddlepaddle.org/index.en.html>`_
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 943b1d4bb84646d9f60de7790be166a83d10b1e0..79048e92482851af6c2dd7d055868ebcaa7a298b 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -19,20 +19,20 @@
 
 在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
 
-pooling_layer
-==============
+pooling
+========
 
-pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_pooling_layer` 配置API。
+pooling 的使用示例如下，详细见 :ref:`api_v2.layer_pooling` 配置API。
 
 ..	code-block:: bash
 
-        seq_pool = pooling_layer(input=layer,
-                                 pooling_type=AvgPooling(),
-                                 agg_level=AggregateLevel.EACH_SEQUENCE)
+        seq_pool = pooling(input=layer,
+                           pooling_type=pooling.Max(),
+                           agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
+- `pooling_type` 目前支持两种，分别是：pooling.Max()和pooling.Avg()。
 
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
@@ -47,14 +47,14 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
 last_seq 和 first_seq
 =====================
 
-last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_seq` 类似），详细见 :ref:`api_trainer_config_helpers_layers_last_seq` 配置API。
+last_seq 的使用示例如下（ :ref:`api_v2.layer_first_seq` 类似），详细见 :ref:`api_v2.layer_last_seq` 配置API。
 
 ..	code-block:: bash
 
         last = last_seq(input=layer,
                         agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列
@@ -65,16 +65,16 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
   - 输入：必须是一个双层序列
   - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
 
-expand_layer
-============
+expand
+======
 
-expand_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers_layers_expand_layer` 配置API。
+expand 的使用示例如下，详细见 :ref:`api_v2.layer_expand` 配置API。
 
 ..	code-block:: bash
 
-        expand = expand_layer(input=layer1,
-                              expand_as=layer2,
-                              expand_level=ExpandLevel.FROM_TIMESTEP)
+        ex = expand(input=layer1,
+                    expand_as=layer2,
+                    expand_level=ExpandLevel.FROM_TIMESTEP)
         
 - `expand_level=ExpandLevel.FROM_TIMESTEP` 时（默认值）：
 
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9ecab5594cff47cde4700b7ce0f58013a960a16e..9e805ca85191b793c8798a239927a318c70b96f5 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,7 +4,6 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
-  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
index 7adc79873d699fdfd5a85034bcef964dd1f19132..13a153b05c578e0af82ee29db5ea27fd4b6d6f59 100644
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -1,7 +1,2 @@
 RNN Models
 ==========
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn_config_en.rst
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index e0a63f5a14c7b2e8953aa21739668ee2a9ebeff1..775938612e8d213b92e2eb69dae805838dc5ae96 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -1,131 +1,219 @@
 # 如何贡献代码
 
 我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
- 
+
 ## 代码要求
-- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
-- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 代码注释请遵守 [Doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 `WITH_STYLE_CHECK` 已打开，并且编译能通过代码样式检查。
 - 所有代码必须具有单元测试。
 - 通过所有单元测试。
 
 以下教程将指导您提交代码。
- 
 ## [Fork](https://help.github.com/articles/fork-a-repo/)
- 
-跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮，生成自己目录下的仓库，比如 <https://github.com/USERNAME/Paddle>。
 
 ## 克隆（Clone）
 
-Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
-**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+将远程仓库 clone 到本地：
+
+```bash
+➜  git clone https://github.com/USERNAME/Paddle
+➜  cd Paddle
+```
+
+
+## 创建本地分支
+
+Paddle 目前使用[Git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护，具体请参考 [Paddle 分支规范](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/releasing_process.md#paddle-分支规范)。
 
-一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+所有的 feature 和 bug fix 的开发工作都应该在一个新的分支上完成，一般从 `develop` 分支上创建新分支。
 
-```shell
-# 克隆 fork 到本地
-git clone --branch develop https://github.com/USERNAME/Paddle.git
+使用 `git checkout -b` 创建并切换到新分支。
+
+```bash
+➜  git checkout -b my-cool-stuff
 ```
-如果你的仓库不包含 **develop** 分支，你只需自己创建它。
 
-```shell
-git clone https://github.com/USERNAME/Paddle.git Paddle
-cd Paddle
-git checkout -b develop  # 创建 develop 分支
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
-git pull upstream develop  # 更新 upstream
-git submodule update --init --recursive
+值得注意的是，在 checkout 之前，需要保持当前分支目录 clean，否则会把 untracked 的文件也带到新分支上，这可以通过 `git status` 查看。
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 Git 预提交钩子。 它可以帮助我们格式化源代码（C++，Python），在提交（commit）前自动检查一些基本事宜（如每个文件只有一个 EOL，Git 中不要添加大文件等）。
+
+`pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子的 PR 不能被提交到 Paddle，首先安装并在当前目录运行它：
+
+```bash
+➜  pip install pre-commit
+➜  pre-commit install
 ```
 
-然后你可以通过做一个本地开发分支开始开发
+Paddle 使用 `clang-format` 来调整 C/C++ 源代码格式，请确保 `clang-format` 版本在 3.8 以上。
 
-```shell
-git checkout -b MY_COOL_STUFF_BRANCH
+## 开始开发
+
+在本例中，我删除了 README.md 中的一行，并创建了一个新文件。
+
+通过 `git status` 查看当前状态，这会提示当前目录的一些变化，同时也可以通过 `git diff` 查看文件具体被修改的内容。
+
+```bash
+➜  git status
+On branch my-cool-stuff
+Changes not staged for commit:
+  (use "git add <file>..." to update what will be committed)
+  (use "git checkout -- <file>..." to discard changes in working directory)
+
+	modified:   README.md
+
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+no changes added to commit (use "git add" and/or "git commit -a")
 ```
 
-## 使用 `pre-commit` 钩子
+## 构建和测试
+
+编译 PaddlePaddle 的源码以及生成文档需要多种开发工具。为了方便大家，我们的标准开发流程是把这些工具都装进一个Docker image，称为*开发镜像*，通常名字是 `paddle:dev`。然后所有用 `cmake && make` 的地方（比如IDE配置里）都用 `docker run paddle:dev`来代替。
 
-Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理git预提交钩子。 它可以帮助我们格式化源代码（cpp，python），在提交前检查一些基本事宜（每个文件只有一个 EOL 
-，git 中不要添加大文件）。 `pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子
-的 PR 不能提交代码到 Paddle。
+如要build这个开发镜像，在源码目录树的根目录中运行：
 
-你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
-目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+```bash
+➜  docker build -t paddle:dev .
+```
 
-然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
-提交你的代码时，pre-commit 钩子会检查本地代码是否存在
-不适合提交的东西，等等。
+随后可以用这个开发镜像来build PaddlePaddle的源码。比如如果要build一个不依赖GPU，但是支持AVX指令集，并且包括unit tests的PaddlePaddle，可以：
 
-## 提交（Commit）
+```bash
+➜  docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=ON" paddle:dev
+```
 
-提交你的代码：
+这个过程除了编译PaddlePaddle为 `./build/libpaddle.so`，并且输出一个 `./build/paddle.deb`文件之外，还会输出一个 `build/Dockerfile`。我们只需要运行下面命令把编译好的PaddlePaddle打包成一个*生产镜像*（`paddle:prod`）：
 
-```shell
-# 显示工作树状态
-git status
-# 添加修改过的文件
-git add xx
-env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```bash
+➜  docker build -t paddle:prod -f build/Dockerfile .
 ```
-提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
 
-## 保持 Fork 状态最新
+如果要运行所有的单元测试，可以用如下命令：
 
-在拉（pull）你的请求（request）之前，你应该从最新的 PaddlePaddle 同步代码。
-为此，你需要首先添加远程（remote）：
+```bash
+➜  docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+```
 
-```shell
-# 观察当前远程仓库配置
-git remote -v
-# 添加上游（upstream）仓库
-git remote add upstream https://github.com/PaddlePaddle/Paddle.git
-# 验证新的 upstream
-git remote -v
+关于构建和测试的更多信息，请参见[这篇文档](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/getstarted/build_and_install/docker_install_cn.rst)。
+
+## 提交（commit）
+
+接下来我们取消对 README.md 文件的改变，然后提交新添加的 test 文件。
+
+```bash
+➜  git checkout -- README.md
+➜  git status
+On branch my-cool-stuff
+Untracked files:
+  (use "git add <file>..." to include in what will be committed)
+
+	test
+
+nothing added to commit but untracked files present (use "git add" to track)
+➜  git add test
+```
+
+Git 每次提交代码，都需要写提交说明，这可以让其他人知道这次提交做了哪些改变，这可以通过`git commit` 完成。
+
+```bash
+➜  git commit
+CRLF end-lines remover...............................(no files to check)Skipped
+yapf.................................................(no files to check)Skipped
+Check for added large files..............................................Passed
+Check for merge conflicts................................................Passed
+Check for broken symlinks................................................Passed
+Detect Private Key...................................(no files to check)Skipped
+Fix End of Files.....................................(no files to check)Skipped
+clang-formater.......................................(no files to check)Skipped
+[my-cool-stuff c703c041] add test file
+ 1 file changed, 0 insertions(+), 0 deletions(-)
+ create mode 100644 test
+```
+
+## 保持本地仓库最新
+
+在准备发起 Pull Request 之前，需要同步原仓库（<https://github.com/PaddlePaddle/Paddle>）最新的代码。
+
+首先通过 `git remote` 查看当前远程仓库的名字。
+
+```bash
+➜  git remote
+origin
+➜  git remote -v
+origin	https://github.com/USERNAME/Paddle (fetch)
+origin	https://github.com/USERNAME/Paddle (push)
 ```
 
-用最新的 upstream 更新你的 fork：
+这里 origin 是我们 clone 的远程仓库的名字，也就是自己用户名下的 Paddle，接下来我们创建一个原始 Paddle 仓库的远程主机，命名为 upstream。
 
-```shell
-git pull --rebase upstream develop
+```bash
+➜  git remote add upstream https://github.com/PaddlePaddle/Paddle
+➜  git remote
+origin
+upstream
 ```
-如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
 
-现在，你的本地主分支与上游修改的一致并是最新的。
+获取 upstream 的最新代码并更新当前分支。
 
-## 推送（Push）到 GitHub
+```bash
+➜  git fetch upstream
+➜  git pull upstream develop
+```
+
+## Push 到远程仓库
+
+将本地的修改推送到 GitHub 上，也就是 <https://github.com/USERNAME/Paddle>。
 
-```shell
-# 在 GitHub 上 push 你的仓库
-git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```bash
+# 推送到远程仓库 origin 的 my-cool-stuff 分支上
+➜  git push origin my-cool-stuff
 ```
 
-## 拉取请求（Pull Request）
+## 建立 Issue 并完成 Pull Request
+
+建立一个 Issue 描述问题，并记录它的编号。
+
+切换到所建分支，然后点击 `New pull request`。
+
+<img width="295" alt="screen shot 2017-04-26 at 9 09 28 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436054/a6d98c66-2ac4-11e7-9cb1-18dd13150230.png">
 
-转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+选择目标分支：
 
-## 使用最新版本更新你的 pull 请求
+<img width="750" alt="screen shot 2017-04-26 at 9 11 52 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436139/f83b1e6c-2ac4-11e7-8c0e-add499023c46.png">
 
-在代码审查（code review）期间，由于 baidu/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+在 PR 的描述说明中，填写 `resolve #Issue编号` 可以在这个 PR 被 merge 后，自动关闭对应的 Issue，具体请见 <https://help.github.com/articles/closing-issues-via-commit-messages/>。
 
-```shell
-git checkout MY_COOL_STUFF_BRANCH
-git pull upstream develop
-# 你可能需要根据git提示解决冲突
-# 创建并测试你的代码
-git push origin MY_COOL_STUFF_BRANCH
+接下来等待 review，如果有需要修改的地方，参照上述步骤更新 origin 中的对应分支即可。
+
+## 删除远程分支
+
+在 PR 被 merge 进主仓库后，我们可以在 PR 的页面删除远程仓库的分支。
+
+<img width="775" alt="screen shot 2017-04-26 at 9 18 24 pm" src="https://cloud.githubusercontent.com/assets/11692045/25436457/e4cdd472-2ac5-11e7-9272-badc76c4a23e.png">
+
+也可以使用 `git push origin :分支名` 删除远程分支，如：
+
+```bash
+➜  git push origin :my-cool-stuff
 ```
-现在你的 Pull Request 是最新的了。
 
-## 修改你的 pull request
+## 删除本地分支
 
-当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+最后，删除本地分支。
 
-可能的命令是
+```bash
+# 切换到 develop 分支
+➜  git checkout develop 
 
-```shell
-git checkout MY_COOL_STUFF_BRANCH
-git pull upstream develop   # 将本地更新到最新的代码库
-# 可能会发生一些冲突
-# 开始开发吧！
-env EDITOR=vim git commit  # 添加修改日志
-git push origin MY_COOL_STUFF_BRANCH
+# 删除 my-cool-stuff 分支
+➜  git branch -D my-cool-stuff
 ```
+
+至此，我们就完成了一次代码贡献的过程。
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
index e578f6fce8b94180da7d5de041a0e17b1d59f6ea..9b0d3e83c0dc264650eda73e6801c60a75439b4a 100644
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -38,7 +38,6 @@ cd Paddle
 git checkout -b develop  # create develop branch.
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
 git pull upstream develop  # update to upstream
-git submodule update --init --recursive
 ```
 
 Then you can start to develop by making a local developement branch
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index 5051a892304fdc8b0f1a19a7d4560d5ee007c47d..d536f53abc031e9d279ace0e231a381a2f1e81b6 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -8,7 +8,8 @@ PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两
 如何构建PaddlePaddle的文档
 ==========================
 
-PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。构建PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
+PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式，我们提供了一个构建脚本build_docs.sh来进行构建。
+PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使用基于Docker来构建PaddlePaddle的文档。
 
 
 使用Docker构建PaddlePaddle的文档
@@ -16,39 +17,62 @@ PaddlePaddle的文档构建有直接构建和基于Docker构建两种方式。
 
 使用Docker构建PaddlePaddle的文档，需要在系统里先安装好Docker工具包。Docker安装请参考 `Docker的官网 <https://docs.docker.com/>`_ 。安装好Docker之后可以使用源码目录下的脚本构建文档，即
 
-..	code-block:: bash
+..  code-block:: bash
 
-	cd TO_YOUR_PADDLE_CLONE_PATH
-	cd paddle/scripts/tools/build_docs
-	bash build_docs.sh
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh with_docker
 
-编译完成后，该目录下会生成如下两个子目录\:
+编译完成后，会在当前目录生成两个子目录\:
 
 * doc 英文文档目录
 * doc_cn 中文文档目录
 
 打开浏览器访问对应目录下的index.html即可访问本地文档。
 
-..	code-block:: bash
-
-	open doc_cn/index.html
 
 
 直接构建PaddlePaddle的文档
 --------------------------
 
-TBD
+因为PaddlePaddle的v2 api文档生成过程依赖于py_paddle Python包，用户需要首先确认py_paddle包已经安装。
+
+..  code-block:: bash
+
+    python -c "import py_paddle"
+
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
+
+如果提示正确，可以执行以下命令编译生成文档，即
+
+..  code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh local
+
+编译完成之后，会在当前目录生成两个子目录\:
+
+* doc 英文文档目录
+* doc_cn 中文文档目录
+
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
 
 如何书写PaddlePaddle的文档
 ==========================
 
-TBD
+PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
 
 如何更新www.paddlepaddle.org文档
 ================================
 
-TBD
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
+`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+
 
 
-..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index bd3d0ec292057037414792b1ac176d12605b90d5..26449a6365843b526b3ac3111b337d2f17524c9d 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -8,8 +8,8 @@
   :maxdepth: 1
 
   usage/cmd_parameter/index_cn.rst
-  usage/concepts/use_concepts_cn.rst
   usage/cluster/cluster_train_cn.md
+  usage/k8s/k8s_basis_cn.md
   usage/k8s/k8s_cn.md
   usage/k8s/k8s_distributed_cn.md
 
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index acdcfa1c0047ced85c0a9c53d691edc0b4489336..274452fbf0c595ad7b4dbeffe85ad9038f12b458 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -6,7 +6,7 @@
 
 在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
 
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) ）的用户参考。
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
 
 ## 前提条件
 
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 30963dcd927250651f3ed0b39949f541cc28ed4a..c60876721cbf5565d6e48c8061811aacada748cd 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -2,7 +2,7 @@
 
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
 
 ## Prerequisite
 
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
index 833e21dd19ef3c01f5ef990bd12c3fc3b41ba483..f7aa525054468670f59309ddf9206af55bb77869 100644
--- a/doc/howto/usage/cmd_parameter/arguments_cn.md
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -127,11 +127,6 @@
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
-<tr>
-<td class="left">allow_inefficient_sparse_update</td>
-<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
-</tr>
-
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
@@ -233,16 +228,6 @@
 <td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
 </tr>
 
-<tr>
-<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
-<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
-</tr>
-
-<tr>
-<td class="left">data_server_port</td>
-<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
-</tr>
-
 <tr>
 <td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
 <td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md
index 013edbc9047817d7f6b82c4d5188412bd2ce41d6..d1963067bda949b11ececefed3db7db1432c6223 100644
--- a/doc/howto/usage/cmd_parameter/arguments_en.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
@@ -127,11 +127,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
-<tr>
-<td class="left">allow_inefficient_sparse_update</td>
-<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
-</tr>
-
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
@@ -233,16 +228,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
 <td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
 </tr>
 
-<tr>
-<td class="left" rowspan = "2">metric learning</td><td class="left">external</td>
-<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
-</tr>
-
-<tr>
-<td class="left">data_server_port</td>
-<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
-</tr>
-
 <tr>
 <td class="left" rowspan = "16">PServer</td><td class="left">start_pserver</td>
 <td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
index dbf7c6f00b8ba5c62d86fb2143221a27330b9506..b4625ba68cf23e5697554ba94efaf0b873f2c1de 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -180,15 +180,6 @@
   - 用户可以自定义beam search的方法，编译成动态库，供PaddlePaddle加载。 该参数用于指定动态库路径.
   - 类型: string (默认: "", null).
 
-## 度量学习(Metric Learning)
-* `--external`
-   - 指示是否使用外部机器进行度量学习.
-   - 类型: bool (默认: 0).
-
-* `--data_server_port`
-  - 数据服务器(data server)的监听端口，主要用在度量学习中.
-  - 类型: int32 (默认: 21134).
-
 ## 数据支持(DataProvider)
 
 * `--memory_threshold_on_load_data`
@@ -306,10 +297,6 @@
   - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
   - 类型: bool (默认: 0).
 
-* `--allow_inefficient_sparse_update`
-  - 指示是否允许低效率的稀疏更新.
-  - 类型: bool (默认: 0).
-
 * `--check_sparse_distribution_batches`
   - 每运行多少个批次执行一次稀疏参数分布的检查.
   - 类型: int32 (默认: 100).
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
index aa69a3bd5423c4f3223242bdafda251271925f2d..b681ebc81a355dfc1a7638a4463dff6979929a45 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -184,15 +184,6 @@
   - Specify shared dynamic library. It can be defined out of paddle by user.
   - type: string (default: "", null).
 
-## Metric Learning
-* `--external`
-   - Whether to use external machine for metric learning.
-   - type: bool (default: 0).
-
-* `--data_server_port`
-  - Listening port for dserver (data server), dserver is mainly used in metric learning.
-  - type: int32 (default: 21134).
-
 ## DataProvider
 
 * `--memory_threshold_on_load_data`
@@ -310,10 +301,6 @@
   - show log details for sparse parameter distribution in pserver.
   - type: bool (default: 0).
 
-* `--allow_inefficient_sparse_update`
-  - Whether to allow inefficient sparse update.
-  - type: bool (default: 0).
-
 * `--check_sparse_distribution_batches`
   - Running sparse parameter distribution check every so many batches.
   - type: int32 (default: 100).
diff --git a/doc/howto/usage/concepts/src/pserver_topology.dot b/doc/howto/usage/concepts/src/pserver_topology.dot
deleted file mode 100644
index 9ff658b8495030f322d4f553f3bf72ddf8d3a578..0000000000000000000000000000000000000000
--- a/doc/howto/usage/concepts/src/pserver_topology.dot
+++ /dev/null
@@ -1,68 +0,0 @@
-graph pp_topology {
-	rankdir=BT;
-	subgraph cluster_node0 {
-		style=filled;
-		color=lightgrey;
-		node [style=filled, color=white, shape=box];
-		label = "机器0"
-
-		pserver0 [label="Parameter \n Server 0"]
-		trainer0 [label="Trainer 0"]
-	}
-	subgraph cluster_node1 {
-		style=filled;
-		color=lightgrey;
-		node [style=filled, color=white, shape=box];
-		label = "机器1"
-
-		pserver1 [label="Parameter \n Server 1"]
-		trainer1 [label="Trainer 1"]
-	}
-
-	subgraph cluster_node2 {
-		style=filled;
-		color=lightgrey;
-		node [style=filled, color=white, shape=box];
-		label = "机器2"
-
-		pserver2 [label="Parameter \n Server 2"]
-		trainer2 [label="Trainer 2"]
-	}
-
-	subgraph cluster_node3 {
-		style=filled;
-		color=lightgrey;
-		node [style=filled, color=white, shape=box];
-		label = "机器3"
-
-		pserver3 [label="Parameter \n Server 3"]
-		trainer3 [label="Trainer 3"]
-	}
-
-	data [label="数据", shape=hexagon]
-
-	trainer0 -- pserver0
-	trainer0 -- pserver1
-	trainer0 -- pserver2
-	trainer0 -- pserver3
-
-	trainer1 -- pserver0
-	trainer1 -- pserver1
-	trainer1 -- pserver2
-	trainer1 -- pserver3
-
-	trainer2 -- pserver0
-	trainer2 -- pserver1
-	trainer2 -- pserver2
-	trainer2 -- pserver3
-
-	trainer3 -- pserver0
-	trainer3 -- pserver1
-	trainer3 -- pserver2
-	trainer3 -- pserver3
-
-	data -- trainer0
-	data -- trainer1
-	data -- trainer2
-	data -- trainer3
-}
diff --git a/doc/howto/usage/concepts/src/trainer_config.py b/doc/howto/usage/concepts/src/trainer_config.py
deleted file mode 100644
index 3eccbd7bc11f4865130286de718d1be74e4d1722..0000000000000000000000000000000000000000
--- a/doc/howto/usage/concepts/src/trainer_config.py
+++ /dev/null
@@ -1,29 +0,0 @@
-from paddle.trainer_config_helpers import *
-
-define_py_data_sources2(
-    train_list='train.list',
-    test_list='test.list',
-    module='provider',
-    obj='process')
-settings(
-    batch_size=128,
-    learning_rate=1e-3,
-    learning_method=AdamOptimizer(),
-    regularization=L2Regularization(0.5))
-
-img = data_layer(name='pixel', size=28 * 28)
-
-hidden1 = simple_img_conv_pool(
-    input=img, filter_size=3, num_filters=32, pool_size=3, num_channel=1)
-
-hidden2 = fc_layer(
-    input=hidden1,
-    size=200,
-    act=TanhActivation(),
-    layer_attr=ExtraAttr(drop_rate=0.5))
-predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
-
-outputs(
-    classification_cost(
-        input=predict, label=data_layer(
-            name='label', size=10)))
diff --git a/doc/howto/usage/concepts/use_concepts_cn.rst b/doc/howto/usage/concepts/use_concepts_cn.rst
deleted file mode 100644
index fa334bcbb9e29d6943def7c35fa53e1b9262d29c..0000000000000000000000000000000000000000
--- a/doc/howto/usage/concepts/use_concepts_cn.rst
+++ /dev/null
@@ -1,139 +0,0 @@
-############
-基本使用概念
-############
-
-PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。
-
-单机模式用命令 ``paddle train`` 可以启动一个trainer进程，单机训练通常只包括一个trainer进程。如果数据规模比较大，希望加速训练，可以启动分布式作业。一个分布式作业里包括若干trainer进程和若干Parameter Server（或称pserver）进程。用命令 ``paddle pserver`` 可以启动 pserver 进程，pserver进程用于协调多个trainer进程之间的通信。
-
-本文首先介绍trainer进程中的一些使用概念，然后介绍pserver进程中概念。
-
-..    contents::
-
-系统框图
-========
-
-下图描述了用户使用框图，PaddlePaddle的trainer进程里内嵌了Python解释器，trainer进程可以利用这个解释器执行Python脚本，Python脚本里定义了模型配置、训练算法、以及数据读取函数。其中，数据读取程序往往定义在一个单独Python脚本文件里，被称为数据提供器（DataProvider），通常是一个Python函数。模型配置、训练算法通常定义在另一单独Python文件中, 称为训练配置文件。下面将分别介绍这两部分。
-
-..    graphviz:: 
-
-    digraph pp_process {
-        rankdir=LR;
-        config_file [label="用户神经网络配置"];
-        subgraph cluster_pp {
-            style=filled;
-            color=lightgrey;
-            node [style=filled, color=white, shape=box];
-            label = "PaddlePaddle C++";
-            py [label="Python解释器"];
-        }
-        data_provider [label="用户数据解析"];
-        config_file -> py;
-        py -> data_provider [dir="back"];
-    }
-
-数据提供器
-==========
-
-DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据转换成系统可以识别的数据类型。每当系统需要新的数据训练时, trainer进程会调用DataProvider函数返回数据。当所有数据读取完一轮后，DataProvider返回空数据，通知系统一轮数据读取结束，并且系统每一轮训练开始时会重置DataProvider。需要注意的是，DataProvider是被系统调用，而不是新数据驱动系统，一些随机化噪声添加都应该在DataProvider中完成。
-
-在不同的应用里，训练数据的格式往往各不相同。因此，为了用户能够灵活的处理数据，我们提供了Python处理数据的接口，称为 ``PyDataProvider`` 。在 ``PyDataProvider`` 中，系统C++模块接管了shuffle、处理batch、GPU和CPU通信、双缓冲、异步读取等问题，一些情况下(如：``min_pool_size=0``)需要Python接口里处理shuffle，可以参考 :ref:`api_pydataprovider2` 继续深入了解。
-
-
-训练配置文件
-============
-
-训练配置文件主要包括数据源、优化算法、网络结构配置三部分。 其中数据源配置与DataProvider的关系是：DataProvider里定义数据读取函数，训练配置文件的数据源配置中指定DataProvider文件名字、生成数据函数接口，请不要混淆。
-
-一个简单的训练配置文件为：
-
-..  literalinclude:: src/trainer_config.py
-    :linenos:
-
-文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定以Python接口来配置网络，该Python代码可以生成protobuf包，这就是 :ref:`api_trainer_config` 的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。
-
-下面分别介绍数据源配置、优化算法配置、网络结构配置这三部分该概念。
-
-数据源配置
-----------
-
-使用 ``PyDataProvider2`` 的函数 ``define_py_data_sources2`` 配置数据源。``define_py_data_sources2`` 里通过train_list和test_list指定是训练文件列表和测试文件列表。 如果传入字符串的话，是指一个数据列表文件。这个数据列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话，则会默认生成一个list文件，再传入给train.list或者test.list。
-
-``module`` 和 ``obj`` 指定了DataProvider的文件名和返回数据的函数名。更详细的使用，请参考 :ref:`api_pydataprovider2` 。
-
-优化算法配置
-------------
-
-通过 :ref:`api_trainer_config_helpers_optimizers_settings` 接口设置神经网络所使用的训练参数和 :ref:`api_trainer_config_helpers_optimizers` ，包括学习率、batch_size、优化算法、正则方法等，具体的使用方法请参考 :ref:`api_trainer_config_helpers_optimizers_settings` 文档。
-
-网络结构配置
-------------
-
-神经网络配置主要包括网络连接、激活函数、损失函数、评估器。
-
-- 网络连接： 主要由Layer组成，每个Layer返回的都是一个 ``LayerOutput`` 对象，Layer里面可以定义参数属性、激活类型等。
-
-  为了更灵活的配置，PaddlePaddle提供了基于 Projection 或者 Operator 的配置，这两个需要与 ``mixed_layer`` 配合使用。这里简单介绍Layer、Projection、Operator的概念:
-
-  - Layer: 神经网络的某一层，可以有可学习的参数，一般是封装了许多复杂操作的集合。
-  - Projection：需要与 ``mixed_layer`` 配合使用，含可学习参数。
-  - Operator： 需要与 ``mixed_layer`` 配合使用，不含可学习参数，输入全是其他Layer的输出。
-
- 
-  这个配置文件网络由 ``data_layer`` 、 ``simple_img_conv_pool`` 、 ``fc_layer`` 组成。
-
-  - :ref:`api_trainer_config_helpers_layers_data_layer`  ： 通常每个配置文件都会包括 ``data_layer`` ，定义输入数据大小。
-  - :ref:`api_trainer_config_helpers_network_simple_img_conv_pool` ：是一个组合层，包括了图像的卷积 (convolution)和池化(pooling)。
-  - :ref:`api_trainer_config_helpers_layers_fc_layer` ：全连接层，激活函数为Softmax，这里也可叫分类层。
-
-- 损失函数和评估器：损失函数即为网络的优化目标，评估器可以评价模型结果。
-
-  PaddlePaddle包括很多损失函数和评估起，详细可以参考 :ref:`api_trainer_config_helpers_layers_cost_layers` 和 :ref:`api_trainer_config_helpers_evaluators` 。这里 ``classification_cost`` 默认使用多类交叉熵损失函数和分类错误率统计评估器。
-  
-- ``outputs``: 标记网络输出的函数为 ``outputs`` 。
-
-  训练阶段，网络的输出为神经网络的优化目标；预测阶段，网络的输出也可通过 ``outputs`` 标记。
-
-
-这里对 ``mixed_layer`` 稍做详细说明， 该Layer将多个输入(Projection 或 Operator)累加求和，具体计算是通过内部的 Projection 和 Operator 完成，然后加 Bias 和 activation 操作，
-
-例如，和 ``fc_layer`` 同样功能的 ``mixed_layer`` 是:
-
-..    code-block:: python
-   
-       data = data_layer(name='data', size=200)
-       with mixed_layer(size=200) as out:
-           out += full_matrix_projection(input=data)
-
-PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 :ref:`api_trainer_config_helpers_layers_mixed_layer` 的相关文档进行配置。
-
-
-分布式训练
-==========
-
-PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trainer 进行同步。多机训练的经典拓扑结构如下\:
-
-..    graphviz:: src/pserver_topology.dot
-
-图中每个灰色方块是一台机器，在每个机器中，先使用命令 ``paddle pserver`` 启动一个pserver进程，并指定端口号，可能的参数是\:
-
-..    code-block:: bash
-
-    paddle pserver --port=5000 --num_gradient_servers=4 --tcp_rdma='tcp' --nics='eth0'
-
-* ``--port=5000`` : 指定 pserver 进程端口是 5000 。
-* ``--gradient_servers=4`` : 有四个训练进程(PaddlePaddle 将 trainer 也称作 GradientServer ，因为其为负责提供Gradient) 。
-* ``--tcp_rdma='tcp' --nics=`eth0```: 指定以太网类型为TCP网络，指定网络接口名字为eth0。
-
-启动之后 pserver 进程之后，需要启动 trainer 训练进程，在各个机器上运行如下命令\:
-
-..    code-block:: bash
-
-    paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
-
-对于简单的多机协同训练使用上述方式即可。另外，pserver/train 通常在高级情况下，还需要设置下面两个参数\：
-
-* --ports_num\: 一个 pserver 进程共绑定多少个端口用来做稠密更新，默认是1。
-* --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新，默认是0。
-
-使用手工指定端口数量，是因为Paddle的网络通信中，使用了 int32 作为消息长度，比较容易在大模型下溢出。所以，在 pserver 进程中可以启动多个子线程去接受 trainer 的数据，这样单个子线程的长度就不会溢出了。但是这个值不可以调的过大，因为增加这个值，对性能尤其是内存占用有一定的开销，另外稀疏更新的端口如果太大的话，很容易导致某一个参数服务器没有分配到任何参数。
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index b04bfba590de42956dfe99256cde325b24adbfab..ce72b0803818d5bf0c18753c421848cf2fc1b668 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -1,26 +1,67 @@
-# Kubernetes on AWS
 
-## Create AWS Account and IAM Account
+# Distributed PaddlePaddle Training on AWS with Kubernetes
 
-To use AWS, we need to sign up an AWS account on Amazon's Web site.
-An AWS account allows us to login to the AWS Console Web interface to
-create IAM users and user groups. Usually, we create a user group with
-privileges required to run PaddlePaddle, and we create users for
-those who are going to run PaddlePaddle and add these users into the
-group. IAM users can identify themselves using password and tokens,
-where passwords allows users to log in to the AWS Console, and tokens
-make it easy for users to submit and inspect jobs from the command
-line.
+We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
+
+## Distributed PaddlePaddle Training Core Concepts
+
+### Distributed Training Job
+
+A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
+
+Each Kubernetes job is described by a job config file, which specifies information such as the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables.
+
+In a distributed training job, we would:
+
+1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and
+1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
+
+### Parameter Servers and Trainers
+
+There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
+
+<center>![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)</center>
+
+In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be scheduled onto another node of different ip address. However, now we are using static ip. This will be improved.
+
+Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job.
+
+### Trainer ID
+
+Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
+
+### Training
+
+The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
+
+We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
+
+1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
+1. Copy the training data from EFS persistent volume into container.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. Trainer with `trainer_id` 0 will automatically write results onto EFS volume.
+
+
+## PaddlePaddle on AWS with Kubernetes
+
+### Choose AWS Service Region
+This tutorial requires several AWS services to work in the same region. Before we create anything in AWS, please check the following link
+https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/
+Choose a region which has the following services available: EC2, EFS, VPS, CloudFormation, KMS, VPC, S3.
+In this tutorial, we use "Oregon(us-west-2)" as example.
+
+### Create AWS Account and IAM Account
+
+Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
 
 To sign up an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
-To create users and user groups under an AWS account, please
+To create IAM users and user groups under an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
 
-Please be aware that this tutorial needs the following privileges in
-the user group:
+Please be aware that this tutorial needs the following privileges for the user in IAM:
 
 - AmazonEC2FullAccess
 - AmazonS3FullAccess
@@ -31,25 +72,16 @@ the user group:
 - IAMUserSSHKeys
 - IAMFullAccess
 - NetworkAdministrator
+- AWSKeyManagementServicePowerUser
 
 
-By the time we write this tutorial, we noticed that Chinese AWS users
-might suffer from authentication problems when running this tutorial.
-Our solution is that we create a VM instance with the default Amazon
-AMI and in the same zone as our cluster runs, so we can SSH to this VM
-instance as a tunneling server and control our cluster and jobs from
-it.
-
-
-## PaddlePaddle on AWS
-
-Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
-
-
-###Download kube-aws and kubectl
+### Download kube-aws and kubectl
 
-####kube-aws
+#### kube-aws
 
+[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
+##### Verify kube-aws integrity
+Note: if you are using a non-official release (e.g. an RC release) of kube-aws, you can skip this step.
 Import the CoreOS Application Signing Public Key:
 
 ```
@@ -63,7 +95,7 @@ gpg2 --fingerprint FC8A365E
 ```
 The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
 
-Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1
 
 Validate the tarball's GPG signature:
 
@@ -74,7 +106,7 @@ PLATFORM=darwin-amd64
 
 gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
 ```
-
+##### Install kube-aws
 Extract the binary:
 
 ```
@@ -88,34 +120,39 @@ mv ${PLATFORM}/kube-aws /usr/local/bin
 ```
 
 
-####kubectl
+#### kubectl
 
-Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
 
-Extract the tarball and then concate the kubernetes binaries directory into PATH:
+Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
 
 ```
-export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
 
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
 ```
 
-User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
-
+Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
 
-###Configure AWS Credentials
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
 
-First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+### Configure AWS Credentials
 
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
 
 And then configure your AWS account information:
 
 ```
 aws configure
-
 ```
 
 
-Fill in the required fields (You can get your AWS aceess key id and AWS secrete access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+Fill in the required fields:
 
 
 ```
@@ -123,36 +160,44 @@ AWS Access Key ID: YOUR_ACCESS_KEY_ID
 AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
 Default region name: us-west-2
 Default output format: json
-
 ```
 
-Test that your credentials work by describing any instances you may already have running on your account:
+`YOUR_ACCESS_KEY_ID` and `YOUR_SECRETE_ACCESS_KEY` are the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account).
+
+Verify that your credentials work by describing any instances you may already have running on your account:
 
 ```
 aws ec2 describe-instances
 ```
 
-###Define Cluster Parameters
+### Define Cluster Parameters
 
-####EC2 key pair
+#### EC2 key pair
 
 The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
 
-After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair.
+
+After creating a key pair, you will use the key pair name to configure the cluster.
+
+Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to create key pairs in that region (Oregon).
 
-####KMS key
+Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later.
+
+
+#### KMS key
 
 Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
 
-You can create a KMS key in the AWS console, or with the aws command line tool:
+You can create a KMS key with the aws command line tool:
 
 ```
-$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
 {
     "KeyMetadata": {
         "CreationDate": 1458235139.724,
         "KeyState": "Enabled",
-        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
         "AWSAccountId": "xxxxxxxxxxxxx",
         "Enabled": true,
         "KeyUsage": "ENCRYPT_DECRYPT",
@@ -162,14 +207,16 @@ $ aws kms --region=us-west-2 create-key --description="kube-aws assets"
 }
 ```
 
-You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+We will need to use the value of `Arn` later.
+
+And then let's add several inline policies in your IAM user permission.
 
-And then you need to add several inline policies in your user permission.
+Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`.
 
-kms inline policy:
+Paste in the following inline policies:
 
 ```
-{
+{
     "Version": "2012-10-17",
     "Statement": [
         {
@@ -180,18 +227,10 @@ kms inline policy:
                 "kms:Encrypt"
             ],
             "Resource": [
-                "arn:aws:kms:*:xxxxxxxxx:key/*"
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
             ]
-        }
-    ]
-}
-```
-cloudformation inline policy:
-
-```
-"Version": "2012-10-17",
-    "Statement": [
-        {
+        },
+        {
             "Sid": "Stmt1482205746000",
             "Effect": "Allow",
             "Action": [
@@ -200,26 +239,44 @@ cloudformation inline policy:
                 "cloudformation:DeleteStack",
                 "cloudformation:DescribeStacks",
                 "cloudformation:DescribeStackResource",
-                "cloudformation:GetTemplate"
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
             ],
             "Resource": [
-                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
             ]
         }
     ]
 }
 ```
+`Version` : Its value has to be exactly "2012-10-17".
+`AWS_ACCOUNT_ID`: You can get it from following command line:
 
+```
+aws sts get-caller-identity --output text --query Account
+```
 
-####External DNS name
+`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. 
+Please note, stack name must satisfy regular expression pattern: `[a-zA-Z][-a-zA-Z0-9*]*`, which means it must start with a letter and must not contain "_", or kube-aws will throw an error in later steps.
 
-When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+#### External DNS name
 
-####S3 bucket
+When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
+
+The DNS name should have a CNAME pointing to the cluster DNS name or an A record pointing to the cluster IP address.
+
+We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps.
+
+#### S3 bucket
 
 You need to create an S3 bucket before startup the Kubernetes cluster.
 
-####Initialize an asset directory
+There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2).
+
+Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon).
+
+
+#### Initialize Assets
 
 Create a directory on your local machine to hold the generated assets:
 
@@ -231,284 +288,245 @@ $ cd my-cluster
 Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
 
 ```
-$ kube-aws init \
---cluster-name=my-cluster-name \
---external-dns-name=my-cluster-endpoint \
---region=us-west-1 \
---availability-zone=us-west-1c \
---key-name=key-pair-name \
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
 --kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
 ```
 
-There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
 
-####Render contents of the asset directory
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
 
-In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
 
-```
-$ kube-aws render credentials --generate-ca
-```
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
 
-The next command generates the default set of cluster assets in your asset directory.
+Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
 
-```
-sh $ kube-aws render stack
-```
+Please check if `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-2b`, or `us-west-2c`)
 
-Here's what the directory structure looks like:
 
-```
-$ tree
-.
-├── cluster.yaml
-├── credentials
-│   ├── admin-key.pem
-│   ├── admin.pem
-│   ├── apiserver-key.pem
-│   ├── apiserver.pem
-│   ├── ca-key.pem
-│   ├── ca.pem
-│   ├── worker-key.pem
-│   └── worker.pem
-│   ├── etcd-key.pem
-│   └── etcd.pem
-│   ├── etcd-client-key.pem
-│   └── etcd-client.pem
-├── kubeconfig
-├── stack-template.json
-└── userdata
-    ├── cloud-config-controller
-    └── cloud-config-worker
-```
-
-These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
 
+By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3.
 
-###Kubernetes Cluster Start Up
 
-####Create the instances defined in the CloudFormation template
+#### Render contents of the asset directory
 
-Now for the exciting part, creating your cluster:
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
 
 ```
-$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+kube-aws render credentials --generate-ca
 ```
 
-####Configure DNS
-
-You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
-
-####Access the cluster
-
-Once the API server is running, you should see:
+The next command generates the default set of cluster assets in your asset directory.
 
 ```
-$ kubectl --kubeconfig=kubeconfig get nodes
-NAME                                       STATUS                     AGE
-ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
-ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
-ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+kube-aws render stack
 ```
+Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder.
 
 
-###Setup PaddlePaddle Environment on AWS
-
-Now, we've created a cluster with following network capability:
-
-1. All Kubernetes nodes can communicate with each other.
-
-1. All Docker containers on Kubernetes nodes can communicate with each other.
-
-1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
-
-1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
-
-
-For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+### Kubernetes Cluster Start Up
 
+#### Create the instances defined in the CloudFormation template
 
-1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
+Now let's create your cluster (choose any `PREFIX` for the command below):
 
-1. Create the Elastic File System in AWS console, and attach the new VPC with it.
-<center>![](src/create_efs.png)</center>
-
-
-1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
-<center>![](src/add_security_group.png)</center>
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
 
+`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
 
-1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
-<center>![](src/efs_mount.png)</center>
 
+#### Configure DNS
 
-Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
 
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
 
+If you own a DNS name, set the A record to any of the IP addresses found in the "Find IP address" step below. __Or__ you can set up a CNAME pointing to `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
 
-###Core Concept of PaddlePaddle Training on AWS
+##### Find IP address
 
-Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume, in this training demo, we will create three Kubernetes pod and scheduling them on 3 node. Each pod contains a PaddlePaddle container. When container gets created, it will start pserver and trainer process, load the training data from EFS volume and start the distributed training task.
+Use command `dig` to check the load balancer hostname to get the ip address.
 
-####Use Kubernetes Job
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
 
-We use Kubernetes job to represent one time of distributed training. After the job get finished, Kubernetes will destroy job container and release all related resources.
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
 
-We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc., these information are passed into container for processes to use as environment variables.
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
 
-In one time of distributed training, user will confirm the PaddlePaddle node number first. And then upload the pre-divided training data and configuration file onth EFS volume. And then create the Kubernetes job yaml file; submit to the Kubernetes cluster to start the training job.
+In the above output, both ip `54.241.164.52`, `54.67.102.112` will work.
 
-####Create PaddlePaddle Node
+*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster".
 
-After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number), Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+*If you do not own a DNS name*:
+##### Update local DNS association
+Edit `/etc/hosts` to associate above ip with the DNS name.
+##### Add Route53 private name service in VPC
+ - Open [Route53 Console](https://console.aws.amazon.com/route53/home)
+ - Create hosted zone with following config
+   - Domain name: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
 
+   ![route53 zone setting](src/route53_create_zone.png)
+ - Add A record
+    - Click on the zone "paddle" just created
+    - Click the button "Create record set"
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
 
-####Start up Training
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - Verify name service
+    - Connect to any instance created by kube-aws via ssh
+    - Run command "host paddle", see if the ip returned is the private ip of kube-controller
 
-After container gets started, it starts up the distributed training by using scripts. We know `paddle train` process need to know other node's ip address and it's own trainer_id, since PaddlePaddle currently don't have the ability to do the service discovery, so in the start up script, each node will use job pod's name to query all to pod info from Kubernetes apiserver (apiserver's endpoint is an environment variable in container by default).
+#### Access the cluster
 
-With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as it's trainer_id. The workflow of starting up the script is as follows:
+Once the API server is running, you should see:
 
-1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
-1. Copy the training data from EFS sharing volume into container.
-1. Parse the `paddle pserver` and 'paddle trainer' startup parameters from environment variables, and then start up the processes.
-1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+```
+$ kubectl --kubeconfig=kubeconfig get nodes 
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
 
 
-###Start PaddlePaddle Training Demo on AWS
+### Setup Elastic File System for Cluster
 
-Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
+Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS.
 
-1. Build PaddlePaddle Docker image.
-1. Divide the training data file and upload it onto the EFS sharing volume.
-1. Create the training job yaml file, and start up the job.
-1. Check the result after training.
+1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)
+  1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below)
+  <center>![](src/worker_security_group.png)</center>
+  2. Add security group `paddle-efs` with `ALL TCP` inbound rule and custom source as group id of `paddle-cluster-sg-worker`. And VPC of `paddle-cluster-vpc`. Make sure availability zone is same as the one you used in [Initialize Assets](#initialize-assets).
+  <center>![](src/add_security_group.png)</center>
 
-####Build PaddlePaddle Docker Image
+2. Create the Elastic File System in [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with `paddle-cluster-vpc` VPC. Make sure subnet is `paddle-cluster-Subnet0` and security group is `paddle-efs`.
+<center>![](src/create_efs.png)</center>
 
-PaddlePaddle docker image need to provide the runtime environment for `paddle pserver` and `paddle train`, so the container use this image should have two main function:
 
-1. Copy the training data into container.
-1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+### Start PaddlePaddle Training Demo on AWS
 
+#### Configure Kubernetes Volume that Points to EFS
 
-Since official `paddledev/paddle:cpu-latest` have already included the PaddlePaddle binary, but lack of the above functionalities, so we will create the startup script based on this image, to achieve the work above. the detailed Dockerfile is as follows:
+First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision EFS volume.
 
+Save following snippet as `pv.yaml`
 ```
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER zjsxzong89@gmail.com
-
-COPY start.sh /root/
-COPY start_paddle.py /root/
-CMD ["bash"," -c","/root/start.sh"]
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
 ```
 
-At this point, we will copy our `start.sh` and `start_paddle.py` file into container, and then exec `start_paddle.py` script to start up the training, all the steps like assigning trainer_id, getting other nodes' ip are implemented in `start_paddle.py`.
-
-`start_paddle.py` will start parsing the parameters.
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`
 
+Run following command to create a persistent volume:
 ```
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
 ```
 
-And then using function `getPodList()` to query all the pod information from the job name through Kubernetes api server. When all the pods are in the running status, using `getIdMap(podlist)` to get the trainer_id.
+Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume.
 
+Save following snippet as `pvc.yaml`.
 ```
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
 ```
 
-In function `getIdMap(podlist)`, we use podlist to get the ip address for each pod and sort them, use the index as the trainer_id.
-
+Run following command to create a persistent volume claim:
 ```
-def getIdMap(podlist):
-    '''
-    generate tainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
 ```
 
-After getting `idMap`, we use function `startPaddle(idMap, train_args_dict)` to generate `paddle pserver` and `paddle train` start up parameters and then start up the processes.
+#### Prepare Training Data
 
-In function `startPaddle`, the most important work is to generate `paddle pserver` and `paddle train` start up parameters. For example, `paddle train` parameter parsing, we will get parameters like `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and get the `trainer_id` from `idMap`.
+We will now launch a kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volume that we just created.
 
+Save following snippet as `paddle-data-job.yaml`
 ```
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
-
-Use `docker build` to build toe Docker Image:
-
-```
-docker build -t your_repo/paddle:mypaddle .
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddledev/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
 ```
 
-And then push the built image onto docker registry.
-
+Run following command to launch the job:
 ```
-docker push  your_repo/paddle:mypaddle
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
 ```
 
-####Upload Training Data File
-
-Here we will use PaddlePaddle's official recommendation demo as the content for this training, we put the training data file into a directory named by job name, which located in EFS sharing volume, the tree structure for the directory looks like:
-
+Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1`
 ```
-efs
-└── paddle-cluster-job
-    ├── data
-    │   ├── 0
-    │   │
-    │   ├── 1
-    │   │
-    │   └── 2
-    ├── output
-    └── recommendation
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
 ```
 
-The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
-
-
-####Create Kubernetes Job
-
-Kubernetes use yaml file to describe job details, and then use command line tool to create the job in Kubernetes cluster.
-
-In yaml file, we describe the Docker image we use for this training, the node number we need to startup, the volume mounting information and all the necessary parameters we need for `paddle pserver` and `paddle train` processes.
+Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
 
-The yaml file content is as follows:
+#### Start Training
 
+Now we are ready to start paddle training job. Save following snippet as `paddle-cluster-job.yaml`
 ```
 apiVersion: batch/v1
 kind: Job
@@ -522,12 +540,12 @@ spec:
       name: paddle-cluster-job
     spec:
       volumes:
-      - name: jobpath
-        hostPath:
-          path: /home/admin/efs
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
       containers:
       - name: trainer
-        image: drinkcode/paddle:k8s-job
+        image: paddledev/paddle-tutorial:k8s_train
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -537,7 +555,7 @@ spec:
         - name: JOB_NAMESPACE
           value: default
         - name: TRAIN_CONFIG_DIR
-          value: recommendation
+          value: quick_start
         - name: CONF_PADDLE_NIC
           value: eth0
         - name: CONF_PADDLE_PORT
@@ -548,119 +566,124 @@ spec:
           value: "2"
         - name: CONF_PADDLE_GRADIENT_NUM
           value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
         volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath
+        - mountPath: "/home/jobpath"
+          name: efs
         ports:
-        - name: jobport
-          hostPort: 30001
-          containerPort: 30001
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
       restartPolicy: Never
-
 ```
 
-In yaml file, the metadata's name is the job's name. `parallelism, completions` means this job will simultaneously start up 3 PaddlePaddle nodes, and this job will be finished when there are 3 finished pods. For the data store volume, we declare the path jobpath, it mount the /home/admin/efs on host machine into the container with path /home/jobpath. So in container, the /home/jobpath actually stores the data onto EFS sharing volume.
-
-`env` field represents container's environment variables, we pass the PaddlePaddle parameters into containers by using the `env` field.
+`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods.
 
-`JOB_PATH` represents the sharing volume path, `JOB_NAME` represents job name, `TRAIN_CONFIG_DIR` represents the training data file directory, we can these three parameters to get the file path for this training.
+`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables.
 
-`CONF_PADDLE_NIC` represents `paddle pserver` process's `--nics` parameters, the NIC name.
+`ports` indicates that TCP port 7164 - 7167 are exposed for communication between `pserver` and trainer. Ports start continuously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse parameter updates to improve latency.
 
-`CONF_PADDLE_PORT` represents `paddle pserver` process's `--port` parameters, `CONF_PADDLE_PORTS_NUM` represents `--port_num` parameter.
-
-`CONF_PADDLE_PORTS_NUM_SPARSE` represents the sparse updated port number, `--ports_num_for_sparse` parameter.
+Run following command to launch the job.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
+```
 
-`CONF_PADDLE_GRADIENT_NUM` represents the training node number, `--num_gradient_servers` parameter.
+Inspect individual pods
 
-After we create the yaml file, we can use Kubernetes command line tool to create the job onto the cluster.
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
 
+Inspect individual console output
 ```
-kubectl create -f job.yaml
+kubectl --kubeconfig=kubeconfig logs -f POD_NAME
 ```
 
-After we execute the above command, Kubernetes will create 3 pods and then pull the PaddlePaddle image, then start up the containers for training.
+`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`).
 
+Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
 
+The details for starting `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
 
-####Check Training Results
+#### Inspect Training Output
 
-During the training, we can see the logs and models on EFS sharing volume, the output directory contains the training results. (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node)
+Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output.
 
+1. ssh Into Worker EC2 instance
 ```
-[root@paddle-kubernetes-node0 output]# tree -d
-.
-├── node_0
-│   ├── server.log
-│   └── train.log
-├── node_1
-│   ├── server.log
-│   └── train.log
-├── node_2
-......
-├── pass-00002
-│   ├── done
-│   ├── ___embedding_0__.w0
-│   ├── ___embedding_1__.w0
-......
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
 ```
 
-We can always check the container training status through logs, for example:
+`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance.
 
+2. Mount EFS
 ```
-[root@paddle-kubernetes-node0 node_0]# cat train.log
-I1116 09:10:17.123121    50 Util.cpp:155] commandline:
- /usr/local/bin/../opt/paddle/bin/paddle_trainer
-    --nics=eth0 --port=7164
-    --ports_num=2 --comment=paddle_process_by_paddle
-    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
-    --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0 
-    --log_period=50 --dot_period=10 --saving_period=1 
-    --local=0 --trainer_id=0
-    --save_dir=/home/jobpath/paddle-cluster-job/output
-I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
-I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
-[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
-[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
-I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
-I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
-I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
-I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
-I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
-I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
-I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
-I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
-I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
-I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
 ```
 
-It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2 core 8 GB EC2 machine (m3.large).
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`.
 
+Now folder `efs` will have structure similar to:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+    |   |-- pass-00001...
+```
+`server.log` contains log for `pserver`. `train.log` contains log for `trainer`. Model description and snapshot is stored in `pass-0000*`.
 
-###Kubernetes Cluster Tear Down
+### Kubernetes Cluster Tear Down
 
+#### Delete EFS
 
-If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stucked on following steps), and then use the following command:
+Go to [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volume that we created.
 
-```
-kube-aws destroy
-```
-It's an async call, it might take 5 min to tear down the whole cluster.
+#### Delete security group
 
-If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
+Go to [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`.
 
 
+#### Delete S3 Bucket
 
-## For Experts with Kubernetes and AWS
+Go to [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created.
 
-Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
+#### Destroy Cluster
+
+```
+kube-aws destroy
+```
 
-### Some Presumptions
+The command will return immediately, but it might take 5 min to tear down the whole cluster.
 
-* Instances run on CoreOS, the official IAM.
-* Kubernetes node use instance storage, no EBS get mounted. Etcd is running on additional node.
-* For networking, we use Flannel network at this moment, we will use Calico solution later on.
-* When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
+You can go to [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check destroy process.
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..4c3dc81ed38f239c1f4a83d22b49cf57b5d16a8b
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
@@ -0,0 +1,75 @@
+# Kubernetes 简介
+
+[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
+
+- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
+
+- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
+
+- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
+
+- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
+
+- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+
+- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
+
+## 部署Kubernetes集群
+
+Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出几种常见的部署方法：
+
+- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
+- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
+- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
+- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
+
+可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
+
+## 选择存储方案
+
+容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
+常见的可选存储服务包括：
+
+- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
+- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
+- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
+- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
+
+## 配置kubectl
+
+### 安装kubectl
+```
+# OS X
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
+
+# Linux
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
+
+# Windows
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
+```
+
+### 配置kubectl访问你的kubernetes集群
+
+编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
+```
+apiVersion: v1
+clusters:
+- cluster:
+    certificate-authority: /path/to/ca.crt  # CA used to verify the API server; omit for non-SSL access
+    server: https://[Master-IP]:443  # replace [Master-IP] with the address of your master
+  name: minikube
+contexts:
+- context:
+    cluster: minikube
+    user: minikube
+  name: minikube
+current-context: minikube
+kind: Config
+preferences: {}
+users:
+- name: minikube
+  user:
+    client-certificate: /path/to/apiserver.crt  # client credentials; omit for non-SSL access
+    client-key: /path/to/apiserver.key
+```
diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
index b63b8437a0114a0165971933912da83c2dd770a6..3121b3f59df650c0a22d0bd305a6f793b202d30e 100644
--- a/doc/howto/usage/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -2,179 +2,96 @@
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
-## Kubernetes 基本概念
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、 扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 是Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
-
-- [*Namespaces*](http://kubernetes.io/docs/user-guide/volumes/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
 
 ## 整体方案
 
-### 部署Kubernetes集群
-
-首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
+在训练之前，用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统，需要使用其指定的方式挂载并导入数据)，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
 ![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
-上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
-
-### 使用 Job
-
-我们使用Kubernetes中的job这个概念来代表一次分布式训练。Job表示一次性作业，在作业完成后，Kubernetes会销毁job产生的容器并且释放相关资源。
-
-在Kubernetes中，可以通过编写一个YAML文件，来描述这个job，在这个文件中，主要包含了一些配置信息，例如PaddlePaddle的节点个数，`paddle pserver`开放的端口个数与端口号，使用的网卡设备等，这些信息通过环境变量的形式传递给容器内的程序使用。
-
-在一次分布式训练中，用户确定好本次训练需要的PaddlePaddle节点个数，将切分好的训练数据与配置文件上传到MFS共享目录中。然后编写这次训练的job YAML文件，提交给Kubernetes集群创建并开始作业。
-
-### 创建PaddlePaddle节点
+上图描述了一个3节点的分布式训练场景，在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
-当Kubernetes master收到请求，解析完YAML文件后，会创建出多个pod(个数为PaddlePaddle节点数)，Kubernetes会把这些pod调度到集群的node上运行。一个pod就代表一个PaddlePaddle节点，当pod被成功分配到一台物理/虚拟机上后，Kubernetes会启动pod内的容器，这个容器会根据YAML文件中的环境变量，启动`paddle pserver`与`paddle train`进程。
+根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，按照下面步骤即可：
 
-### 启动训练
-
-在容器启动后，会通过脚本来启动这次分布式训练，我们知道`paddle train`进程启动时需要知道其他节点的IP地址以及本节点的trainer_id，由于PaddlePaddle本身不提供类似服务发现的功能，所以在本文的启动脚本中，每个节点会根据job name向Kubernetes apiserver查询这个job对应的所有pod信息(Kubernetes默认会在每个容器的环境变量中写入apiserver的地址)。
-
-根据这些pod信息，就可以通过某种方式，为每个pod分配一个唯一的trainer_id。本文把所有pod的IP地址进行排序，将顺序作为每个PaddlePaddle节点的trainer_id。启动脚本的工作流程大致如下：
-
-  1. 查询Kubernetes apiserver获取pod信息，根据IP分配trainer_id
-  1. 从MFS共享目录中拷贝训练文件到容器内
-  1. 根据环境变量，解析出`paddle pserver`与`paddle train`的启动参数，启动进程
-  1. 训练时，PaddlePaddle会自动将结果保存在trainer_id为0的节点上，将输出路径设置为MFS目录，保存输出的文件
-
-
-## 搭建过程
-
-根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，主要分为以下几个步骤：
-
-1. 制作PaddlePaddle镜像
-1. 将训练文件与切分好的数据上传到共享存储
-1. 编写本次训练的YAML文件，创建一个Kubernetes job
-1. 训练结束后查看输出结果
+1. [制作PaddlePaddle镜像](#制作镜像)
+1. [将训练文件与切分好的数据上传到共享存储](#准备训练数据)
+1. [编写本次训练的YAML文件，创建一个Kubernetes job](#创建Job)
+1. [训练结束后查看输出结果](#查看输出)
 
 下面就根据这几个步骤分别介绍。
 
-
 ### 制作镜像
 
 PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境，用这个镜像创建的容器需要有以下两个功能：
 
 - 拷贝训练文件到容器内
-
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
 
-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。镜像的*Dockerfile*如下：
-
-```Dockerfile
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER zjsxzong89@gmail.com
+因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/k8s/src/k8s_train/Dockerfile)。
 
-COPY start.sh /root/
-COPY start_paddle.py /root/
-CMD ["bash"," -c","/root/start.sh"]
-```
-
-[start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start.sh)文件拷贝训练文件到容器内，然后执行[start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
-
-`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
-
-```python
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
-```
-
-然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器运行都运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
-
-```python
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
 ```
 
-在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+然后将构建成功的镜像上传到镜像仓库。
 
-```python
-def getIdMap(podlist):
-    '''
-    generate tainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
 ```
 
-在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
 
-在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+### 准备训练数据
 
-```python
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
+这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据，也可以通过修改[k8s_data](./src/k8s_data/README.md)的内容来定制image。
 
-使用 `docker build` 构建镜像：
+在启动Job之前，需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下.
 
-```bash
-docker build -t your_repo/paddle:mypaddle .
-```
-
-然后将构建成功的镜像上传到镜像仓库。
-
-```bash
-docker push  your_repo/paddle:mypaddle
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: paddle-data
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddledev/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"  # the shared volume is visible inside the container at this path
+          name: nfs
+        env:
+        - name: OUT_DIR  # must live under the volume's mountPath so the data outlives the pod
+          value: /mnt/paddle-cluster-job
+        - name: SPLIT_COUNT  # number of shards to cut train.txt into; quoted because env values are strings
+          value: "3"
+      volumes:
+      - name: nfs
+        persistentVolumeClaim:
+          claimName: mfs
+      restartPolicy: Never
 ```
 
-### 上传训练文件
-
-本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到MFS共享存储。完成后MFS上的文件内容大致如下：
-
-```bash
-[root@paddle-kubernetes-node0 mfs]# tree -d
+完成后volume中的文件内容大致如下：
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
 .
-└── paddle-cluster-job
-    ├── data
-    │   ├── 0
-    │   │
-    │   ├── 1
-    │   │
-    │   └── 2
-    ├── output
-    └── recommendation
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
 ```
 
 目录中paddle-cluster-job是本次训练对应的job name，本次训练要求有3个PaddlePaddle节点，在paddle-cluster-job/data目录中存放切分好的数据，文件夹0，1，2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件，output文件夹存放训练结果与日志。
@@ -203,7 +120,7 @@ spec:
           path: /home/work/mfs
       containers:
       - name: trainer
-        image: your_repo/paddle:mypaddle
+        image: [YOUR_REPO]/paddle:mypaddle
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -234,15 +151,18 @@ spec:
 
 `env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
 
-`JOB_PATH`表示共享存储挂载的路径，`JOB_NAME`表示job名字，`TRAIN_CONFIG_DIR`表示本次训练文件所在目录，这三个变量组合就可以找到本次训练需要的文件路径。
-
-`CONF_PADDLE_NIC`表示`paddle pserver`进程需要的`--nics`参数，即网卡名
+环境变量 | 说明
+--- | ---
+JOB_PATH | 共享存储挂载的路径
+JOB_NAME | Job的名字
+TRAIN_CONFIG_DIR | 本次训练文件所在目录，与JOB_PATH、JOB_NAME组合可以找到本次训练需要的文件路径
+CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数，即网卡名
+CONF_PADDLE_PORT | `paddle pserver`的`--port`参数
+CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量，即`--ports_num`参数
+CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+CONF_PADDLE_GRADIENT_NUM | 训练节点数量，即`--num_gradient_servers`参数
 
-`CONF_PADDLE_PORT`表示`paddle pserver`的`--port`参数，`CONF_PADDLE_PORTS_NUM`则表示稠密更新的端口数量，也就是`--ports_num`参数。
-
-`CONF_PADDLE_PORTS_NUM_SPARSE`表示稀疏更新的端口数量，也就是`--ports_num_for_sparse`参数。
-
-`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量，即`--num_gradient_servers`参数
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
 
 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
 
@@ -285,15 +205,15 @@ I1116 09:10:17.123121    50 Util.cpp:155] commandline:
     --ports_num=2 --comment=paddle_process_by_paddle
     --pservers=192.168.129.66,192.168.223.143,192.168.129.71
     --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0 
-    --log_period=50 --dot_period=10 --saving_period=1 
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
     --local=0 --trainer_id=0
     --save_dir=/home/jobpath/paddle-cluster-job/output
 I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
 I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
 [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
 [INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
-[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__]
 I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
 I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
 I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
@@ -306,3 +226,90 @@ I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:
 I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
 I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
 ```
+
+
+## 一些细节的补充
+
+### 使用环境变量
+
+使用容器方式运行训练任务的Kubernetes Job，通常会使用环境变量配置Job的配置信息。`start_paddle.py`提供了一个启动脚本，将环境变量转换成paddle的命令行参数：
+```
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Pod间通信
+`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器都已运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+* *注意*: `getPodList()`会获取当前namespace下`job-name`标签与本次job名字相同的所有pod，如果已经有同名job的pod在运行，可能会导致出错。这种集群节点管理方式会在将来使用[StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。
+
+在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+### 启动任务
+
+在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
index 50eed4c6573a18d6ae0f9df9bd6a3cae05493e3c..bd34f46c9b0ada7027fd53e553e7d033255d25fc 100644
Binary files a/doc/howto/usage/k8s/src/add_security_group.png and b/doc/howto/usage/k8s/src/add_security_group.png differ
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
index f4d448d1518e11a11d535efb9c3a78b56cc13149..e5f1526033d1daf401700989af1d25919bcb7675 100644
Binary files a/doc/howto/usage/k8s/src/create_efs.png and b/doc/howto/usage/k8s/src/create_efs.png differ
diff --git a/doc/howto/usage/k8s/src/job.yaml b/doc/howto/usage/k8s/src/job.yaml
deleted file mode 100644
index 488aad0bede4f940b25c7be04259f209c3de9f52..0000000000000000000000000000000000000000
--- a/doc/howto/usage/k8s/src/job.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-cluster-job
-spec:
-  parallelism: 3
-  completions: 3
-  template:
-    metadata:
-      name: paddle-cluster-job
-    spec:
-      volumes:
-      - name: jobpath
-        hostPath: 
-          path: /home/work/paddle_output              
-      containers:
-      - name: trainer
-        image: registry.baidu.com/public/paddle:mypaddle
-        command: ["bin/bash",  "-c", "/root/start.sh"]        
-        env:
-        - name: JOB_NAME
-          value: paddle-cluster-job
-        - name: JOB_PATH
-          value: /home/jobpath     
-        - name: JOB_NAMESPACE
-          value: default         
-        - name: TRAIN_CONFIG_DIR
-          value: recommendation
-        - name: CONF_PADDLE_NIC
-          value: eth0  
-        - name: CONF_PADDLE_PORT
-          value: "7164"
-        - name: CONF_PADDLE_PORTS_NUM
-          value: "2"     
-        - name: CONF_PADDLE_PORTS_NUM_SPARSE
-          value: "2"  
-        - name: CONF_PADDLE_GRADIENT_NUM
-          value: "3"                                                               
-        volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath       
-      restartPolicy: Never
-    
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
index a8c64550b1fa7f41de1eaa9a037c65cddc0cd30e..2183a232ad402b76f82a67234a5c93e13ce97ac3 100644
Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and b/doc/howto/usage/k8s/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6d3a12ae393aa594b8e6e9a5f726109426937284
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
@@ -0,0 +1,13 @@
+# Data-preparation image: bundles the quick_start demo and get_data.sh, which
+# downloads the preprocessed data set and splits it into shards.
+FROM alpine
+
+# get_data.sh calls GNU `split -d --number=...`; coreutils is installed to
+# provide it (the BusyBox split in the base image is presumably insufficient).
+# --no-cache keeps the apk index out of the image layer.
+RUN apk upgrade --no-cache && apk add --no-cache coreutils
+# COPY is preferred over ADD for plain local files and directories.
+COPY quick_start /quick_start
+COPY get_data.sh /bin/
+RUN chmod +x /bin/get_data.sh
+ENTRYPOINT ["/bin/get_data.sh"]
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/k8s/src/k8s_data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..83cef7affd0ac4d3a1ca08ea5b046fa81e1bc630
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/README.md
@@ -0,0 +1,6 @@
+To build PaddlePaddle data preparation image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following commands:
+
+```
+cp -r ../../../../../../demo/quick_start .
+docker build . -t prepare-data-image-name
+```
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d187ba5ac8d03f69dfdefd4f63610ed7921575be
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
@@ -0,0 +1,40 @@
+#!/bin/sh
+#
+# Download the preprocessed quick_start data set and shard it for
+# distributed training.
+#
+# Environment:
+#   OUT_DIR      directory that receives the quick_start config and the shards
+#   SPLIT_COUNT  number of pieces train.txt is split into
+#
+# Resulting layout: $OUT_DIR/quick_start plus $OUT_DIR/<i>/data for
+# i = 0 .. SPLIT_COUNT-1, each with its own slice as train.txt.
+
+set -eu
+
+out_dir=${OUT_DIR:?OUT_DIR must be set}
+split_count=${SPLIT_COUNT:?SPLIT_COUNT must be set}
+
+mkdir -p "$out_dir"
+cp -r /quick_start "$out_dir/"
+
+mkdir -p "$out_dir/0/data"
+cd "$out_dir/0/data"
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
+tar zxvf preprocessed_data.tar.gz
+rm preprocessed_data.tar.gz
+
+# Split train.txt on line boundaries into train.00000, train.00001, ...;
+# shard 0 keeps the first piece as its train.txt.
+split -d --number="l/$split_count" -a 5 train.txt train.
+mv train.00000 train.txt
+
+cd "$out_dir"
+# Use shell arithmetic rather than `expr`: expr exits with status 1 when the
+# result is 0, which would abort the script under `set -e` for SPLIT_COUNT=1.
+end=$((split_count - 1))
+for i in $(seq 1 "$end"); do
+    mkdir -p "$i/data"
+    cp -r 0/data/* "$i/data"
+    mv "$i/data/train.$(printf %05d "$i")" "$i/data/train.txt"
+done
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c0fca1f9a945921e6e8899fee2db8845e66136a1
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
@@ -0,0 +1,9 @@
+# Training image: the official PaddlePaddle CPU image plus the scripts that
+# launch a distributed training job.
+FROM paddledev/paddle:cpu-latest
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+RUN chmod +x /root/start.sh
+# Exec-form arguments are passed verbatim, so the flag must be exactly "-c".
+CMD ["bash", "-c", "/root/start.sh"]
diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/k8s/src/k8s_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..96bf65497ffa23e90c4c9350504f86367b48daf2
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_train/README.md
@@ -0,0 +1,5 @@
+To build PaddlePaddle training image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following command:
+
+```
+docker build . -t train-image-name
+```
diff --git a/doc/howto/usage/k8s/src/start.sh b/doc/howto/usage/k8s/src/k8s_train/start.sh
similarity index 55%
rename from doc/howto/usage/k8s/src/start.sh
rename to doc/howto/usage/k8s/src/k8s_train/start.sh
index b3a1334174a20b018d35de3b01b149fc5b10d49d..12dfe1e6386885a6989d3887f21c6922f137a9ae 100755
--- a/doc/howto/usage/k8s/src/start.sh
+++ b/doc/howto/usage/k8s/src/k8s_train/start.sh
@@ -1,19 +1,21 @@
 #!/bin/sh
+
 set -eu
 
 jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
 cd /root
-cp -rf $jobconfig .
-cd $TRAIN_CONFIG_DIR
-
+cp -rf "$jobconfig"/* .
 
+# TRAINER_COUNT is optional: under `set -u` an unset variable would abort the
+# script, so fall back to 4, the value this script used to hard-code.
 python /root/start_paddle.py \
   --dot_period=10 \
-  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
+  --ports_num=$CONF_PADDLE_PORTS_NUM \
+  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
   --log_period=50 \
   --num_passes=10 \
-  --trainer_count=4 \
+  --trainer_count=${TRAINER_COUNT:-4} \
   --saving_period=1 \
   --local=0 \
-  --config=./trainer_config.py \
+  --config=trainer_config.lr.py \
   --use_gpu=0
diff --git a/doc/howto/usage/k8s/src/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
similarity index 83%
rename from doc/howto/usage/k8s/src/start_paddle.py
rename to doc/howto/usage/k8s/src/k8s_train/start_paddle.py
index df00d82919faa2acecc79c28e3d773ba3de9672a..935c12bb67e1fe08bc135a7a2220fcd43c548482 100755
--- a/doc/howto/usage/k8s/src/start_paddle.py
+++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
@@ -23,7 +23,6 @@ import argparse
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
 JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
-JOB_PATH_DATA = JOB_PATH + "/data"
 JOB_PATH_OUTPUT = JOB_PATH + "/output"
 JOBNAME = os.getenv("JOB_NAME")
 NAMESPACE = os.getenv("JOB_NAMESPACE")
@@ -33,6 +32,8 @@ PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
 PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
 PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
 
+tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
+
 
 def refine_unknown_args(cmd_args):
     '''
@@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
     for pod in podlist["items"]:
         if pod["status"]["phase"] == "Running":
             running += 1
+    print "waiting for pods running, require:", require, "running:", running
     if require == running:
         return True
     return False
@@ -79,8 +81,17 @@ def getPodList():
 
     pod = API + NAMESPACE + "/pods?"
     job = JOBNAME
-    return requests.get(apiserver + pod + JOBSELECTOR + job,
-                        verify=False).json()
+    if os.path.isfile(tokenpath):
+        tokenfile = open(tokenpath, mode='r')
+        token = tokenfile.read()
+        Bearer = "Bearer " + token
+        headers = {"Authorization": Bearer}
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            headers=headers,
+                            verify=False).json()
+    else:
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            verify=False).json()
 
 
 def getIdMap(podlist):
@@ -121,9 +132,10 @@ def startPaddle(idMap={}, train_args_dict=None):
     logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
     if not os.path.exists(JOB_PATH_OUTPUT):
         os.makedirs(JOB_PATH_OUTPUT)
-    os.mkdir(logDir)
-    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
-        "/" + str(trainerId) + " ./data"
+    if not os.path.exists(logDir):
+        os.mkdir(logDir)
+    copyCommand = 'cp -rf ' + JOB_PATH + \
+        "/" + str(trainerId) + "/data/*" + " ./data/"
     os.system(copyCommand)
     startPserver = 'nohup paddle pserver' + \
         " --port=" + str(PADDLE_PORT) + \
@@ -136,9 +148,9 @@ def startPaddle(idMap={}, train_args_dict=None):
     print startPserver
     os.system(startPserver)
     # wait until pservers completely start
-    time.sleep(10)
-    startTrainer = program + args + " > " + \
-        logDir + "/train.log 2>&1 < /dev/null"
+    time.sleep(20)
+    startTrainer = program + args + " 2>&1 | tee " + \
+        logDir + "/train.log"
     print startTrainer
     os.system(startTrainer)
 
@@ -152,7 +164,7 @@ if __name__ == '__main__':
     podlist = getPodList()
     # need to wait until all pods are running
     while not isPodAllRunning(podlist):
-        time.sleep(10)
+        time.sleep(20)
         podlist = getPodList()
     idMap = getIdMap(podlist)
     startPaddle(idMap, train_args_dict)
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/k8s/src/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/howto/usage/k8s/src/pserver_and_trainer.png differ
diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/k8s/src/route53_create_recordset.png
new file mode 100644
index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35
Binary files /dev/null and b/doc/howto/usage/k8s/src/route53_create_recordset.png differ
diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/k8s/src/route53_create_zone.png
new file mode 100644
index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af
Binary files /dev/null and b/doc/howto/usage/k8s/src/route53_create_zone.png differ
diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/k8s/src/worker_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5
Binary files /dev/null and b/doc/howto/usage/k8s/src/worker_security_group.png differ
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
index 460fedb5658a8ea9bbe8b602ee2b5df66502fa62..9279bac7f4b2898c18979630a8d6dfcb2dba70e0 100644
--- a/doc/index_cn.rst
+++ b/doc/index_cn.rst
@@ -5,7 +5,6 @@ PaddlePaddle 文档
   :maxdepth: 1
 
   getstarted/index_cn.rst
-  tutorials/index_cn.md
   howto/index_cn.rst
   api/index_cn.rst
   faq/index_cn.rst
diff --git a/doc/index_en.rst b/doc/index_en.rst
index 1d9cca7de720ebc23fe816f32d158930d91c07e7..168c7667c61da677905585d6c4b5037ce80b3765 100644
--- a/doc/index_en.rst
+++ b/doc/index_en.rst
@@ -5,8 +5,6 @@ PaddlePaddle Documentation
   :maxdepth: 1
 
   getstarted/index_en.rst
-  tutorials/index_en.md
   howto/index_en.rst
   api/index_en.rst
   about/index_en.rst
- 
\ No newline at end of file
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 418d718fbd9c61bff3acb9c2dab0638c0b650bab..95cad835b11816f4d2e256c2abd662a545a5bad2 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -15,13 +15,19 @@ import sys
 import os, subprocess
 import shlex
 from recommonmark import parser, transform
+try:
+   import py_paddle
+   import paddle
+   import paddle.v2
+except ImportError:
+   print("Must install paddle python package before generating documentation")
+   sys.exit(1)
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, '@PROJ_ROOT@/python')
 templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
@@ -49,6 +55,7 @@ extensions = [
     'sphinx.ext.napoleon',
     'sphinx.ext.graphviz'
 ]
+mathjax_path="https://cdn.bootcss.com/mathjax/2.7.0/MathJax.js"
 table_styling_embed_css = True
 
 autodoc_member_order = 'bysource'
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e96c25cb75bee20d2e2949423d80ddab1d3450a1..b477f0120c4fa0544012080b7cfb8572d3c44b04 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -15,14 +15,20 @@ import sys
 import os, subprocess
 import shlex
 from recommonmark import parser, transform
+try:
+   import py_paddle
+   import paddle
+   import paddle.v2
+except ImportError:
+   print("Must install paddle python package before generating documentation")
+   sys.exit(1)
+
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, '@PROJ_ROOT@/python')
-
 templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
index fe800308d8d7a03619ec8e13fd8dc4aa7a8ed8be..2b4a79fbbfc0c4af74aa73c540919f5d9cf2635b 100644
--- a/doc/tutorials/embedding_model/index_cn.md
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -6,9 +6,10 @@
 
 ## 介绍 ###
 ### 中文字典 ###
-我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206325个词和3个特殊标记：
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206326个词和4个特殊标记：
   - `<s>`: 分词序列的开始
   - `<e>`: 分词序列的结束
+  - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: 占位符，没有实际意义
   - `<unk>`: 未知词
 
 ### 中文词向量的预训练模型 ###
diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/tutorials/embedding_model/index_en.md
index d793a50f488e464bcd90a2fb506a8dcc3c760433..9525f64f9b5384c8e44690fb0887fb2293108e0a 100644
--- a/doc/tutorials/embedding_model/index_en.md
+++ b/doc/tutorials/embedding_model/index_en.md
@@ -6,9 +6,10 @@ We thank @lipeng for the pull request that defined the model schemas and pretrai
 
 ## Introduction ###
 ### Chinese Word Dictionary ###
-Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《"，"红楼梦"，"》"，and "《红楼梦》". Our dictionary (using UTF-8 format) has has two columns: word and its frequency. The total word count is 3206325, including 3 special token:
+Our Chinese-word dictionary is created on Baidu ZhiDao and Baidu Baike by using in-house word segmentor. For example, the participle of "《红楼梦》" is "《"，"红楼梦"，"》"，and "《红楼梦》". Our dictionary (using UTF-8 format) has two columns: word and its frequency. The total word count is 3206326, including 4 special tokens:
   - `<s>`: the start of a sequence
   - `<e>`: the end of a sequence
+  - `PALCEHOLDER_JUST_IGNORE_THE_EMBEDDING`: a placeholder, just ignore it and its embedding
   - `<unk>`: a word not included in dictionary
 
 ### Pretrained Chinese Word Embedding Model ###
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
index 70dec2eb2a8c397bc56b1e6f52a624a3a6877905..ca110431cf921ae0480d3fb2b17c58f90a84cc0e 100644
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -156,14 +156,14 @@ define_py_data_sources2(train_list='data/train.list',
                         obj="process",
                         args={"dictionary": word_dict})
 ```
-You can refer to the following link for more detailed examples and data formats: <a href = "../../api/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
+You can refer to the following link for more detailed examples and data formats: <a href = "../../api/v1/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
 
 ## Network Architecture
 We will describe four kinds of network architectures in this section.
 <center> ![](./src/PipelineNetwork_en.jpg) </center>
 
 First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
-For more detailed documentation, you could refer to: <a href = "../../api/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
+For more detailed documentation, you could refer to: <a href = "../../api/v1/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
 
 ### Logistic Regression
 The architecture is illustrated in the following picture:
@@ -366,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification pro
 <br>
 
 ## Optimization Algorithm
-<a href = "../../api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
+<a href = "../../api/v1/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
 
 ```python
 settings(batch_size=128,
@@ -407,7 +407,7 @@ paddle train \
 --init_model_path=./output/pass-0000x
 ```
 
-We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to <a href = "../../api/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
+We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to <a href = "../../api/v1/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
 
 inference script (predict.sh)：
 
diff --git a/doc_theme/static/css/override.css b/doc_theme/static/css/override.css
index 438a87848a0176a7857177aeb672c59f35bd8d4b..09ecff688b9a2dae3d834572217922640c529c5e 100644
--- a/doc_theme/static/css/override.css
+++ b/doc_theme/static/css/override.css
@@ -1,3 +1,6 @@
+* {
+    font-family:"Roboto","Lato","proxima-nova","Helvetica Neue",Arial,sans-serif;
+}
 body {
     padding-top: 80px;
     background-image: none !important;
diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html
index 034740369ed10a748856e2205d3315f51a7de62f..65e61c5f298e19adc6330c378779a6edf418752e 100644
--- a/doc_theme/templates/layout.html
+++ b/doc_theme/templates/layout.html
@@ -114,10 +114,7 @@
           </ul>
         </div>
         <ul class="site-page-links">
-          <li><a>Home</a></li>
-          <li><a>Get Started</a></li>
-          <li class="active"><a>Documentation</a></li>
-          <li><a>About Us</a></li>
+          <li><a href="/">Home</a></li>
         </ul>
       </div>
       <div class="doc-module">
@@ -137,7 +134,7 @@
           {{ toctree }}
         {% endblock %}
     </nav>
-    {% if toc %}
+    {% if False %}
     <nav class="local-toc">{{ toc }}</nav>
     {% endif %}
     <section class="doc-content-wrap">
@@ -168,7 +165,8 @@
             VERSION:'{{ release|e }}',
             COLLAPSE_INDEX:false,
             FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}',
-            HAS_SOURCE:  {{ has_source|lower }}
+            HAS_SOURCE:  {{ has_source|lower }},
+            SOURCELINK_SUFFIX: ".txt",
         };
     </script>
     {%- for scriptfile in script_files %}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 503024cff338dac42a6a8a32463472dc6b6451d9..c6fd9cc54ae3a671c5bdcf54cbaa873c59280694 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -9,13 +9,20 @@ add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
 
-configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
-    ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
+find_package(Boost QUIET)  # package name is case-sensitive; "Boost" is what sets Boost_FOUND tested below
 
-if(WITH_PREDICT_SDK)
-    add_subdirectory(predict)
+if(Boost_FOUND)
+  include_directories(${Boost_INCLUDE_DIRS})
+  include_directories(${CMAKE_CURRENT_SOURCE_DIR})
+  add_subdirectory(majel)
+endif()
+
+if(WITH_C_API)
+    add_subdirectory(capi)
 endif()
 
 if(WITH_SWIG_PY)
+  configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
+          ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
   add_subdirectory(api)
 endif()
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 0cafbd896e2d88aee4406bd0305878ce489bc18d..d49b189e253f7a0792fe3f1fe7c8fdbb7071acd4 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
   return args;
 }
 
+Arguments* Arguments::createByPaddleArgument(const void* ptr) {
+  auto p = (paddle::Argument*)(ptr);
+  auto args = new Arguments();
+  args->m->outputs.push_back(*p);
+  return args;
+}
+
 Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return Matrix::createByPaddleMatrixPtr(&a.value);
@@ -137,6 +144,8 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
   a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
 }
 
+float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
+
 int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return a.getBatchSize();
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 3ac50e34bb434b14d346f1c4707084f93461284d..1cec77c0cae6ffbf7a1ca22092e8e41a6f9f0fc5 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -1,21 +1,3 @@
-FUNCTION(generate_python_api target_name)
-    ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
-        COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
-                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-                ${external_project_dependencies}
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-        COMMENT "Generate Python API from swig")
-    ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
-                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                ${PROJ_ROOT}/paddle/Paddle_wrap.h
-                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                ${external_project_dependencies})
-ENDFUNCTION(generate_python_api)
-
 set(API_SOURCES
     Arguments.cpp
     ConfigParser.cpp
@@ -33,78 +15,101 @@ set(API_HEADER
     PaddleAPI.h
     Internal.h)
 
-add_library(paddle_api STATIC
-        ${API_SOURCES})
+add_library(paddle_api STATIC ${API_SOURCES})
 add_dependencies(paddle_api gen_proto_cpp)
 
-list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
+INCLUDE(${SWIG_USE_FILE})
+INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
 
-if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
-# Because gflags compiled by cmake, so it is imported by cmake target,
-# not a real library path. Get the real library path here.
-message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
-get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
-message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
-else()
-set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
-endif()
+FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
+
+SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 
-configure_file(
-    paddle_api_config.py.in
-    ${PROJ_ROOT}/paddle/api/paddle_api_config.py
+SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
+SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall")
+IF(WITH_COVERAGE)
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+ENDIF(WITH_COVERAGE)
+
+SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
+    paddle_parameter
+    paddle_function
+    paddle_math
+    paddle_utils
+    paddle_gserver
+    paddle_pserver
+    paddle_api
+    paddle_cuda
+    paddle_trainer_lib
+    paddle_network
+    paddle_proto
+    ${external_project_dependencies}
 )
 
-generate_python_api(python_swig_sources)
+IF(APPLE)
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
+ELSE(APPLE)
+    SET(START_GROUP "-Xlinker -start-group")
+    SET(END_GROUP "-Xlinker -end-group")
+    SET(ARCHIVE_START "-Wl,--whole-archive")
+    SET(ARCHIVE_END "-Wl,--no-whole-archive")
+ENDIF(APPLE)
 
-file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
+SWIG_ADD_MODULE(swig_paddle python Paddle.i)
+SWIG_LINK_LIBRARIES(swig_paddle
+    ${MACOS_LD_FLAGS}
+    ${START_GROUP}  # opens the linker group (set only on non-APPLE)
+    ${ARCHIVE_START}  # whole-archive region: gserver, function, metric libs
+    paddle_gserver
+    paddle_function
+    ${METRIC_LIBS}
+    ${ARCHIVE_END}
+    paddle_pserver
+    paddle_trainer_lib
+    paddle_network
+    paddle_parameter
+    paddle_math
+    paddle_utils
+    paddle_proto
+    paddle_cuda
+    paddle_api
+    ${CMAKE_DL_LIBS}
+    ${EXTERNAL_LIBS}
+    ${CMAKE_THREAD_LIBS_INIT}
+    ${END_GROUP}  # closes START_GROUP; was the undefined variable START_END
+)
 
-# TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
+add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
     COMMAND rm -rf py_paddle.egg-info build
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-    DEPENDS python_swig_sources
-            paddle_parameter
-            paddle_function
-            paddle_math
-            paddle_utils
-            paddle_gserver
-            paddle_pserver
-            paddle_trainer
-            paddle_api
-            paddle_cuda
-        ${PY_PADDLE_PYTHON_FILES}
+    DEPENDS _swig_paddle
 )
 
-install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
+# TODO(yuyang18) : make wheel name calculated by cmake
+add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
+
+install(DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/dist/
     DESTINATION opt/paddle/share/wheels
 )
 
-add_custom_target(python_api_wheel ALL DEPENDS
-  ${PROJ_ROOT}/paddle/dist/.timestamp)
-add_dependencies(python_api_wheel python_swig_sources
-  paddle_parameter
-  paddle_math
-  paddle_utils
-  paddle_gserver
-  paddle_pserver
-  paddle_trainer
-  paddle_api
-  paddle_cuda)
-
 if(WITH_TESTING)
-    SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
-    ExternalProject_Add(pip
-        ${EXTERNAL_PROJECT_LOG_ARGS}
-        GIT_REPOSITORY      https://github.com/pypa/pip.git
-        GIT_TAG             9.0.1
-        PREFIX              ${PIP_SOURCES_DIR}
-        CONFIGURE_COMMAND   ""
-        BUILD_COMMAND       ""
-        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
-        BUILD_IN_SOURCE     1
-        DEPENDS python setuptools python_api_wheel
-    )
+    IF(NOT PY_PIP_FOUND)
+        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
+        ExternalProject_Add(pip
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY      https://github.com/pypa/pip.git
+            GIT_TAG             9.0.1
+            PREFIX              ${PIP_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+            BUILD_IN_SOURCE     1
+            DEPENDS python setuptools python_api_wheel
+        )
+    ENDIF()
     add_subdirectory(test)
 endif()
diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp
index c30e09876397e37ef9ed4ec3200d1aa372ceb609..681e3a380912339c531c16c88f43255c2f34c32f 100644
--- a/paddle/api/Evaluator.cpp
+++ b/paddle/api/Evaluator.cpp
@@ -27,3 +27,18 @@ std::string Evaluator::toString() {
   m->rawPtr->printStats(sout);
   return sout.str();
 }
+
+std::vector<std::string> Evaluator::getNames() const {
+  std::vector<std::string> retv;
+  m->rawPtr->getNames(&retv);
+  return retv;
+}
+
+double Evaluator::getValue(const std::string name) const {
+  paddle::Error err;
+  double v = m->rawPtr->getValue(name, &err);
+  if (err) {
+    throw std::runtime_error(err.msg());
+  }
+  return v;
+}
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index 66115f8293b905809639afff779abfdb2bb3a54e..dcb5fe086fdccf8ec62ee52cbaaac4b7dbbe2f9d 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -142,14 +142,28 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
   }
 }
 
+size_t GradientMachine::getNonStaticParameterSize() const {
+  return m->machine->getNonStaticParameters().size();
+}
+
+Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
+  auto params = m->machine->getNonStaticParameters();
+  if (i < params.size()) {
+    return Parameter::createFromSharedPtr(
+        &m->machine->getNonStaticParameters()[i]);
+  } else {
+    throw RangeError();
+  }
+}
+
 void GradientMachine::randParameters() { m->machine->randParameters(); }
 
-Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
+Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
     throw(UnsupportError) {
-  auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
+  auto nn = m->machine;
   if (nn) {
-    auto mat = nn->getLayerOutput(layerName);
-    return Matrix::createByPaddleMatrixPtr(&mat);
+    auto arg = nn->getLayerOutput(layerName);
+    return Arguments::createByPaddleArgument(&arg);
   } else {
     throw UnsupportError();
   }
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.i
similarity index 100%
rename from paddle/api/Paddle.swig
rename to paddle/api/Paddle.i
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 364d19f9414430709108824dce75a1007332d824..d51204012171c9887acd5f578f913143182efe36 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 
@@ -47,6 +48,9 @@ void setUseGpu(bool useGpu);
 /// Return true if this py_paddle is compiled in GPU Version
 bool isGpuVersion();
 
+/// Return FLAGS_trainer_count
+int getTrainerCount();
+
 /// The Error of IO Operation. Such as file not found, etc.
 class IOError {};
 
@@ -450,8 +454,11 @@ public:
                                         IVector* vec) throw(RangeError);
   void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
 
+  float sum() const;
+
 private:
   static Arguments* createByPaddleArgumentVector(void* ptr);
+  static Arguments* createByPaddleArgument(const void* ptr);
   void* getInternalArgumentsPtr() const;
 
 private:
@@ -462,8 +469,10 @@ private:
 };
 
 enum GradientMatchineCreateMode {
-  CREATE_MODE_NORMAL = 0,
-  CREATE_MODE_TESTING = 4
+  CREATE_MODE_NORMAL = paddle::GradientMachine::kNormal,
+  CREATE_MODE_SGD_SPARSE_CPU_TRAINING =
+      paddle::GradientMachine::kSgdSparseCpuTraining,
+  CREATE_MODE_TESTING = paddle::GradientMachine::kTesting
 };
 
 struct ParameterConfigPrivate;
@@ -546,6 +555,10 @@ public:
   ParameterConfig* getConfig();
   void setValueUpdated();
 
+  bool save(const std::string& filename) const;
+
+  bool load(const std::string& filename) const;
+
   size_t getSize() const;
 
 private:
@@ -761,9 +774,12 @@ public:
   size_t getParameterSize() const;
   Parameter* getParameter(size_t i) throw(RangeError);
 
+  size_t getNonStaticParameterSize() const;
+  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
+
   void randParameters();
 
-  Matrix* getLayerOutput(const std::string& layerName) const
+  Arguments* getLayerOutput(const std::string& layerName) const
       throw(UnsupportError);
 
   /**
@@ -804,7 +820,8 @@ private:
 public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
   static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
-                                               int passCount);
+                                               int passCount,
+                                               bool useSparseUpdater);
   ~ParameterUpdater();
 
   /**
@@ -842,6 +859,13 @@ public:
    */
   void update(Parameter* param);
 
+  /**
+   * @brief only get required sparse rows by default.
+   * @param fullSize: get full matrix parameter if *fullSize* set
+   * @param apply: get PARAMETER_APPLY on pserver if *apply* set
+   */
+  void getParametersRemote(bool fullSize = false, bool apply = false);
+
   /**
    * @brief restore the average parameter.
    * @note It is only used in AverageOptimizer. Restore will get the current
@@ -894,6 +918,10 @@ public:
    */
   std::string toString();
 
+  std::vector<std::string> getNames() const;
+
+  double getValue(const std::string name) const;
+
 private:
   EvaluatorPrivate* m;
 
@@ -946,7 +974,7 @@ public:
 
   Arguments* getForwardOutput();
 
-  Matrix* getLayerOutput(const std::string& layerName);
+  Arguments* getLayerOutput(const std::string& layerName) const;
 };
 
 /// the N-Best results generated from one input sequence.
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index ddc00d8d1af4c58d7e2233423bea916408bee92b..19f7a898d6b8d3d02c5654559dcb86728266731e 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -57,4 +57,12 @@ size_t Parameter::getID() const { return m->getPtr()->getID(); }
 
 void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
 
+bool Parameter::save(const std::string& filename) const {
+  return m->getPtr()->save(filename);
+}
+
+bool Parameter::load(const std::string& filename) const {
+  return m->getPtr()->load(filename);
+}
+
 size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 75b0ae7cb6cc8c9ad0f8fe69963b7439a44bf55e..79921ea6e787f3c0ebecaad6a9a54bac92211320 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -29,10 +29,22 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 }
 
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
-    OptimizationConfig *config, int passCount) {
+    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
   auto updater = new ParameterUpdater();
-  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
-      config->m->getConfig(), passCount, nullptr));
+  auto remoteUpdater = new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr);
+  if (useSparseUpdater) {
+    std::unique_ptr<paddle::ParameterUpdater> remoteUpdaterPtr(remoteUpdater);
+    auto sparseRemoteUpdater =
+        new paddle::SparseRemoteParameterUpdaterComposite(
+            config->m->getConfig(),
+            passCount,
+            false,
+            std::move(remoteUpdaterPtr));
+    updater->m->updater.reset(sparseRemoteUpdater);
+  } else {
+    updater->m->updater.reset(remoteUpdater);
+  }
   return updater;
 }
 
@@ -59,6 +71,10 @@ void ParameterUpdater::update(Parameter *param) {
   m->updater->update(paddleParam);
 }
 
+void ParameterUpdater::getParametersRemote(bool fullSize, bool apply) {
+  m->updater->getParametersRemote(fullSize, apply);
+}
+
 void ParameterUpdater::restore() { m->updater->restore(); }
 
 void ParameterUpdater::apply() { m->updater->apply(); }
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index d83dc380beeec3747451a483f4811eb833e8c226..84e4ca054abb0100a02c8a40e31c49c17684ef40 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
 void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
 void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
 
-Matrix* Trainer::getLayerOutput(const std::string& layerName) {
-  auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
-      this->m->getGradientMachine());
+Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
+  auto nn = this->m->getGradientMachine();
   CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
-  auto m = nn->getLayerOutput(layerName);
-  return Matrix::createByPaddleMatrixPtr(&m);
+  auto arg = nn->getLayerOutput(layerName);
+  return Arguments::createByPaddleArgument(&arg);
 }
 
 void Trainer::forwardOneBatch(size_t batchSize) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 54d67aa62f4d87ad03282962c722019698dc621a..d369df5d4e04b4a8d822db0e72a8051150868ce6 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -54,5 +54,7 @@ bool isGpuVersion() {
 #endif
 }
 
+int getTrainerCount() { return FLAGS_trainer_count; }
+
 static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
               "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
deleted file mode 100644
index e11ee920362aed3ec79a2e62d447d7dde4a99248..0000000000000000000000000000000000000000
--- a/paddle/api/paddle_api_config.py.in
+++ /dev/null
@@ -1,17 +0,0 @@
-PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
-WITH_GPU="@WITH_GPU@"
-PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
-ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
-CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
-CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
-
-
-WITH_PYTHON="@WITH_PYTHON@"
-PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-GLOG_LIBRARIES="@GLOG_LIBRARIES@"
-GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
-GFLAGS_LOCATION="@GFLAGS_LOCATION@"
-CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
-
-CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@"
-WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
deleted file mode 100644
index ad5dce209bf8e14120320a58c3cd85d6f6a97688..0000000000000000000000000000000000000000
--- a/paddle/api/paddle_ld_flags.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-try:
-    from paddle_api_config import *
-    import os.path
-    import platform
-
-    system = platform.system().lower()
-    is_osx = (system == 'darwin')
-    is_win = (system == 'windows')
-    is_lin = (system == 'linux')
-
-    if is_lin:
-        whole_start = "-Wl,--whole-archive"
-        whole_end = "-Wl,--no-whole-archive"
-    elif is_osx:
-        whole_start = ""
-        whole_end = ""
-
-    LIB_DIRS = [
-        "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda",
-        "pserver", "trainer"
-    ]
-    PARENT_LIB_DIRS = ['proto']
-
-    class PaddleLDFlag(object):
-        def __init__(self):
-            self.paddle_build_dir = PADDLE_BUILD_DIR
-            self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
-            self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
-            self.protolib = PROTOBUF_LIBRARY
-            self.zlib = ZLIB_LIBRARIES
-            self.thread = CMAKE_THREAD_LIB
-            self.dl_libs = CMAKE_DL_LIBS
-            self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
-            self.python_libs = PYTHON_LIBRARIES
-
-            self.glog_libs = GLOG_LIBRARIES
-
-            self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
-            self.gflags_libs = GFLAGS_LIBRARIES
-            self.gflags_location = GFLAGS_LOCATION
-            self.cblas_libs = CBLAS_LIBRARIES
-            self.curt = CUDA_LIBRARIES
-
-        def ldflag_str(self):
-            return " ".join(
-                [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()])
-
-        def libs_dir_str(self):
-            libdirs = LIB_DIRS
-            return " ".join(
-                map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
-                    libdirs))
-
-        def parent_dir_str(self):
-            libdirs = PARENT_LIB_DIRS
-            return " ".join(
-                map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
-                    libdirs))
-
-        def libs_str(self):
-            libs = [
-                whole_start,
-                "-lpaddle_gserver",
-                "-lpaddle_function",
-                whole_end,
-                "-lpaddle_pserver",
-                "-lpaddle_trainer_lib",
-                "-lpaddle_network",
-                '-lpaddle_parameter',
-                "-lpaddle_math",
-                '-lpaddle_utils',
-                "-lpaddle_proto",
-                "-lpaddle_cuda",
-                "-lpaddle_api",
-                self.normalize_flag(self.protolib),
-                self.normalize_flag(self.glog_libs),
-                self.normalize_flag(self.gflags_libs),
-                self.normalize_flag(self.zlib),
-                self.normalize_flag(self.thread),
-                self.normalize_flag(self.dl_libs),
-                self.normalize_flag(self.cblas_libs),
-            ]
-
-            if self.with_python:
-                libs.append(self.normalize_flag(self.python_libs))
-            if self.with_gpu:
-                libs.append(self.normalize_flag(self.curt))
-            if self.with_coverage:
-                libs.append("-fprofile-arcs")
-            return " ".join(filter(lambda l: len(l) != 0, libs))
-
-        def normalize_flag(self, cmake_flag):
-            """
-            CMake flag string to ld flag
-            :type cmake_flag: str
-            """
-            if ";" in cmake_flag:
-                return " ".join(map(self.normalize_flag, cmake_flag.split(";")))
-            if cmake_flag.startswith("/"):  # is a path
-                return cmake_flag
-            elif cmake_flag.startswith("-l"):  # normal link command
-                return cmake_flag
-            elif cmake_flag in [
-                    "gflags-shared", "gflags-static", "gflags_nothreads-shared",
-                    "gflags_nothreads-static"
-            ]:  # special for gflags
-                assert PaddleLDFlag.cmake_bool(self.gflags_location)
-                return self.gflags_location
-            elif len(cmake_flag) != 0:
-                return "".join(["-l", cmake_flag])
-            else:
-                return ""
-
-        @staticmethod
-        def cmake_bool(cmake_str):
-            """
-            CMake bool string to bool
-            :param cmake_str: cmake boolean string
-            :type cmake_str: str
-            :rtype: bool
-            """
-            if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith(
-                    "-NOTFOUND"):
-                return False
-            else:
-                return True
-
-        def c_flag(self):
-            if self.with_coverage:
-                return [
-                    "-fprofile-arcs", "-ftest-coverage", "-O0", "-g",
-                    "-std=c++11"
-                ]
-            else:
-                return ["-std=c++11"]
-except ImportError:
-
-    class PaddleLDFlag(object):
-        def ldflag_str(self):
-            pass
-
-        def c_flag(self):
-            pass
diff --git a/paddle/api/test/.gitignore b/paddle/api/test/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b7948824a1eab119140dd9bea20276c303fe4af1
--- /dev/null
+++ b/paddle/api/test/.gitignore
@@ -0,0 +1,2 @@
+*.w0
+*.wbias
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index a2fa623c80087d42e6a2a5c05f62eba4997f8ec4..f3b1c2c4d438b5d3e776ef27ce8f8b78f710f2ab 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,2 +1,2 @@
-add_test(NAME test_swig_api
-    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh ${PYTHON_EXECUTABLE})
+add_python_test(test_swig_api
+    testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py)
diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py
index 8cabecd242fb4eb98c0fe468687ef179245e4535..9fe44de94ea6ddb71d2dfbb2243fc86ede0d0531 100644
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -22,6 +22,8 @@ class TestArguments(unittest.TestCase):
         args = swig_paddle.Arguments.createArguments(1)
         args.setSlotValue(0, m)
 
+        self.assertAlmostEqual(27.0, args.sum())
+
         mat = args.getSlotValue(0)
         assert isinstance(mat, swig_paddle.Matrix)
         np_mat = mat.toNumpyMatInplace()
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py
index b81eafa9673ca34f1b7e06401098d55bdb1b35a5..4b705f66eccd267f326fe0662a17b33a09fda982 100644
--- a/paddle/api/test/testGradientMachine.py
+++ b/paddle/api/test/testGradientMachine.py
@@ -45,6 +45,7 @@ class TestGradientMachine(unittest.TestCase):
             assert isinstance(val, swig_paddle.Vector)
             arr = numpy.full((len(val), ), 0.1, dtype="float32")
             val.copyFromNumpyArray(arr)
+            self.assertTrue(param.save(param.getName()))
             param_config = param.getConfig().toProto()
             assert isinstance(param_config,
                               paddle.proto.ParameterConfig_pb2.ParameterConfig)
@@ -92,6 +93,9 @@ class TestGradientMachine(unittest.TestCase):
 
         self.assertTrue(self.isCalled)
 
+        for param in machine.getParameters():
+            self.assertTrue(param.load(param.getName()))
+
     def test_train_one_pass(self):
         conf_file_path = './testTrainConfig.py'
         trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py
index 37666bdccc9aedfe8f8079124129aad2ade53a43..f08fbf3ccdf5d7c0a5c739868b1bcb516146c23d 100644
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase):
 
     def test_numpyCpu(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False)
+        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
         self.assertEqual((int(m.getHeight()), int(m.getWidth())),
                          numpy_mat.shape)
 
diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py
index a90d15c272a3a2b56e35c979e053deb2b54eebc1..7061a4c43bf01158b5f084d0c310dedd81773a04 100644
--- a/paddle/api/test/testTrain.py
+++ b/paddle/api/test/testTrain.py
@@ -89,9 +89,14 @@ def main():
             except Exception as e:
                 print e
 
+        ev = m.makeEvaluator()
+        ev.start()
         m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
                           update_callback)
-
+        m.eval(ev)
+        ev.finish()
+        for name in ev.getNames():
+            print name, ev.getValue(name)
         for optimizer in optimizers:
             optimizer.finishBatch()
 
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index 1ab095c1d3d0d2c84d2d2f95a03f172b901de209..6339cf8542607bdda99eb9ccaa8b06480f144b78 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase):
 
     def test_cpu_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False)
+        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
         self.assertEqual(vec.shape[0], int(iv.__len__()))
         vec[4] = 832
         for i in xrange(len(iv)):
@@ -106,7 +106,7 @@ class TestVector(unittest.TestCase):
 
     def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
+        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
         assert isinstance(vec, swig_paddle.Vector)
         numpy_arr[0] = 0.1
         for n, v in zip(numpy_arr, vec):
diff --git a/paddle/capi/Arguments.cpp b/paddle/capi/Arguments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8b81ec69e60399af86f055d2258276ac06e0b13a
--- /dev/null
+++ b/paddle/capi/Arguments.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "arguments.h"
+#include "capi_private.h"
+
+using paddle::capi::cast;
+
+#define castArg(v) cast<paddle::capi::CArguments>(v)
+#define castIVec(v) cast<paddle::capi::CIVector>(v)
+
+extern "C" {
+paddle_arguments paddle_arguments_create_none() {
+  return new paddle::capi::CArguments();  // heap-allocated, zero slots
+}
+
+paddle_error paddle_arguments_destroy(paddle_arguments args) {
+  if (args == nullptr) return kPD_NULLPTR;
+  delete castArg(args);  // handle is reinterpreted as CArguments
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_size(paddle_arguments args, uint64_t* size) {
+  if (args == nullptr || size == nullptr) return kPD_NULLPTR;
+  *size = castArg(args)->args.size();  // number of Argument slots
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_resize(paddle_arguments args, uint64_t size) {
+  if (args == nullptr) return kPD_NULLPTR;
+  castArg(args)->args.resize(size);  // std::vector resize of the slot array
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_value(paddle_arguments args,
+                                        uint64_t ID,
+                                        paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  if (m->mat == nullptr) return kPD_NULLPTR;  // handle wraps no matrix
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;  // slot must exist
+  a->args[ID].value = m->mat;  // assigns the MatrixPtr held by the handle
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_value(paddle_arguments args,
+                                        uint64_t ID,
+                                        paddle_matrix mat) {
+  if (args == nullptr || mat == nullptr) return kPD_NULLPTR;
+  auto m = paddle::capi::cast<paddle::capi::CMatrix>(mat);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;  // slot must exist
+  m->mat = a->args[ID].value;  // overwrites whatever the handle held before
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_get_ids(paddle_arguments args,
+                                      uint64_t ID,
+                                      paddle_ivector ids) {
+  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
+  auto iv = castIVec(ids);
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;  // slot must exist
+  iv->vec = a->args[ID].ids;  // overwrites whatever the handle held before
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_ids(paddle_arguments args,
+                                      uint64_t ID,
+                                      paddle_ivector ids) {
+  //! TODO(lizhao): Complete this method.
+  if (args == nullptr || ids == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(ids);
+  if (iv->vec == nullptr) return kPD_NULLPTR;  // handle wraps no vector
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;  // slot must exist
+  a->args[ID].ids = iv->vec;  // assigns the IVectorPtr held by the handle
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint32_t nestedLevel,
+                                                     paddle_ivector seqPos) {
+  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
+  if (iv->vec == nullptr) return kPD_NULLPTR;  // handle wraps no vector
+  auto a = castArg(args);
+  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
+    ptr = std::make_shared<paddle::ICpuGpuVector>(iv->vec);  // wrap iv->vec
+  });
+}
+
+paddle_error paddle_arguments_get_sequence_start_pos(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint32_t nestedLevel,
+                                                     paddle_ivector seqPos) {
+  if (args == nullptr || seqPos == nullptr) return kPD_NULLPTR;
+  auto iv = paddle::capi::cast<paddle::capi::CIVector>(seqPos);
+  auto a = castArg(args);  // accessSeqPos range-checks ID and nestedLevel
+  return a->accessSeqPos(ID, nestedLevel, [&iv](paddle::ICpuGpuVectorPtr& ptr) {
+    iv->vec = ptr ? ptr->getMutableVector(false) : nullptr;  // null if unset
+  });
+}
+}
diff --git a/paddle/capi/CMakeLists.txt b/paddle/capi/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..1b52a79cebb1210b09fc9f30282bfd799a35dcf9
--- /dev/null
+++ b/paddle/capi/CMakeLists.txt
@@ -0,0 +1,73 @@
+if (WITH_DOUBLE)
+  set(PADDLE_FLOAT_TYPE double)
+else ()
+  set(PADDLE_FLOAT_TYPE float)
+endif()
+
+# config.h used for C-API. It will store Paddle building configuration as a
+# header. Make user just include PaddleCAPI.h then can get building
+# configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their
+# libraries.
+configure_file(config.h.in config.h @ONLY)
+
+# capi.h is the public umbrella header. file(GLOB) yields absolute paths, so
+# the private header must be absolute too for REMOVE_ITEM to match it.
+file(GLOB CAPI_HEADERS *.h)
+set(CAPI_PRIVATE_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/capi_private.h)
+list(REMOVE_ITEM CAPI_HEADERS ${CAPI_PRIVATE_HEADER})
+file(GLOB CAPI_SOURCES *.cpp)
+
+# building paddle_capi
+add_library(paddle_capi STATIC ${CAPI_HEADERS} ${CAPI_PRIVATE_HEADER}
+  ${CAPI_SOURCES})
+
+target_include_directories(paddle_capi PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+
+add_style_check_target(paddle_capi ${CAPI_SOURCES} ${CAPI_HEADER}
+  ${CAPI_PRIVATE_HEADER})
+
+add_dependencies(paddle_capi gen_proto_cpp)
+
+
+# combine all paddle static libraries together, into libpaddle_capi_whole.a
+# user should use PaddleCAPI as -lpaddle_capi_whole
+set(capi_whole_library libpaddle_capi_whole.a)
+add_custom_target(paddle_capi_whole ALL
+        COMMAND mkdir -p o_files/capi && cd o_files/capi/ && ar -x $<TARGET_FILE:paddle_capi>
+        COMMAND mkdir -p o_files/utils && cd o_files/utils/ && ar -x $<TARGET_FILE:paddle_utils>
+        COMMAND mkdir -p o_files/parameter && cd o_files/parameter/ && ar -x $<TARGET_FILE:paddle_parameter>
+        COMMAND mkdir -p o_files/math && cd o_files/math/  && ar -x $<TARGET_FILE:paddle_math>
+        COMMAND mkdir -p o_files/cuda && cd o_files/cuda/ && ar -x $<TARGET_FILE:paddle_cuda>
+        COMMAND mkdir -p o_files/function && cd o_files/function/ && ar -x $<TARGET_FILE:paddle_function>
+        COMMAND mkdir -p o_files/gserver && cd o_files/gserver/ && ar -x $<TARGET_FILE:paddle_gserver>
+        COMMAND mkdir -p o_files/proto && cd o_files/proto/ && ar -x $<TARGET_FILE:paddle_proto>
+        COMMAND mkdir -p o_files/network && cd o_files/network/ && ar -x $<TARGET_FILE:paddle_network>
+        COMMAND mkdir -p o_files/pserver && cd o_files/pserver/ && ar -x $<TARGET_FILE:paddle_pserver>
+        COMMAND ar crs ${capi_whole_library} `find ./o_files -name '*.o'`
+        COMMAND rm -rf o_files
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+        DEPENDS paddle_capi paddle_utils paddle_parameter paddle_math
+                paddle_cuda paddle_function paddle_gserver
+                paddle_proto paddle_pserver paddle_network
+        )
+set_target_properties(paddle_capi_whole
+  PROPERTIES IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library})
+
+add_library(paddle_capi_shared SHARED ${CAPI_SOURCES})
+target_include_directories(paddle_capi_shared PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
+link_paddle_exe(paddle_capi_shared)
+
+# install library & headers.
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${capi_whole_library} DESTINATION lib)
+install(FILES ${CAPI_HEADERS} DESTINATION include/paddle)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/config.h DESTINATION include/paddle)
+install(TARGETS paddle_capi_shared DESTINATION lib)
+
+# this variable used for unittest
+set(PADDLE_CAPI_INC_PATH
+  ${CMAKE_CURRENT_BINARY_DIR}
+  ${CMAKE_CURRENT_SOURCE_DIR})
+
+if (WITH_TESTING)
+  add_subdirectory(tests)
+endif()
diff --git a/paddle/capi/Main.cpp b/paddle/capi/Main.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..78c43949dfe325d0e1a6ba10ae51cb7b858f6c52
--- /dev/null
+++ b/paddle/capi/Main.cpp
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <fenv.h>
+#include <stdlib.h>
+#include <string.h>
+#include <vector>
+#include "capi_private.h"
+#include "main.h"
+#include "paddle/trainer/TrainerConfigHelper.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/PythonUtil.h"
+
+static void initPaddle(int argc, char** argv) {
+  paddle::initMain(argc, argv);    // same argc/argv goes to both initializers
+  paddle::initPython(argc, argv);  // runs after initMain
+}
+
+extern "C" {
+paddle_error paddle_init(int argc, char** argv) {
+  std::vector<char*> realArgv;
+  realArgv.reserve(argc + 1);
+  realArgv.push_back(strdup(""));  // empty stand-in occupying the argv[0] slot
+  for (int i = 0; i < argc; ++i) {
+    realArgv.push_back(argv[i]);  // caller's strings are borrowed, not copied
+  }
+  initPaddle(argc + 1, realArgv.data());
+  free(realArgv[0]);  // NOTE(review): assumes init did not reorder realArgv
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d898ebe2612d749ca261d35139d1cd45bd355eef
--- /dev/null
+++ b/paddle/capi/Matrix.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "hl_cuda.h"
+#include "matrix.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CMatrix>(v)
+extern "C" {
+paddle_matrix paddle_matrix_create(uint64_t height,
+                                   uint64_t width,
+                                   bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();  // handle freed by *_destroy
+  ptr->mat = paddle::Matrix::create(height, width, false, useGpu);
+  return ptr;  // NOTE(review): 3rd create() arg is presumably `trans`
+}
+
+paddle_matrix paddle_matrix_create_none() {
+  return new paddle::capi::CMatrix();  // handle whose `mat` is left unset
+}
+
+paddle_error paddle_matrix_destroy(paddle_matrix mat) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);  // macro: reinterprets the handle as CMatrix*
+  delete ptr;            // destroys the handle and its MatrixPtr member
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real* rowArray) {
+  if (mat == nullptr || rowArray == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;  // handle wraps no matrix
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  paddle::real* buf = ptr->mat->getRowBuf(rowID);
+  size_t width = ptr->mat->getWidth();  // rowArray must hold `width` values
+#ifndef PADDLE_ONLY_CPU
+  hl_memcpy(buf, rowArray, sizeof(paddle::real) * width);
+#else
+  std::copy(rowArray, rowArray + width, buf);
+#endif
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                   uint64_t rowID,
+                                   paddle_real** rawRowBuffer) {
+  if (mat == nullptr || rawRowBuffer == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (ptr->mat == nullptr) return kPD_NULLPTR;  // handle wraps no matrix
+  if (rowID >= ptr->mat->getHeight()) return kPD_OUT_OF_RANGE;
+  *rawRowBuffer = ptr->mat->getRowBuf(rowID);  // matrix-owned row, not a copy
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                     uint64_t* height,
+                                     uint64_t* width) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  // A handle from paddle_matrix_create_none() wraps no matrix yet.
+  if (ptr->mat == nullptr) return kPD_NULLPTR;
+  // Either output may be null when the caller does not need that dimension.
+  if (height != nullptr) *height = ptr->mat->getHeight();
+  if (width != nullptr) *width = ptr->mat->getWidth();
+  return kPD_NO_ERROR;
+}
+}
+
+paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu) {
+  auto ptr = new paddle::capi::CMatrix();  // handle freed by *_destroy
+  ptr->mat = paddle::Matrix::createSparseMatrix(
+      height,
+      width,
+      nnz,
+      isBinary ? paddle::NO_VALUE : paddle::FLOAT_VALUE,  // binary: no values
+      paddle::SPARSE_CSR,  // storage format is fixed to CSR
+      false,               // NOTE(review): presumably `trans` -- confirm
+      useGpu);
+  return ptr;
+}
+
+paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                            int* rowArray,
+                                            uint64_t rowSize,
+                                            int* colArray,
+                                            uint64_t colSize,
+                                            float* valueArray,
+                                            uint64_t valueSize) {
+  if (mat == nullptr) return kPD_NULLPTR;
+  auto ptr = cast(mat);
+  if (rowArray == nullptr || colArray == nullptr ||
+      (valueSize != 0 && valueArray == nullptr) || ptr->mat == nullptr) {
+    return kPD_NULLPTR;  // valueArray may be null only when valueSize is 0
+  }
+  if (auto sparseMat = dynamic_cast<paddle::CpuSparseMatrix*>(ptr->mat.get())) {
+    std::vector<int> row(rowSize);
+    row.assign(rowArray, rowArray + rowSize);  // stage rows in a std::vector
+    std::vector<int> col(colSize);
+    col.assign(colArray, colArray + colSize);  // stage columns likewise
+    std::vector<paddle_real> val(valueSize);
+    if (valueSize) {
+      val.assign(valueArray, valueArray + valueSize);  // float -> paddle_real
+    }
+    sparseMat->copyFrom(row, col, val);
+    return kPD_NO_ERROR;
+  } else {
+    return kPD_NOT_SUPPORTED;  // matrix is not a CpuSparseMatrix
+  }
+}
diff --git a/paddle/capi/Vector.cpp b/paddle/capi/Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..564708e963b4068da074c1fcc9aac0fade0f65b9
--- /dev/null
+++ b/paddle/capi/Vector.cpp
@@ -0,0 +1,69 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi_private.h"
+#include "vector.h"
+
+using paddle::capi::cast;
+
+extern "C" {
+
+paddle_ivector paddle_ivector_create_none() {
+  return new paddle::capi::CIVector();  // handle whose `vec` is left unset
+}
+
+paddle_ivector paddle_ivector_create(int* array,
+                                     uint64_t size,
+                                     bool copy,
+                                     bool useGPU) {
+  auto ptr = new paddle::capi::CIVector();  // handle freed by *_destroy
+  if (copy) {  // allocate a vector of `size`, then fill it from `array`
+    ptr->vec = paddle::IVector::create(size, useGPU);
+    ptr->vec->copyFrom(array, size);
+  } else {  // NOTE(review): presumably wraps `array` in place -- confirm
+    ptr->vec = paddle::IVector::create(array, size, useGPU);
+  }
+  return ptr;
+}
+
+paddle_error paddle_ivector_destroy(paddle_ivector ivec) {
+  if (ivec == nullptr) return kPD_NULLPTR;
+  delete cast<paddle::capi::CIVector>(ivec);  // destroys the handle struct
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer) {
+  if (ivec == nullptr || buffer == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;  // handle wraps no vector
+  *buffer = v->vec->getData();  // the vector's own buffer, not a copy
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size) {
+  if (ivec == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;  // handle wraps no vector
+  v->vec->resize(size);  // delegates to the wrapped IVector
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_ivector_get_size(paddle_ivector ivec, uint64_t* size) {
+  if (ivec == nullptr || size == nullptr) return kPD_NULLPTR;
+  auto v = cast<paddle::capi::CIVector>(ivec);
+  if (v->vec == nullptr) return kPD_NULLPTR;  // handle wraps no vector
+  *size = v->vec->getSize();  // element count of the wrapped IVector
+  return kPD_NO_ERROR;
+}
+}
diff --git a/paddle/capi/arguments.h b/paddle/capi/arguments.h
new file mode 100644
index 0000000000000000000000000000000000000000..d71ea26a5d1aff130d974541532fda3b09bf6fe5
--- /dev/null
+++ b/paddle/capi/arguments.h
@@ -0,0 +1,145 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_ARGUMENTS_H__
+#define __PADDLE_CAPI_ARGUMENTS_H__
+
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+#include "matrix.h"
+#include "vector.h"
+
+/**
+ * Arguments functions. Each argument is one layer's output. Arguments is an
+ * array of argument.
+ */
+typedef void* paddle_arguments;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * @brief paddle_arguments_create_none Create an array of arguments whose size
+ * is zero.
+ * @return Arguments
+ */
+PD_API paddle_arguments paddle_arguments_create_none();
+
+/**
+ * @brief paddle_arguments_destroy Destroy the arguments
+ * @param args arguments to destroy
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_destroy(paddle_arguments args);
+
+/**
+ * @brief paddle_arguments_get_size Get size of arguments array
+ * @param [in] args arguments array
+ * @param [out] size array size
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_size(paddle_arguments args,
+                                              uint64_t* size);
+
+/**
+ * @brief paddle_arguments_resize Resize an arguments array.
+ * @param args arguments array.
+ * @param size target size of array
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_resize(paddle_arguments args,
+                                            uint64_t size);
+
+/**
+ * @brief paddle_arguments_set_value Set value matrix of one argument in array,
+ *        whose index is `ID`.
+ * @param args arguments array
+ * @param ID array index
+ * @param mat matrix pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_value(paddle_arguments args,
+                                               uint64_t ID,
+                                               paddle_matrix mat);
+
+/**
+ * @brief paddle_arguments_get_value Get value matrix of one argument in array,
+ *        whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] mat matrix pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_value(paddle_arguments args,
+                                               uint64_t ID,
+                                               paddle_matrix mat);
+
+/**
+ * @brief paddle_arguments_get_ids Get the integer vector of one argument in
+ *        array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] ids integer vector pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_get_ids(paddle_arguments args,
+                                             uint64_t ID,
+                                             paddle_ivector ids);
+
+/**
+ * @brief paddle_arguments_set_ids Set the integer vector of one argument in
+ *        array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] ids integer vector pointer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
+                                             uint64_t ID,
+                                             paddle_ivector ids);
+
+/**
+ * @brief paddle_arguments_set_sequence_start_pos Set sequence start position
+ *        vector of one argument in array, whose index is `ID`.
+ * @param args arguments array
+ * @param ID array index
+ * @param seqPos sequence position array.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_arguments_set_sequence_start_pos(paddle_arguments args,
+                                        uint64_t ID,
+                                        uint32_t nestedLevel,
+                                        paddle_ivector seqPos);
+/**
+ * @brief paddle_arguments_get_sequence_start_pos Get sequence start position
+ *        vector of one argument in array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [out] seqPos sequence position array
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_arguments_get_sequence_start_pos(paddle_arguments args,
+                                        uint64_t ID,
+                                        uint32_t nestedLevel,
+                                        paddle_ivector seqPos);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/capi/capi.h b/paddle/capi/capi.h
new file mode 100644
index 0000000000000000000000000000000000000000..4097a1a35a64347f0d79b004371df26551e51bbe
--- /dev/null
+++ b/paddle/capi/capi.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_H__
+#define __PADDLE_CAPI_H__
+
+/**
+ * Paddle C API. It will replace SWIG as the multi-language API for model
+ * training & inference. Currently it is only used in model inference.
+ *
+ * NOTE: This is an experimental API, it could be changed.
+ */
+#include "arguments.h"
+#include "config.h"
+#include "error.h"
+#include "gradient_machine.h"
+#include "main.h"
+#include "matrix.h"
+#include "vector.h"
+
+#endif  // __PADDLE_CAPI_H__
diff --git a/paddle/capi/capi_private.h b/paddle/capi/capi_private.h
new file mode 100644
index 0000000000000000000000000000000000000000..c7cdbd5f6f347150c02764a86f8ffb0c068e872e
--- /dev/null
+++ b/paddle/capi/capi_private.h
@@ -0,0 +1,82 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/parameter/Argument.h"
+#pragma once
+
+namespace paddle {
+namespace capi {
+
+enum CType { kIVECTOR = 0, kMATRIX, kARGUMENTS, kGRADIENT_MACHINE };
+
+#define STRUCT_HEADER CType type;  // first member of every handle struct
+
+struct CHeader {
+  STRUCT_HEADER
+};
+
+struct CIVector {
+  STRUCT_HEADER
+  IVectorPtr vec;  // unset until a create/get call assigns it
+
+  CIVector() : type(kIVECTOR) {}
+};
+
+struct CMatrix {
+  STRUCT_HEADER
+  MatrixPtr mat;  // unset until a create/get call assigns it
+
+  CMatrix() : type(kMATRIX) {}
+};
+
+struct CArguments {
+  STRUCT_HEADER
+  std::vector<paddle::Argument> args;  // one entry per argument slot
+
+  CArguments() : type(kARGUMENTS) {}
+
+  template <typename T>
+  paddle_error accessSeqPos(uint64_t ID, uint32_t nestedLevel, T callback) {
+    if (ID >= args.size()) return kPD_OUT_OF_RANGE;
+    switch (nestedLevel) {
+      case 0:  // top-level sequence positions
+        callback(args[ID].sequenceStartPositions);
+        break;
+      case 1:  // sub-sequence positions
+        callback(args[ID].subSequenceStartPositions);
+        break;
+      default:  // only nesting levels 0 and 1 are handled
+        return kPD_OUT_OF_RANGE;
+    }
+    return kPD_NO_ERROR;
+  }
+};
+
+struct CGradientMachine {
+  STRUCT_HEADER
+  paddle::GradientMachinePtr machine;  // wrapped gradient machine
+
+  CGradientMachine() : type(kGRADIENT_MACHINE) {}
+};
+
+template <typename T>
+inline T* cast(void* ptr) {
+  return reinterpret_cast<T*>(ptr);  // unchecked: `type` tag is not verified
+}
+}  // namespace capi
+}  // namespace paddle
diff --git a/paddle/capi/config.h.in b/paddle/capi/config.h.in
new file mode 100644
index 0000000000000000000000000000000000000000..d205307588eb60b2e11accb9f825391f7c1453f2
--- /dev/null
+++ b/paddle/capi/config.h.in
@@ -0,0 +1,10 @@
+#ifndef __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
+#define __PADDLE_PADDLE_CAPI_CONFIG_H_INCLUDED__
+
+typedef @PADDLE_FLOAT_TYPE@ paddle_real;
+
+// Since we only support linux and macos in compile, always use clang or
+// gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
+#define PD_API __attribute__((visibility("default")))
+
+#endif
diff --git a/paddle/capi/error.h b/paddle/capi/error.h
new file mode 100644
index 0000000000000000000000000000000000000000..44d8c2040d1aad698398089baeee6f13c3deeb55
--- /dev/null
+++ b/paddle/capi/error.h
@@ -0,0 +1,30 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_ERROR_H__
+#define __PADDLE_CAPI_ERROR_H__
+
+/**
+ * Error Type for Paddle API.
+ */
+typedef enum {
+  kPD_NO_ERROR = 0,
+  kPD_NULLPTR = 1,
+  kPD_OUT_OF_RANGE = 2,
+  kPD_PROTOBUF_ERROR = 3,
+  kPD_NOT_SUPPORTED = 4,
+  kPD_UNDEFINED_ERROR = -1,
+} paddle_error;
+
+#endif
diff --git a/paddle/capi/examples/.gitignore b/paddle/capi/examples/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2caa0a5a298d8cec0d996c3774b6f42060a0d41a
--- /dev/null
+++ b/paddle/capi/examples/.gitignore
@@ -0,0 +1,2 @@
+*.bin
+build-*
diff --git a/paddle/capi/examples/README.md b/paddle/capi/examples/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..14013e281ff50279473dfc4da46aaef4f8b7ea9a
--- /dev/null
+++ b/paddle/capi/examples/README.md
@@ -0,0 +1,3 @@
+# C-API Example Usage
+
+* [Model Inference](./model_inference/README.md)
diff --git a/paddle/capi/examples/model_inference/README.md b/paddle/capi/examples/model_inference/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..58e6c83140b5f33ddfd1f027b6624a26f842a2f8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/README.md
@@ -0,0 +1,42 @@
+# Use C-API for Model Inference
+
+There are several examples in this directory about how to use Paddle C-API for model inference.
+
+## Convert configuration file to protobuf binary.
+
+First, convert Paddle's model configuration file into a protobuf binary file. Each example directory contains a script named `convert_protobin.sh`, which converts `trainer_config.py` into `trainer_config.bin`.
+
+The `convert_protobin.sh` script is very simple: it just invokes the `dump_config` Python module to dump the binary file. The command-line usage is:
+
+```bash
+python -m paddle.utils.dump_config YOUR_CONFIG_FILE 'CONFIG_EXTRA_ARGS' --binary > YOUR_CONFIG_FILE.bin
+```
+
+## Initialize paddle
+
+```c++
+char* argv[] = {"--use_gpu=False"};
+paddle_init(1, (char**)argv);
+```
+
+We must initialize the global context before invoking any other Paddle interface. The initialization arguments are the same as the `paddle_trainer` command-line arguments; `paddle train --help` shows the full list. The most important argument is whether to `use_gpu`.
+
+## Load network and parameters
+
+```c
+paddle_gradient_machine machine;
+paddle_gradient_machine_create_for_inference(&machine, config_file_content, content_size);
+paddle_gradient_machine_load_parameter_from_disk(machine, "./some_where_to_params");
+```
+
+The gradient machine is a Paddle concept that represents a neural network which can run forward and backward passes. We can create a gradient machine for model inference and load the parameter files from disk.
+
+Moreover, to run inference in multiple threads, we can create a thread-local gradient machine that shares the same parameters by using the `paddle_gradient_machine_create_shared_param` API. Please refer to `multi_thread` for an example.
+
+## Create input
+
+The input of a neural network is an `arguments`. The examples in this directory will show how to construct different types of inputs for prediction. Please look at `dense`, `sparse_binary`, `sequence` for details.
+
+## Get inference
+
+After invoking `paddle_gradient_machine_forward`, we can get the output of the neural network. The `value` matrix of the output arguments stores the network's output values. If the output layer uses `SoftmaxActivation`, the `value` matrix holds the probabilities for each input sample: its height is the number of samples and its width is the number of categories.
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..a78522e4a7c3cb34b341b7f4c89b53d32b72f114
--- /dev/null
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -0,0 +1,52 @@
+#ifndef __CAPI_EXAMPLE_COMMON_H__
+#define __CAPI_EXAMPLE_COMMON_H__
+#include <stdio.h>
+#include <stdlib.h>
+
+/*
+ * Evaluate `stmt`, which must yield a paddle_error. If the result is not
+ * kPD_NO_ERROR, print the error code and the failing statement to stderr and
+ * exit with that code. paddle_error is not declared here: include
+ * <paddle/capi.h> before this header.
+ */
+#define CHECK(stmt)                                                    \
+  do {                                                                 \
+    paddle_error __err__ = stmt;                                       \
+    if (__err__ != kPD_NO_ERROR) {                                     \
+      /* #stmt is an argument, not part of the format string. */       \
+      fprintf(stderr, "Invoke paddle error %d: %s\n", __err__, #stmt); \
+      exit(__err__);                                                   \
+    }                                                                  \
+  } while (0)
+
+/*
+ * Read the whole file `filename` into a malloc'ed buffer.
+ *
+ * On success, returns the buffer and stores its length in bytes in *size; the
+ * caller must free() it. If the file cannot be opened, is empty, cannot be
+ * read completely, or memory runs out, returns NULL and stores 0 in *size.
+ *
+ * Declared static so that including this header from more than one source
+ * file does not produce duplicate definitions.
+ */
+static void* read_config(const char* filename, long* size) {
+  *size = 0;
+  /* The config is a protobuf binary, so open it in binary mode. */
+  FILE* file = fopen(filename, "rb");
+  if (file == NULL) return NULL;
+
+  void* buf = NULL;
+  long len = -1;
+  if (fseek(file, 0L, SEEK_END) == 0) len = ftell(file);
+  if (len > 0 && fseek(file, 0L, SEEK_SET) == 0) {
+    buf = malloc((size_t)len);
+    if (buf != NULL && fread(buf, 1, (size_t)len, file) != (size_t)len) {
+      free(buf); /* short read: never hand back a partial config */
+      buf = NULL;
+    }
+  }
+  fclose(file);
+  if (buf != NULL) *size = len;
+  return buf;
+}
+#endif
diff --git a/paddle/capi/examples/model_inference/dense/CMakeLists.txt b/paddle/capi/examples/model_inference/dense/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..008a488fd9e6fdca2c4cb92bf1b8c41fce1835a9
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/CMakeLists.txt
@@ -0,0 +1,6 @@
+cmake_minimum_required(VERSION 2.8)  # must be called before project()
+project(dense)
+aux_source_directory(. SRC_LIST)
+add_executable(${PROJECT_NAME} ${SRC_LIST})
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)  # CMake >= 3.1
+target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/dense/convert_protobin.sh b/paddle/capi/examples/model_inference/dense/convert_protobin.sh
new file mode 100755
index 0000000000000000000000000000000000000000..30ffc316ecb76cd9c8e2b628f85484a990ac6da8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/convert_protobin.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+python -m paddle.utils.dump_config trainer_config.py '' --binary > trainer_config.bin
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e6bd5285058a297c4574631e2a5c033b83936e8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -0,0 +1,69 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle on the CPU.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size = 0;  // stays 0 if the file cannot be opened
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create the 1 x 784 input matrix on the CPU.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ false);
+  srand(time(0));
+  paddle_real* array;
+
+  // Get a writable pointer to the first (and only) row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
+  }
+
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 10; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+  free(buf);  // allocated by read_config()
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/dense/trainer_config.py b/paddle/capi/examples/model_inference/dense/trainer_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..873ec119e7a3d4debe50af2ba259ace50b0cbf7c
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/trainer_config.py
@@ -0,0 +1,18 @@
+from paddle.trainer_config_helpers import *
+
+img = data_layer(name='pixel', size=784)  # input: 784 values per sample
+
+hidden = fc_layer(
+    input=img,
+    size=200,  # hidden layer width
+    param_attr=ParamAttr(name='hidden.w'),
+    bias_attr=ParamAttr(name='hidden.b'))
+
+prob = fc_layer(
+    input=hidden,
+    size=10,  # one softmax output per category
+    act=SoftmaxActivation(),
+    param_attr=ParamAttr(name='prob.w'),
+    bias_attr=ParamAttr(name='prob.b'))
+
+outputs(prob)  # declare `prob` as the network output
diff --git a/paddle/capi/examples/model_inference/multi_thread/.gitignore b/paddle/capi/examples/model_inference/multi_thread/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fab7372d796ea95c80d02df6caa7eb2b411a7ac1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..98e411ddc02a46034e8f6ceb00657622d998c9f3
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 2.8)  # must be called before project()
+project(multi_thread)
+aux_source_directory(. SRC_LIST)
+add_executable(${PROJECT_NAME} ${SRC_LIST})
+find_package (Threads)
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)  # CMake >= 3.1
+target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared
+  ${CMAKE_THREAD_LIBS_INIT})
diff --git a/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh b/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
new file mode 120000
index 0000000000000000000000000000000000000000..3c1b3533523cf1709720d11df7b8e311e0577fe7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/multi_thread/main.c b/paddle/capi/examples/model_inference/multi_thread/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..d7675cd80a52f752b1a8567dae34123978113831
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/main.c
@@ -0,0 +1,98 @@
+#include <paddle/capi.h>
+#include <pthread.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+#define NUM_THREAD 4
+#define NUM_ITER 1000
+
+pthread_mutex_t mutex;
+
+void* thread_main(void* gm_ptr) {
+  paddle_gradient_machine machine = (paddle_gradient_machine)(gm_ptr);
+  paddle_arguments in_args = paddle_arguments_create_none();
+  // Create the 1 x 784 input matrix once; it is refilled on every iteration.
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
+                                           /* size */ 784,
+                                           /* useGPU */ false);
+  paddle_arguments out_args = paddle_arguments_create_none();
+  paddle_matrix prob = paddle_matrix_create_none();
+  for (int iter = 0; iter < NUM_ITER; ++iter) {
+    // There is only one input of this network.
+    CHECK(paddle_arguments_resize(in_args, 1));
+
+    paddle_real* array;
+
+    // Get a writable pointer to the first row.
+    CHECK(paddle_matrix_get_row(mat, 0, &array));
+    // NOTE(review): POSIX does not require rand() to be thread-safe.
+    for (int i = 0; i < 784; ++i) {
+      array[i] = rand() / ((float)RAND_MAX);
+    }
+
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+    CHECK(paddle_gradient_machine_forward(machine,
+                                          in_args,
+                                          out_args,
+                                          /* isTrain */ false));
+
+    CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+    CHECK(paddle_matrix_get_row(prob, 0, &array));
+    // Hold the mutex while printing so threads do not interleave their lines.
+    pthread_mutex_lock(&mutex);
+    printf("Prob: ");
+    for (int i = 0; i < 10; ++i) {
+      printf("%.2f ", array[i]);
+    }
+    printf("\n");
+    pthread_mutex_unlock(&mutex);
+  }
+  // The machine was created for this thread, so it is destroyed here as well.
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+  return NULL;
+}
+
+int main() {
+  // Initialize Paddle on the CPU.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size = 0;  // stays 0 if the file cannot be opened
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  srand(time(0));
+  pthread_mutex_init(&mutex, NULL);
+
+  pthread_t threads[NUM_THREAD];
+  // Each worker gets its own machine, which thread_main destroys when done.
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    paddle_gradient_machine thread_local_machine;
+    CHECK(paddle_gradient_machine_create_shared_param(
+        machine, buf, (int)size, &thread_local_machine));
+    pthread_create(&threads[i], NULL, thread_main, thread_local_machine);
+  }
+
+  for (int i = 0; i < NUM_THREAD; ++i) {
+    pthread_join(threads[i], NULL);
+  }
+  CHECK(paddle_gradient_machine_destroy(machine));  // workers are done
+  pthread_mutex_destroy(&mutex);
+  free(buf);  // allocated by read_config()
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/multi_thread/trainer_config.py b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
new file mode 120000
index 0000000000000000000000000000000000000000..70cfb1f7f4cfe9afa6ccbd6f2f419aa286970bbe
--- /dev/null
+++ b/paddle/capi/examples/model_inference/multi_thread/trainer_config.py
@@ -0,0 +1 @@
+../dense/trainer_config.py
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sequence/.gitignore b/paddle/capi/examples/model_inference/sequence/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fab7372d796ea95c80d02df6caa7eb2b411a7ac1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/sequence/CMakeLists.txt b/paddle/capi/examples/model_inference/sequence/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..71b73acba7cdea1c869ec6061df379c3f7cb45db
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/CMakeLists.txt
@@ -0,0 +1,6 @@
+cmake_minimum_required(VERSION 2.8)  # must be called before project()
+project(sequence)
+aux_source_directory(. SRC_LIST)
+add_executable(${PROJECT_NAME} ${SRC_LIST})
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)  # CMake >= 3.1
+target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/sequence/convert_protobin.sh b/paddle/capi/examples/model_inference/sequence/convert_protobin.sh
new file mode 120000
index 0000000000000000000000000000000000000000..3c1b3533523cf1709720d11df7b8e311e0577fe7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sequence/main.c b/paddle/capi/examples/model_inference/sequence/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..50bc0c9201f207eff7389bfbee3bc2e43261b19a
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/main.c
@@ -0,0 +1,70 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle on the CPU.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size = 0;  // stays 0 if the file cannot be opened
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input ids.
+  int sentence_ids[] = {83, 48, 20, 84, 394, 853, 64, 53, 64};
+
+  paddle_ivector sentence = paddle_ivector_create(
+      sentence_ids, sizeof(sentence_ids) / sizeof(int), false, false);
+  CHECK(paddle_arguments_set_ids(in_args, 0, sentence));
+  // One sequence covering every id; presumably {begin, end} offsets (confirm).
+  int seq_pos_array[] = {0, sizeof(sentence_ids) / sizeof(int)};
+
+  paddle_ivector seq_pos = paddle_ivector_create(
+      seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+
+  CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  paddle_real* array;
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 2; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_ivector_destroy(seq_pos));
+  CHECK(paddle_ivector_destroy(sentence));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+  free(buf);  // allocated by read_config()
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sequence/trainer_config.py b/paddle/capi/examples/model_inference/sequence/trainer_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bbc7a909aa03950ce621efa43fa47d9cdd016f8
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sequence/trainer_config.py
@@ -0,0 +1,13 @@
+from paddle.trainer_config_helpers import *
+
+WORD_DIM = 3000  # presumably the vocabulary size (ids < WORD_DIM); confirm
+
+sentence = data_layer(name='sentence', size=WORD_DIM)
+sentence_embedding = embedding_layer(
+    input=sentence,
+    size=64,  # embedding width
+    param_attr=ParameterAttribute(
+        initial_max=1.0, initial_min=0.5))
+lstm = simple_lstm(input=sentence_embedding, size=64)
+lstm_last = last_seq(input=lstm)
+outputs(fc_layer(input=lstm_last, size=2, act=SoftmaxActivation()))
diff --git a/paddle/capi/examples/model_inference/sparse_binary/.gitignore b/paddle/capi/examples/model_inference/sparse_binary/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..fab7372d796ea95c80d02df6caa7eb2b411a7ac1
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/.gitignore
@@ -0,0 +1,73 @@
+# This file is used to ignore files which are generated
+# ----------------------------------------------------------------------------
+
+*~
+*.autosave
+*.a
+*.core
+*.moc
+*.o
+*.obj
+*.orig
+*.rej
+*.so
+*.so.*
+*_pch.h.cpp
+*_resource.rc
+*.qm
+.#*
+*.*#
+core
+!core/
+tags
+.DS_Store
+.directory
+*.debug
+Makefile*
+*.prl
+*.app
+moc_*.cpp
+ui_*.h
+qrc_*.cpp
+Thumbs.db
+*.res
+*.rc
+/.qmake.cache
+/.qmake.stash
+
+# qtcreator generated files
+*.pro.user*
+
+# xemacs temporary files
+*.flc
+
+# Vim temporary files
+.*.swp
+
+# Visual Studio generated files
+*.ib_pdb_index
+*.idb
+*.ilk
+*.pdb
+*.sln
+*.suo
+*.vcproj
+*vcproj.*.*.user
+*.ncb
+*.sdf
+*.opensdf
+*.vcxproj
+*vcxproj.*
+
+# MinGW generated files
+*.Debug
+*.Release
+
+# Python byte code
+*.pyc
+
+# Binaries
+# --------
+*.dll
+*.exe
+
diff --git a/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt b/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c82195688902ac70346fd5204fb14e28886fb51f
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/CMakeLists.txt
@@ -0,0 +1,7 @@
+cmake_minimum_required(VERSION 2.8)  # must be called before project()
+project(sparse_binary)
+aux_source_directory(. SRC_LIST)
+add_executable(${PROJECT_NAME} ${SRC_LIST})
+find_package (Threads)  # NOTE(review): result is not used by this target
+set_property(TARGET ${PROJECT_NAME} PROPERTY C_STANDARD 99)  # CMake >= 3.1
+target_link_libraries(${PROJECT_NAME} -lpaddle_capi_shared)
diff --git a/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh b/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
new file mode 120000
index 0000000000000000000000000000000000000000..3c1b3533523cf1709720d11df7b8e311e0577fe7
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/convert_protobin.sh
@@ -0,0 +1 @@
+../dense/convert_protobin.sh
\ No newline at end of file
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..8ba67aee560239d3050c7f40198d20df99ec370e
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
@@ -0,0 +1,70 @@
+#include <paddle/capi.h>
+#include <time.h>
+#include "../common/common.h"
+
+#define CONFIG_BIN "./trainer_config.bin"
+
+int main() {
+  // Initialize Paddle on the CPU.
+  char* argv[] = {"--use_gpu=False"};
+  CHECK(paddle_init(1, (char**)argv));
+
+  // Read the binary config generated by `convert_protobin.sh`.
+  long size = 0;  // stays 0 if the file cannot be opened
+  void* buf = read_config(CONFIG_BIN, &size);
+
+  // Create a gradient machine for inference.
+  paddle_gradient_machine machine;
+  CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
+  CHECK(paddle_gradient_machine_randomize_param(machine));
+
+  // Loading parameter. Uncomment the following line and change the directory.
+  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
+  //                                                "./some_where_to_params"));
+  paddle_arguments in_args = paddle_arguments_create_none();
+
+  // There is only one input of this network.
+  CHECK(paddle_arguments_resize(in_args, 1));
+
+  // Create input matrix.
+  paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
+  // Presumably CSR-style: rowBuf holds row offsets into colBuf (TODO confirm).
+  paddle_real* array;
+  int colBuf[] = {9, 93, 109};
+  int rowBuf[] = {0, sizeof(colBuf) / sizeof(int)};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                       rowBuf,
+                                       sizeof(rowBuf) / sizeof(int),
+                                       colBuf,
+                                       sizeof(colBuf) / sizeof(int),
+                                       NULL,
+                                       0));
+
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+
+  paddle_arguments out_args = paddle_arguments_create_none();
+  CHECK(paddle_gradient_machine_forward(machine,
+                                        in_args,
+                                        out_args,
+                                        /* isTrain */ false));
+  paddle_matrix prob = paddle_matrix_create_none();
+
+  CHECK(paddle_arguments_get_value(out_args, 0, prob));
+
+  CHECK(paddle_matrix_get_row(prob, 0, &array));
+
+  printf("Prob: ");
+  for (int i = 0; i < 10; ++i) {
+    printf("%.2f ", array[i]);
+  }
+  printf("\n");
+
+  CHECK(paddle_matrix_destroy(prob));
+  CHECK(paddle_arguments_destroy(out_args));
+  CHECK(paddle_matrix_destroy(mat));
+  CHECK(paddle_arguments_destroy(in_args));
+  CHECK(paddle_gradient_machine_destroy(machine));
+  free(buf);  // allocated by read_config()
+  return 0;
+}
diff --git a/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
new file mode 120000
index 0000000000000000000000000000000000000000..70cfb1f7f4cfe9afa6ccbd6f2f419aa286970bbe
--- /dev/null
+++ b/paddle/capi/examples/model_inference/sparse_binary/trainer_config.py
@@ -0,0 +1 @@
+../dense/trainer_config.py
\ No newline at end of file
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..00f76e0152366834eafc22df710cf3d6c7b8471f
--- /dev/null
+++ b/paddle/capi/gradient_machine.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gradient_machine.h"
+#include "capi_private.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+
+#define cast(v) paddle::capi::cast<paddle::capi::CGradientMachine>(v)
+
+enum GradientMatchineCreateMode {  // modes for paddle::GradientMachine::create
+  CREATE_MODE_NORMAL = 0,          // not used in this file
+  CREATE_MODE_TESTING = 4          // used when building an inference machine
+};
+
+namespace paddle {
+// NeuralNetwork subclass that only forwards (name, network) to the base ctor.
+class MyNeuralNetwork : public NeuralNetwork {
+public:
+  MyNeuralNetwork(const std::string& name, NeuralNetwork* network)
+      : NeuralNetwork(name, network) {}
+};
+// NOTE(review): presumably a hook found by this exact name; confirm before
+NeuralNetwork* newCustomNerualNetwork(const std::string& name,  // renaming.
+                                      NeuralNetwork* network) {
+  return new MyNeuralNetwork(name, network);
+}
+}  // namespace paddle
+
+extern "C" {
+paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size) {
+  if (machine == nullptr || modelConfigProtobuf == nullptr) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());  // freed if create() throws
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config, CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  *machine = ptr.release();  // ownership passes to the caller's handle
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
+  delete cast(machine);  // delete on a null pointer is a no-op
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path) {
+  auto gm = cast(machine);
+  // Refuse to proceed without a handle, a wrapped network, and a path.
+  if (!gm || !path || !gm->machine) return kPD_NULLPTR;
+  gm->machine->loadParameters(path);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                             paddle_arguments inArgs,
+                                             paddle_arguments outArgs,
+                                             bool isTrain) {
+  auto gm = cast(machine);
+  auto input = paddle::capi::cast<paddle::capi::CArguments>(inArgs);
+  auto output = paddle::capi::cast<paddle::capi::CArguments>(outArgs);
+  if (!gm || !input || !output || !gm->machine) return kPD_NULLPTR;
+  // Translate the C boolean into the pass type handed to forward().
+  const auto pass = isTrain ? paddle::PASS_TRAIN : paddle::PASS_TEST;
+  gm->machine->forward(input->args, &output->args, pass);
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_create_shared_param(
+    paddle_gradient_machine origin,
+    void* modelConfigProtobuf,
+    int size,
+    paddle_gradient_machine* slave) {
+  auto o = cast(origin);
+  // Reject NULL handles and a NULL config buffer before touching any of them.
+  if (!o || !slave || !modelConfigProtobuf || !o->machine) return kPD_NULLPTR;
+  paddle::ModelConfig config;
+  if (!config.ParseFromArray(modelConfigProtobuf, size) ||
+      !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+
+  std::unique_ptr<paddle::capi::CGradientMachine> ptr(
+      new paddle::capi::CGradientMachine());
+  auto nn = paddle::NeuralNetwork::create(config);
+  // Hand each new parameter the origin's PARAMETER_VALUE buffer to share.
+  nn->init(config,
+           [&o](int paramId, paddle::Parameter* param) {
+             auto p = o->machine->getParameters()[paramId];
+             param->enableSharedType(paddle::PARAMETER_VALUE,
+                                     p->getBuf(paddle::PARAMETER_VALUE));
+           },
+           {paddle::PARAMETER_VALUE},
+           false);
+  ptr->machine.reset(nn);
+  *slave = ptr.release();
+  return kPD_NO_ERROR;
+}
+
+paddle_error paddle_gradient_machine_randomize_param(
+    paddle_gradient_machine machine) {
+  auto m = cast(machine);
+  if (m == nullptr || m->machine == nullptr) return kPD_NULLPTR;
+  m->machine->randParameters();
+  return kPD_NO_ERROR;
+}
+}  // extern "C" (closed after the last C-API entry point)
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
new file mode 100644
index 0000000000000000000000000000000000000000..d7e2dd9bf8037ed474971624d4518160604abe4d
--- /dev/null
+++ b/paddle/capi/gradient_machine.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_GRADIENT_MACHINE_H__
+#define __PADDLE_CAPI_GRADIENT_MACHINE_H__
+#include "arguments.h"
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief GradientMachine means a neural network.
+ */
+typedef void* paddle_gradient_machine;
+
+/**
+ * @brief Create a gradient machine used for model inference.
+ * @param [out] machine the created gradient machine, used for inference.
+ * @param [in] modelConfigProtobuf serialized model config protobuf buffer.
+ * @param [in] size size of the model config buffer.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_gradient_machine_create_for_inference(
+    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
+
+/**
+ * @brief Load parameter from disk.
+ * @param machine Gradient Machine.
+ * @param path local directory path.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_gradient_machine_load_parameter_from_disk(
+    paddle_gradient_machine machine, const char* path);
+
+/**
+ * @brief Forward a gradient machine
+ * @param machine Gradient machine
+ * @param inArgs input arguments
+ * @param outArgs output arguments
+ * @param isTrain is train or not
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_forward(paddle_gradient_machine machine,
+                                paddle_arguments inArgs,
+                                paddle_arguments outArgs,
+                                bool isTrain);
+
+/**
+ * @brief Create a gradient machine whose parameters are shared with another
+ *        gradient machine.
+ * @param [in] origin gradient machine
+ * @param [in] modelConfigProtobuf model config protobuf
+ * @param [in] size of model config buffer.
+ * @param [out] slave gradient machine, the output value.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_create_shared_param(paddle_gradient_machine origin,
+                                            void* modelConfigProtobuf,
+                                            int size,
+                                            paddle_gradient_machine* slave);
+
+PD_API paddle_error
+paddle_gradient_machine_randomize_param(paddle_gradient_machine machine);
+
+/**
+ * @brief Destroy a gradient machine
+ * @param machine the gradient machine to destroy
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_destroy(paddle_gradient_machine machine);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/paddle/capi/main.h b/paddle/capi/main.h
new file mode 100644
index 0000000000000000000000000000000000000000..893ebcbd58dd24cf835fb2005865c94c9ba2a810
--- /dev/null
+++ b/paddle/capi/main.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_MAIN_H__
+#define __PADDLE_CAPI_MAIN_H__
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Initialize Paddle.
+ */
+PD_API paddle_error paddle_init(int argc, char** argv);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..f15f7f3bbbd1457617111f827d2182ae6b7d9fdb
--- /dev/null
+++ b/paddle/capi/matrix.h
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_MATRIX_H__
+#define __PADDLE_CAPI_MATRIX_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Matrix functions. Return will be a paddle_error type.
+ */
+typedef void* paddle_matrix;
+
+/**
+ * @brief paddle_matrix_create Create a dense matrix
+ * @param height matrix height.
+ * @param width matrix width
+ * @param useGpu use GPU or not
+ * @return Matrix handler
+ */
+PD_API paddle_matrix paddle_matrix_create(uint64_t height,
+                                          uint64_t width,
+                                          bool useGpu);
+
+/**
+ * @brief paddle_matrix_create_sparse Create a sparse matrix.
+ * @param height the matrix height.
+ * @param width the matrix width.
+ * @param nnz the number of non-zero elements.
+ * @param isBinary is binary (either 1 or 0 in matrix) or not.
+ * @param useGpu is using GPU or not.
+ * @return paddle_matrix.
+ */
+PD_API paddle_matrix paddle_matrix_create_sparse(
+    uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+
+/**
+ * @brief paddle_matrix_destroy Destroy a matrix.
+ * @param mat
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_destroy(paddle_matrix mat);
+
+/**
+ * @brief paddle_matrix_set_row Set a row to matrix.
+ * @param mat Target Matrix
+ * @param rowID Index of row
+ * @param rowArray Row data.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
+                                          uint64_t rowID,
+                                          paddle_real* rowArray);
+
+/**
+ * @brief paddle_matrix_get_row Get raw row buffer from matrix
+ * @param [in] mat Target matrix
+ * @param [in] rowID Index of row.
+ * @param [out] rawRowBuffer Row Buffer
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
+                                          uint64_t rowID,
+                                          paddle_real** rawRowBuffer);
+
+/**
+ * @brief paddle_matrix_create_none Create a none matrix
+ * @return paddle_matrix
+ */
+PD_API paddle_matrix paddle_matrix_create_none();
+
+/**
+ * @brief paddle_matrix_get_shape Get the shape of matrix
+ * @param mat target matrix
+ * @param height The height of matrix
+ * @param width The width of matrix
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_get_shape(paddle_matrix mat,
+                                            uint64_t* height,
+                                            uint64_t* width);
+
+/**
+ * @brief paddle_matrix_sparse_copy_from Copy from a CSR format matrix
+ * @param [out] mat output matrix
+ * @param [in] rowArray row array. The array slices in column array.
+ * @param [in] rowSize length of row array.
+ * @param [in] colArray the column array. It means the non-zero element indices
+ * in each row.
+ * @param [in] colSize length of column array.
+ * @param [in] valueArray the value array. It means the non-zero element values.
+ * NULL if the matrix is binary.
+ * @param [in] valueSize length of value array. Zero if the matrix is binary.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_matrix_sparse_copy_from(paddle_matrix mat,
+                                                   int* rowArray,
+                                                   uint64_t rowSize,
+                                                   int* colArray,
+                                                   uint64_t colSize,
+                                                   float* valueArray,
+                                                   uint64_t valueSize);
+
+#ifdef __cplusplus
+}
+#endif
+#endif
diff --git a/paddle/capi/tests/.gitignore b/paddle/capi/tests/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..7ab6be95e397fa8f0339294a00c2f057bc116792
--- /dev/null
+++ b/paddle/capi/tests/.gitignore
@@ -0,0 +1,2 @@
+w
+b
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d73f6b7733950bd472a46afb21694aac943fc909
--- /dev/null
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_unittest(capi_test_mats test_Vector.cpp
+  test_Matrix.cpp test_Arguments.cpp)
+
+target_include_directories(capi_test_mats PUBLIC ${PADDLE_CAPI_INC_PATH})
+target_link_libraries(capi_test_mats paddle_capi)
+
+# Run from the tests directory: the test loads ./test_predict_network.py.
+add_unittest_without_exec(capi_test_gradientMachine test_GradientMachine.cpp)
+target_include_directories(capi_test_gradientMachine PUBLIC
+  ${PADDLE_CAPI_INC_PATH})
+target_link_libraries(capi_test_gradientMachine paddle_capi)
+add_test(NAME capi_test_gradientMachine
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+  WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests)
diff --git a/paddle/capi/tests/test_Arguments.cpp b/paddle/capi/tests/test_Arguments.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4792ceb49a7816f47ebf9b653d7f34e08f4a85bf
--- /dev/null
+++ b/paddle/capi/tests/test_Arguments.cpp
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <functional>
+#include "capi.h"
+#include "gtest/gtest.h"
+#include "paddle/utils/ThreadLocal.h"
+
+// Returns bufSize samples drawn uniformly from [-1, 1).
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& engine = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> uniform(-1.0, 1.0);
+  std::vector<paddle_real> samples(bufSize);
+  for (auto& sample : samples) {
+    sample = uniform(engine);
+  }
+  return samples;
+}
+
+TEST(CAPIArguments, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_arguments args = paddle_arguments_create_none();
+  uint64_t size;  // written by paddle_arguments_get_size below
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(args, &size));
+  ASSERT_EQ(0UL, size);  // freshly created arguments hold no slots
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, value) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  paddle_matrix mat = paddle_matrix_create(128, 64, false);
+  for (size_t i = 0; i < 128; ++i) {
+    std::vector<paddle_real> sampleBuf = randomBuffer(64);
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(mat, i, sampleBuf.data()));
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(args, 0, mat));
+
+  // Read slot 0 back through a "none" handle.
+  paddle_matrix val = paddle_matrix_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(args, 0, val));
+
+  for (size_t i = 0; i < 128; ++i) {
+    paddle_real* row1;
+    paddle_real* row2;
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, i, &row1));
+    ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(val, i, &row2));
+    // Pointer equality: every row of val must be the very row stored in mat.
+    ASSERT_EQ(row1, row2);
+  }
+
+  paddle_ivector ivec = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, ids) {
+  // Store an id vector in slot 0, then read it back through a none handle.
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  int ids[3] = {1, 2, 3};
+  paddle_ivector src = paddle_ivector_create(ids, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_ids(args, 0, src));
+
+  paddle_ivector fetched = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_ids(args, 0, fetched));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(src));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(fetched));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+// Round-trips {1, 2, 3} through setter/getter and verifies the result.
+template <typename T1, typename T2>
+void testSequenceHelper(T1 setter, T2 getter) {
+  paddle_arguments args = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(args, 1));
+
+  int array[3] = {1, 2, 3};
+  paddle_ivector ivec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, setter(args, 0, ivec));
+  paddle_ivector val = paddle_ivector_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, getter(args, 0, val));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(val, &size));
+  // Pin the length: otherwise the loop is vacuous or overruns `array`.
+  ASSERT_EQ(3UL, size);
+  int* rawBuf;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get(val, &rawBuf));
+  for (size_t i = 0; i < size; ++i) {
+    ASSERT_EQ(array[i], rawBuf[i]);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(ivec));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(val));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(args));
+}
+
+TEST(CAPIArguments, Sequence) {
+  using namespace std::placeholders;  // _1 args, _2 slot index, _3 ivector
+  // Nested level 0 tests sequences, level 1 tests sub-sequences.
+  for (uint32_t nestedLevel = 0; nestedLevel < 2; ++nestedLevel) {
+    auto setter = std::bind(paddle_arguments_set_sequence_start_pos,
+                            _1,
+                            _2,
+                            nestedLevel,
+                            _3);
+    auto getter = std::bind(paddle_arguments_get_sequence_start_pos,
+                            _1,
+                            _2,
+                            nestedLevel,
+                            _3);
+    testSequenceHelper(setter, getter);
+  }
+}
diff --git a/paddle/capi/tests/test_GradientMachine.cpp b/paddle/capi/tests/test_GradientMachine.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..89aa64608dd79ea8a8f5add724d9ea79e5abff16
--- /dev/null
+++ b/paddle/capi/tests/test_GradientMachine.cpp
@@ -0,0 +1,117 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/trainer/TrainerConfigHelper.h>
+#include <stdlib.h>
+#include <string.h>
+#include <type_traits>
+#include "capi.h"
+#include "paddle/utils/ThreadLocal.h"
+
+// Returns bufSize samples drawn uniformly from [-1, 1).
+static std::vector<paddle_real> randomBuffer(size_t bufSize) {
+  auto& engine = paddle::ThreadLocalRandomEngine::get();
+  std::uniform_real_distribution<paddle_real> uniform(-1.0, 1.0);
+  std::vector<paddle_real> samples(bufSize);
+  for (auto& sample : samples) {
+    sample = uniform(engine);
+  }
+  return samples;
+}
+
+TEST(GradientMachine, testPredict) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle::TrainerConfigHelper config("./test_predict_network.py");
+  std::string buffer;
+  ASSERT_TRUE(config.getModelConfig().SerializeToString(&buffer));
+  paddle_gradient_machine machine;
+  // C-API machine built from the serialized ModelConfig.
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_for_inference(
+                &machine, &buffer[0], (int)buffer.size()));
+  std::unique_ptr<paddle::GradientMachine> gm(
+      paddle::GradientMachine::create(config.getModelConfig()));
+  ASSERT_NE(nullptr, gm);
+  gm->randParameters();
+  gm->saveParameters("./");
+  // Load into the C-API machine the parameters gm just saved to "./".
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_load_parameter_from_disk(machine, "./"));
+  // Swap in a parameter-sharing slave so the forward pass below runs on it.
+  paddle_gradient_machine machineSlave;
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_create_shared_param(
+                machine, &buffer[0], (int)buffer.size(), &machineSlave));
+  std::swap(machineSlave, machine);
+  paddle_arguments outArgs = paddle_arguments_create_none();
+
+  paddle_arguments inArgs = paddle_arguments_create_none();
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_resize(inArgs, 1));
+  paddle_matrix mat = paddle_matrix_create(1, 100, false);
+  static_assert(std::is_same<paddle_real, paddle::real>::value, "");
+  // Fill the single 100-wide input row with random data.
+  auto data = randomBuffer(100);
+  paddle_real* rowPtr;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  memcpy(rowPtr, data.data(), data.size() * sizeof(paddle_real));
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_set_value(inArgs, 0, mat));
+  ASSERT_EQ(kPD_NO_ERROR,
+            paddle_gradient_machine_forward(machine, inArgs, outArgs, false));
+
+  uint64_t sz;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_size(outArgs, &sz));
+  ASSERT_EQ(1UL, sz);
+  // Fetch the C-API output into mat, then run gm as the reference.
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_get_value(outArgs, 0, mat));
+  std::vector<paddle::Argument> paddleInArgs;
+  std::vector<paddle::Argument> paddleOutArgs;
+  paddleInArgs.resize(1);
+  paddleInArgs[0].value =
+      paddle::Matrix::create(data.data(), 1, 100, false, false);
+
+  gm->forward(paddleInArgs, &paddleOutArgs, paddle::PASS_TEST);
+
+  auto matPaddle = paddleOutArgs[0].value;
+
+  uint64_t height, width;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(mat, &height, &width));
+  ASSERT_EQ(matPaddle->getHeight(), height);
+  ASSERT_EQ(matPaddle->getWidth(), width);
+  // Both machines must agree on every output value (within 1e-5).
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(mat, 0, &rowPtr));
+  for (size_t i = 0; i < width; ++i) {
+    ASSERT_NEAR(matPaddle->getData()[i], rowPtr[i], 1e-5);
+  }
+
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(inArgs));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_arguments_destroy(outArgs));
+  std::swap(machineSlave, machine);  // undo the earlier swap
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machineSlave));
+  ASSERT_EQ(kPD_NO_ERROR, paddle_gradient_machine_destroy(machine));
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  // Force CPU mode. The flag lives in a mutable local buffer, so nothing has
+  // to be strdup'ed and freed around the call.
+  char useCpuFlag[] = "--use_gpu=false";
+  char* paddleArgv[] = {useCpuFlag};
+  // Do not run any test against a runtime that failed to initialize.
+  if (paddle_init(1, paddleArgv) != kPD_NO_ERROR) return EXIT_FAILURE;
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/capi/tests/test_Matrix.cpp b/paddle/capi/tests/test_Matrix.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4bf9a9d6a9f9161561e9e5612edd2c93cab7ac5b
--- /dev/null
+++ b/paddle/capi/tests/test_Matrix.cpp
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "gtest/gtest.h"
+
+TEST(CAPIMatrix, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  paddle_matrix matrix = paddle_matrix_create(128, 32, false);
+  std::vector<paddle_real> expected(32);
+  for (size_t col = 0; col < expected.size(); ++col) {
+    expected[col] = 1.0 / (col + 1.0);
+  }
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_set_row(matrix, 0, expected.data()));
+  // Valid row indices are 0..127, so row 128 has to be rejected.
+  ASSERT_EQ(kPD_OUT_OF_RANGE,
+            paddle_matrix_set_row(matrix, 128, expected.data()));
+
+  // Row 0 must read back what was written (within 1e-5).
+  paddle_real* stored;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_row(matrix, 0, &stored));
+  for (size_t col = 0; col < expected.size(); ++col) {
+    ASSERT_NEAR(expected[col], stored[col], 1e-5);
+  }
+
+  uint64_t rows, cols;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_get_shape(matrix, &rows, &cols));
+  ASSERT_EQ(128UL, rows);
+  ASSERT_EQ(32UL, cols);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(matrix));
+}
+
+TEST(CAPIMatrix, createNone) {
+  paddle_matrix mat = paddle_matrix_create_none();  // created without a shape
+  ASSERT_EQ(kPD_NO_ERROR, paddle_matrix_destroy(mat));
+}
diff --git a/paddle/capi/tests/test_Vector.cpp b/paddle/capi/tests/test_Vector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..365160dc9a08e6b6fc07fb685d5149d1e078da9b
--- /dev/null
+++ b/paddle/capi/tests/test_Vector.cpp
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "capi.h"
+#include "gtest/gtest.h"
+
+TEST(CAPIVector, create) {
+  //! TODO(yuyang18): Test GPU Code.
+  int array[3] = {1, 2, 3};
+  paddle_ivector vec = paddle_ivector_create(array, 3, true, false);
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_resize(vec, 1000));
+  uint64_t size;
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_get_size(vec, &size));
+  ASSERT_EQ(1000UL, size);  // the resize must be visible through get_size
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
+}
+
+TEST(CAPIVector, createNone) {
+  paddle_ivector vec = paddle_ivector_create_none();  // stores nothing
+  ASSERT_EQ(kPD_NO_ERROR, paddle_ivector_destroy(vec));
+}
diff --git a/paddle/capi/tests/test_predict_network.py b/paddle/capi/tests/test_predict_network.py
new file mode 100644
index 0000000000000000000000000000000000000000..82ef5cb1a70398df65ace3c802076743c3ebe341
--- /dev/null
+++ b/paddle/capi/tests/test_predict_network.py
@@ -0,0 +1,13 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=100)  # config loaded by test_GradientMachine.cpp
+
+x = data_layer(name='x', size=100)  # 100-wide input layer
+
+y = fc_layer(
+    input=x,
+    size=100,  # 100-wide fully connected output
+    bias_attr=ParamAttr(name='b'),
+    param_attr=ParamAttr(name='w'))
+
+outputs(y)
diff --git a/paddle/capi/vector.h b/paddle/capi/vector.h
new file mode 100644
index 0000000000000000000000000000000000000000..a92aeff16425779bf63a7ffd7217709b6bf3cd05
--- /dev/null
+++ b/paddle/capi/vector.h
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifndef __PADDLE_CAPI_VECTOR_H__
+#define __PADDLE_CAPI_VECTOR_H__
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "config.h"
+#include "error.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * Int Vector Functions. Return will be a paddle_error type.
+ */
+typedef void* paddle_ivector;
+
+/**
+ * @brief Create a none int vector. It is just a handle and stores nothing.
+ *        Used to get output from other APIs.
+ * @return None int vector.
+ */
+PD_API paddle_ivector paddle_ivector_create_none();
+
+/**
+ * @brief paddle_ivector_create create a paddle int vector
+ * @param array: input array.
+ * @param size: input array size.
+ * @param copy: memory copy or just use same memory. True if copy.
+ * @param useGPU: True if use GPU
+ * @return paddle_ivector
+ */
+PD_API paddle_ivector paddle_ivector_create(int* array,
+                                            uint64_t size,
+                                            bool copy,
+                                            bool useGPU);
+
+/**
+ * @brief paddle_ivector_destroy destroy an int vector.
+ * @param ivec vector to be destroyed.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_destroy(paddle_ivector ivec);
+
+/**
+ * @brief paddle_ivector_get get raw buffer stored inside this int vector. It
+ * could be GPU memory if this int vector is stored in GPU.
+ * @param [in] ivec int vector
+ * @param [out] buffer the return buffer pointer.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_get(paddle_ivector ivec, int** buffer);
+
+/**
+ * @brief paddle_ivector_resize resize the int vector.
+ * @param [in] ivec: int vector
+ * @param [in] size: size to change
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_resize(paddle_ivector ivec, uint64_t size);
+
+/**
+ * @brief paddle_ivector_get_size get the size of int vector.
+ * @param [in] ivec: int vector
+ * @param [out] size: return size of this int vector.
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_ivector_get_size(paddle_ivector ivec,
+                                            uint64_t* size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index 57fb89608f4bcf3e6829fe850a61c2a626adfbdc..f9061e96deb659dcf7bfb88b46e6509af0425199 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,23 +15,19 @@ else()
 endif()
 
 set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cudart_wrap.cc
     src/hl_cuda_cublas.cc
     src/hl_cuda_cudnn.cc
     src/hl_cuda_device.cc)
 
 if(WITH_GPU)
     set(CUDA_CXX_SOURCES
-        src/hl_dso_loader.cc
         src/hl_warpctc_wrap.cc
         ${CUDA_CXX_WITH_GPU_SOURCES})
 
     set_source_files_properties(${CUDA_CXX_SOURCES}
                                 PROPERTIES COMPILE_FLAGS "-D__NVCC__")
 else()
-    set(CUDA_CXX_SOURCES
-        src/hl_dso_loader.cc
-        src/hl_warpctc_wrap.cc)
+    set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc)
 endif()
 
 set(CUDA_CU_SOURCES
@@ -48,7 +44,6 @@ set(CUDA_CU_SOURCES
 
 set(CUDA_HEADERS
     include/hl_time.h
-    include/hl_dso_loader.h
     include/hl_warpctc_wrap.h
     include/hl_sequence.h
     include/hl_cuda_cublas.h
diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h
index cdb2dba06cb4123da4be2088e290c6a740e0375b..93957fd9644652c103d15873b732d0b9fa89330f 100644
--- a/paddle/cuda/include/hl_activation_functions.h
+++ b/paddle/cuda/include/hl_activation_functions.h
@@ -40,18 +40,18 @@ public:
 namespace gpu {
 static __device__ Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static __device__ Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}
+}  // namespace gpu
 #else
 namespace cpu {
 static Active<real>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<real>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}
+}  // namespace cpu
 
 #ifdef __AVX__
 namespace avx {
 static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION;
 static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION;
-}
+}  // namespace avx
 #endif
 #endif
 
diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h
index c5787630abbe105af64888692b1106bd21f4c1e8..f55197c8c9ebb4a0f67ab915abfefd6a45cd13aa 100644
--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -273,23 +273,23 @@ extern void hl_bilinear_forward(const real* inData,
                                 const real ratioW);
 
 /**
-* @brief   Bilinear interpolation backward.
-*
-* @param[out]  inGrad      input gradient.
-* @param[in]   inImgH      input image height.
-* @param[in]   inImgW      input image width.
-* @param[in]   inputH      input batchSize.
-* @param[in]   inputW      input image data dim.
-* @param[in]   outGrad     output gradient.
-* @param[in]   outImgH     output image height.
-* @param[in]   outImgW     output image width.
-* @param[in]   outputH     output batchSize.
-* @param[in]   outputW     output image data dim.
-* @param[in]   numChannels number of channels.
-* @param[in]   ratioH      inImgH / outImgH.
-* @param[in]   ratioW      inImgW / outImgW.
-*
-*/
+ * @brief   Bilinear interpolation backward.
+ *
+ * @param[out]  inGrad      input gradient.
+ * @param[in]   inImgH      input image height.
+ * @param[in]   inImgW      input image width.
+ * @param[in]   inputH      input batchSize.
+ * @param[in]   inputW      input image data dim.
+ * @param[in]   outGrad     output gradient.
+ * @param[in]   outImgH     output image height.
+ * @param[in]   outImgW     output image width.
+ * @param[in]   outputH     output batchSize.
+ * @param[in]   outputW     output image data dim.
+ * @param[in]   numChannels number of channels.
+ * @param[in]   ratioH      inImgH / outImgH.
+ * @param[in]   ratioW      inImgW / outImgW.
+ *
+ */
 extern void hl_bilinear_backward(real* inGrad,
                                  const size_t inImgH,
                                  const size_t inImgW,
diff --git a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
index f35bfbc5c8253d632f8089f5037421f527633aad..9c49a4bd2083794e98b099b25944bedec3d5a2ff 100644
--- a/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_cpu_matrix_kernel.cuh
@@ -17,7 +17,11 @@ limitations under the License. */
 
 #include <stdio.h>
 #include "hl_base.h"
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "hl_neon_matrix_kernel.cuh"
+#else
 #include "hl_sse_matrix_kernel.cuh"
+#endif
 
 /**
  * @brief   cpu element wise unary operator.
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index abd5eb3a0cf338c689680dd0f7192be7b2530383..eb454c59c1e58cf2b4817b4cb3230b9d75e320ac 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d,
                                         const int* index,
                                         int numSequence);
 
-/**
- * @brief   Matrix classification error.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input vector (M x 1).
- * @param[out]  C_d     output vector (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_classification_error(
-    real* A_d, int* B_d, real* C_d, int dimM, int dimN);
-
 /**
  * @brief   Matrix cross entropy.
  *
@@ -188,48 +175,6 @@ extern void hl_param_relu_backward_diff(real* grad_o,
                                         int width,
                                         int height,
                                         int partial_sum);
-/**
- * @brief cos sim forward
- *
- * @param[out]    output         output data
- * @param[in]     input1         input1 data(matrix)
- * @param[in]     input2         input2 data(matrix or vector)
- * @param[in]     width          matrix width
- * @param[in]     input1_height  input1_height
- * @param[in]     input2_height  input2_height
- * @param[in]     scale          scale factor
- */
-extern void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale);
-/**
- * @brief cos sim derivate
- *
- * @param[in]     grad             output grad
- * @param[in]     output           output data
- * @param[in]     prevOutX         input1 data
- * @param[in]     prevOutY         input2 data
- * @param[out]    prevGradX        input1 grad
- * @param[out]    prevGradY        input2 grad
- * @param[in]     width            matrix width
- * @param[in]     input1_height    input1 height
- * @param[in]     input2_height    input2 height
- * @param[in]     scale            scale factor
- */
-extern void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale);
 
 /**
  * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
@@ -267,4 +212,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
                                           const int dimN,
                                           real scale);
 
+/**
+ * @brief  Rotate a matrix by 90 degrees.
+ *
+ * @param[in]   mat       input matrix (M x N).
+ * @param[out]  matRot    output matrix (N x M).
+ * @param[in]   dimM      input matrix height.
+ * @param[in]   dimN      input matrix width.
+ * @param[in]   clockWise rotation direction
+ */
+extern void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
+
 #endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/hl_matrix_base.cuh b/paddle/cuda/include/hl_matrix_base.cuh
index db35ee2037433163ebb3673edb350e3fab71fba9..8b755c1095c2c4fdb7e74d8cddc948e6a6af380b 100644
--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -66,6 +66,8 @@ typedef BaseOp SSESquaredDiff;
 typedef BaseOp SSEFirst;
 typedef BaseOp SSESecond;
 typedef BaseOp SSEClassificationError;
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include "hl_matrix_base_neon.cuh"  // NEON versions of the SSE* operator classes
 #else
 #include "hl_matrix_base_sse.cuh"
 #endif
diff --git a/paddle/cuda/include/hl_matrix_base_neon.cuh b/paddle/cuda/include/hl_matrix_base_neon.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e13019f5ee24ad600005c99678426ee3808b0e54
--- /dev/null
+++ b/paddle/cuda/include/hl_matrix_base_neon.cuh
@@ -0,0 +1,161 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#ifndef HL_MATRIX_BASE_NEON_CUH_
+#define HL_MATRIX_BASE_NEON_CUH_
+
+namespace aggregate {
+class SSESum {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return vaddq_f32(a, b);
+  }
+};
+
+class SSEMax {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return vmaxq_f32(a, b);
+  }
+};
+
+class SSEMin {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return vminq_f32(a, b);
+  }
+};
+}  // namespace aggregate
+
+namespace base {
+namespace unary {
+class SSEIdentity {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a) const {
+    return a;
+  }
+};
+}  // namespace unary
+
+namespace binary {
+class SSEAdd {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return vaddq_f32(a, b);
+  }
+};
+
+class SSEAdd2 {
+public:
+  static const bool sse = true;
+  const real p1;
+  const real p2;
+  float32x4_t mp1;
+  float32x4_t mp2;
+
+public:
+  SSEAdd2(const real s1, const real s2) : p1(s1), p2(s2) {
+    mp1 = vdupq_n_f32(p1);
+    mp2 = vdupq_n_f32(p2);
+  }
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    float32x4_t tmp1, tmp2;
+    tmp1 = vmulq_f32(mp1, a);
+    tmp2 = vmulq_f32(mp2, b);
+    return vaddq_f32(tmp1, tmp2);
+  }
+};
+
+class SSESub {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return vsubq_f32(a, b);
+  }
+};
+
+class SSEMul {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return vmulq_f32(a, b);
+  }
+};
+
+class SSEDiv {  // element-wise a / b, computed as a * (refined reciprocal of b)
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    float32x4_t r = vrecpeq_f32(b);       // coarse hardware estimate of 1 / b
+    r = vmulq_f32(vrecpsq_f32(b, r), r);  // Newton-Raphson refinement, pass 1
+    return vmulq_f32(a, vmulq_f32(vrecpsq_f32(b, r), r));  // pass 2, then a * (1 / b)
+  }
+};
+
+class SSESquaredDiff {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    float32x4_t tmp;
+    tmp = vsubq_f32(a, b);
+    return vmulq_f32(tmp, tmp);
+  }
+};
+
+class SSEFirst {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return a;
+  }
+};
+
+class SSESecond {
+public:
+  static const bool sse = true;
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    return b;
+  }
+};
+
+class SSEClassificationError {  // per lane: 1.0f when exactly one of a, b is > p
+public:
+  static const bool sse = true;
+  const real p;       // scalar threshold
+  float32x4_t mp;     // p broadcast to all four lanes
+  uint32x4_t result;  // integer 1 in every lane; reduces a lane mask to 0 or 1
+
+public:
+  explicit SSEClassificationError(const real s) : p(s) {
+    mp = vdupq_n_f32(p);
+    result = vdupq_n_u32(1);
+  }
+  // TODO: the correctness of this NEON implementation still needs to be checked.
+  INLINE float32x4_t vecOp(const float32x4_t a, const float32x4_t b) const {
+    uint32x4_t tmp1 = vcgtq_f32(a, mp);       // all-ones lane mask where a > p
+    uint32x4_t tmp2 = vcgtq_f32(b, mp);       // all-ones lane mask where b > p
+    uint32x4_t tmp3 = veorq_u32(tmp1, tmp2);  // set where the two comparisons differ
+    return vcvtq_f32_u32(vandq_u32(tmp3, result));  // mask -> integer 0/1 -> 0.0f/1.0f
+  }
+};
+}  // namespace binary
+}  // namespace base
+
+#endif /* HL_MATRIX_BASE_NEON_CUH_ */
diff --git a/paddle/cuda/include/hl_matrix_type.cuh b/paddle/cuda/include/hl_matrix_type.cuh
index 59213eee75f50d3c054ed8684a9a0e1053342a0a..f965ba966793f6f6eea0ad3606f60553fe904dda 100644
--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -17,13 +17,20 @@ limitations under the License. */
 
 #include "hl_base.h"
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__)
 #include <vector_types.h>
 #ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
 #endif
+#elif (defined  __ARM_NEON) || (defined __ARM_NEON__)
+#include <arm_neon.h>
+#ifndef PADDLE_TYPE_DOUBLE
+typedef float32x4_t  vecType;
+#else
+#error NEON instructions does not support double precision
+#endif
 #else
 #include <mmintrin.h>
 #include <xmmintrin.h>
diff --git a/paddle/cuda/include/hl_neon_matrix_kernel.cuh b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..7b4e5b00079b66d0a46a1344a43f41962cf50f10
--- /dev/null
+++ b/paddle/cuda/include/hl_neon_matrix_kernel.cuh
@@ -0,0 +1,299 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+
+#ifndef HL_NEON_MATRIX_KERNEL_CUH_
+#define HL_NEON_MATRIX_KERNEL_CUH_
+
+#include "hl_matrix_type.cuh"
+
+#define VECTOR_SIZE     16
+
+/* number of float in vector */
+#define     VECTOR_LEN      4
+#define     VECTOR_SET      vdupq_n_f32
+
+inline bool hl_check_align(size_t size) {
+  return !(size & (VECTOR_SIZE - 1));
+}
+
+inline bool hl_check_align(void *ptr) {
+  return hl_check_align(reinterpret_cast<size_t>(ptr));
+}
+
+template <class Agg>
+inline real hl_agg_op(Agg agg, vecType mm) {
+  // Horizontal reduction: fold the four lanes [m0 m1 m2 m3] of mm into one scalar.
+  float32x4_t rev = vrev64q_f32(mm);            // [m1 m0 m3 m2]
+  float32x4_t pairs = agg.vecOp(mm, rev);       // lanes 0-1: m0 op m1; lanes 2-3: m2 op m3
+  float32x2_t hi = vget_high_f32(pairs);
+  float32x2_t lo = vget_low_f32(pairs);
+  float32x4_t swapped = vcombine_f32(hi, lo);   // pairs with its 64-bit halves exchanged
+  float32x4_t ret = agg.vecOp(pairs, swapped);  // every lane now covers all four inputs
+  return vgetq_lane_f32(ret, 0);
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
+                          int dimM, int dimN,
+                          real *dst, int ld,
+                          real *A, int lda) {
+  for (int i = 0; i < dimM; i++, A += lda) {  // one pass per row; A steps by lda
+    vecType mm = VECTOR_SET(agg.init());  // accumulator seeded with agg.init()
+    vecType *a = (vecType*)(A);  // row viewed as VECTOR_LEN-wide vectors
+    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++) {
+      mm = agg.vecOp(mm, op.vecOp(*a));  // apply op, then fold into accumulator
+    }
+
+    int rem = dimN % VECTOR_LEN;  // trailing elements that do not fill a vector
+    if (rem) {
+      real tmp = hl_agg_op(agg, mm);  // collapse the vector accumulator to a scalar
+      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;  // first unprocessed element
+      for (int j = 0; j < rem; j++) {
+        tmp = agg(tmp, op(a[j]));  // scalar forms of op and agg for the tail
+      }
+      dst[i*ld] = sv(dst[i*ld], tmp);  // sv merges old dst value with the result
+    } else {
+      dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
+    }
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_row_op(Agg agg, Op op, Saver sv,
+                          int dimM, int dimN,
+                          real *dst, int ld,
+                          real *A, int lda,
+                          real *B, int ldb) {
+  for (int i = 0; i < dimM; i++, A += lda, B += ldb) {
+    vecType mm = VECTOR_SET(agg.init());
+    vecType *a = (vecType*)(A);
+    vecType *b = (vecType*)(B);
+    for (int j = 0; j < dimN / VECTOR_LEN; j++, a++, b++) {
+        mm = agg.vecOp(mm, op.vecOp(*a, *b));
+    }
+
+    int rem = dimN % VECTOR_LEN;
+    if (rem) {
+      real tmp = hl_agg_op(agg, mm);
+      real *a = A + (dimN / VECTOR_LEN) * VECTOR_LEN;
+      real *b = B + (dimN / VECTOR_LEN) * VECTOR_LEN;
+      for (int j = 0; j < rem; j++) {
+          tmp = agg(tmp, op(a[j], b[j]));
+      }
+      dst[i*ld] = sv(dst[i*ld], tmp);
+    } else {
+        dst[i*ld] = sv(dst[i*ld], hl_agg_op(agg, mm));
+    }
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_matrix_column_op(Agg agg, Op op, Saver sv,
+                         int dimM, int dimN,
+                         real *dst,
+                         real *A, int lda,
+                         real *B, int ldb) {
+  for (int j = 0; j < dimN; j++) {
+    real tmp = agg.init();
+    for (int i = 0; i < dimM; i++) {
+        tmp = agg(tmp, op(A[i * lda + j], B[i * ldb + j]));
+    }
+    dst[j] = sv(dst[j], tmp);
+  }
+}
+
+/*
+ * Column-wise aggregation over dimN columns; MaxRow must be >= dimN.
+ * Full groups of VECTOR_LEN columns are accumulated in vector registers;
+ * the last dimN % VECTOR_LEN columns are passed to hl_matrix_column_op.
+ */
+template <int MaxRow, class Agg, class Op, class Saver>
+void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
+                               int dimM, int dimN,
+                               real *dst,
+                               real *A, int lda) {
+  vecType mm[MaxRow / VECTOR_LEN];  // one accumulator per VECTOR_LEN columns
+  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
+    mm[n] = VECTOR_SET(agg.init());
+  }
+
+  for (int i = 0; i < dimM; i++) {  // walk the rows, folding each into mm[]
+    vecType *a = (vecType*)(A + i * lda);
+    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
+    }
+  }
+
+  vecType *result = (vecType*)(dst);  // dst viewed as VECTOR_LEN-wide vectors
+  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+    result[n] = sv.vecOp(result[n], mm[n]);  // sv merges old dst with the result
+  }
+
+  int rem = dimN % VECTOR_LEN;  // columns left over after the vector groups
+  if (rem) {
+    A += (dimN / VECTOR_LEN) * VECTOR_LEN;  // skip the columns already handled
+    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda);
+  }
+}
+
+/*
+ * dimN is multiples of VECTOR_LEN
+ * dimN greater than Step
+ */
+template <int Step, class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda) {
+  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step) {
+    vecType mm[Step / VECTOR_LEN];
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      mm[n] = VECTOR_SET(agg.init());
+    }
+
+    for (int i = 0; i < dimM; i++) {
+      vecType *a = (vecType*)(A + i * lda);
+      for (int n = 0; n < Step / VECTOR_LEN; n++) {
+        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n]));
+      }
+    }
+
+    vecType *result = (vecType*)(dst);
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      result[n] = sv.vecOp(result[n], mm[n]);
+    }
+  }
+
+  int remRow = dimN % Step;
+  if (remRow) {
+    hl_sse_column_op_with_rem<Step>(agg, op, sv, dimM, remRow, dst, A, lda);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda) {
+  if (dimN <= 16) {
+    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda);
+  } else if (dimN <= 32) {
+    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda);
+  } else if (dimN <= 1024 || dimM <= 512) {
+    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda);
+  } else {
+    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda);
+  }
+}
+
+template <int MaxRow, class Agg, class Op, class Saver>
+void hl_sse_column_op_with_rem(Agg agg, Op op, Saver sv,
+                               int dimM, int dimN,
+                               real *dst,
+                               real *A, int lda,
+                               real *B, int ldb) {
+  vecType mm[MaxRow / VECTOR_LEN];
+  for (int n = 0; n < MaxRow / VECTOR_LEN; n++) {
+    mm[n] = VECTOR_SET(agg.init());
+  }
+
+  for (int i = 0; i < dimM; i++) {
+    vecType *a = (vecType*)(A + i * lda);
+    vecType *b = (vecType*)(B + i * ldb);
+    for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+      mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
+    }
+  }
+
+  vecType *result = (vecType*)(dst);
+  for (int n = 0; n < dimN / VECTOR_LEN; n++) {
+    result[n] = sv.vecOp(result[n], mm[n]);
+  }
+
+  int rem = dimN % VECTOR_LEN;
+  if (rem) {
+    A += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    B += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    dst += (dimN / VECTOR_LEN) * VECTOR_LEN;
+    hl_matrix_column_op(agg, op, sv, dimM, rem, dst, A, lda, B, ldb);
+  }
+}
+
+template <int Step, class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda,
+                             real *B, int ldb) {
+  for (int j = 0; j < dimN / Step; j++, dst += Step, A += Step, B += Step) {
+    vecType mm[Step / VECTOR_LEN];
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      mm[n] = VECTOR_SET(agg.init());
+    }
+
+    for (int i = 0; i < dimM; i++) {
+      vecType *a = (vecType*)(A + i * lda);
+      vecType *b = (vecType*)(B + i * ldb);
+      for (int n = 0; n < Step / VECTOR_LEN; n++) {
+        mm[n] = agg.vecOp(mm[n], op.vecOp(a[n], b[n]));
+      }
+    }
+
+    vecType *result = (vecType*)(dst);
+    for (int n = 0; n < Step / VECTOR_LEN; n++) {
+      result[n] = sv.vecOp(result[n], mm[n]);
+    }
+  }
+
+  int remRow = dimN % Step;
+  if (remRow) {
+    hl_sse_column_op_with_rem<Step>(
+        agg, op, sv, dimM, remRow, dst, A, lda, B, ldb);
+  }
+}
+
+template <class Agg, class Op, class Saver>
+void hl_sse_matrix_column_op(Agg agg, Op op, Saver sv,
+                             int dimM, int dimN,
+                             real *dst,
+                             real *A, int lda,
+                             real *B, int ldb) {
+  if (dimN <= 16) {
+    hl_sse_matrix_column_op<16>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else if (dimN <= 32) {
+    hl_sse_matrix_column_op<32>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else if (dimN <= 1024 || dimM <= 512) {
+    hl_sse_matrix_column_op<64>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  } else {
+    hl_sse_matrix_column_op<1024>(agg, op, sv, dimM, dimN, dst, A, lda, B, ldb);
+  }
+}
+
+#endif /* HL_NEON_MATRIX_KERNEL_CUH_ */
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9f9d8f972e3a4c62e5caedcf85054be5681b96c1..973ddcceed99ba4177b3db277e664611d42ac51b 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -159,4 +159,10 @@ extern void hl_sequence_avg_forward(real* dst,
                                     int width,
                                     const int mode);
 
+extern void hl_sequence_avg_backward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode);
 #endif /* HL_SEQUENCE_H_ */
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index 77949ed295a6eaf7cc535853e53bef066ffac37c..79ae0d0e741de06e622454ccd220e2c749d795b3 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal,
                                    int beamSize,
                                    int numSamples);
 
-#endif /* HL_TOP_K_H_ */
+/**
+ * @brief   Matrix classification error.
+ *
+ * @param[out]  topVal         top k element.
+ * @param[in]   ldv            leading dimension of topVal.
+ * @param[out]  topIds         top k index.
+ * @param[in]   src            input value.
+ * @param[in]   lds            leading dimension of src.
+ * @param[in]   dim            width of input value.
+ * @param[in]   topkSize       size of top k element.
+ * @param[in]   numSamples     height of input value.
+ * @param[in]   label          ground truth label.
+ * @param[out]  recResult      top-k classification error.
+ *
+ */
+extern void hl_matrix_classification_error(real* topVal,
+                                           int ldv,
+                                           int* topIds,
+                                           real* src,
+                                           int lds,
+                                           int dim,
+                                           int topkSize,
+                                           int numSamples,
+                                           int* label,
+                                           real* recResult);
+
+#endif  // HL_TOP_K_H_
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 0b669f6735cb9771fd63ed8e3b45602db0db447c..127cb7e27983e8ff2c1ff6ef5108b5f8c5bd6ca5 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d,
 inline void hl_matrix_softmax_derivative(
     real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
 
-inline void hl_matrix_classification_error(
-    real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+inline void hl_matrix_classification_error(real* topVal,
+                                           int ldv,
+                                           int* topIds,
+                                           real* src,
+                                           int lds,
+                                           int dim,
+                                           int topkSize,
+                                           int numSamples,
+                                           int* label,
+                                           real* recResult) {}
 
 inline void hl_matrix_cross_entropy(
     real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
@@ -74,25 +82,6 @@ inline void hl_param_relu_backward_diff(real* grad_o,
                                         int height,
                                         int partial_sum) {}
 
-inline void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale) {}
-
-inline void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale) {}
-
 inline void hl_matrix_add_shared_bias(real* A_d,
                                       real* B_d,
                                       const int channel,
@@ -106,4 +95,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
                                           const int dimM,
                                           const int dimN,
                                           real scale) {}
+
+inline void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
+
 #endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index 05e51bce9e1df6fc6ef1cad891b44a9172da185d..920b417b1c717efaff75f70f1b9d2b574469e425 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -57,4 +57,10 @@ inline void hl_sequence_avg_forward(real* dst,
                                     int width,
                                     const int mode) {}
 
+inline void hl_sequence_avg_backward(real* dst,
+                                     real* src,
+                                     const int* starts,
+                                     int height,
+                                     int width,
+                                     const int mode) {}
 #endif  // HL_SEQUENCE_STUB_H_
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index 182e8ab218cce18448f8a08f5c1a1dab7e38f2b6..6163209e9bc681209712243ba68dec549b7e360a 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -14,10 +14,9 @@ limitations under the License. */
 
 #include "hl_cuda_cublas.h"
 #include <sys/time.h>
-#include <mutex>
 #include "hl_cuda.h"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 6198f067bab2ec790e641e77dce058fe6a52491a..c53a5636829cab9d575f58cc2326cb3efe383e1c 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -15,10 +15,9 @@ limitations under the License. */
 #include "hl_cuda_cudnn.h"
 #include <cudnn.h>
 #include <gflags/gflags.h>
-#include <mutex>
 #include "hl_cuda_cudnn.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Logging.h"
 
 DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index a71eecba2736234dafaf6b67e5efac5358a30871..4042d9742a92f6718406c8923d9129b81afe89e7 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -21,12 +21,10 @@ limitations under the License. */
 #include <sys/syscall.h>
 #include <sys/time.h>
 #include <unistd.h>
-#include <mutex>
-#include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/DynamicLoader.h"
 // clang-format on
 
 namespace dynload {
@@ -77,78 +75,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef CURAND_RAND_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CURAND_WRAP
 
-std::once_flag cudart_dso_flag;
-void *cudart_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)        \
-  __macro(cudaMalloc)                     \
-  __macro(cudaHostAlloc)                  \
-  __macro(cudaFree)                       \
-  __macro(cudaFreeHost)                   \
-  __macro(cudaMemcpy)                     \
-  __macro(cudaMemset)                     \
-  __macro(cudaMemcpyAsync)                \
-  __macro(cudaSetDevice)                  \
-  __macro(cudaGetDevice)                  \
-  __macro(cudaGetDeviceCount)             \
-  __macro(cudaGetDeviceProperties)        \
-  __macro(cudaDeviceSynchronize)          \
-  __macro(cudaDeviceCanAccessPeer)        \
-  __macro(cudaDeviceEnablePeerAccess)     \
-  __macro(cudaStreamCreate)               \
-  __macro(cudaStreamDestroy)              \
-  __macro(cudaStreamSynchronize)          \
-  __macro(cudaStreamWaitEvent)            \
-  __macro(cudaEventCreate)                \
-  __macro(cudaEventRecord)                \
-  __macro(cudaEventQuery)                 \
-  __macro(cudaEventDestroy)               \
-  __macro(cudaEventSynchronize)           \
-  __macro(cudaEventElapsedTime)           \
-  __macro(cudaSetDeviceFlags)             \
-  __macro(cudaGetLastError)               \
-  __macro(cudaFuncSetCacheConfig)         \
-  __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)             \
-  __macro(cudaProfilerStart)              \
-  __macro(cudaProfilerStop)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#undef CUDA_ROUNTINE_EACH
-#undef DYNAMIC_LOAD_CUDART_WRAP
-
 } /* namespace dynload */
 
 /**
@@ -171,11 +97,11 @@ int g_cuda_lib_version = 0;
  * Check build-in cuda function using glog and it **does not**
  * support << operator for more details error info.
  */
-#define CHECK_CUDA(cudaFunc)                                                  \
-  do {                                                                        \
-    cudaError_t cudaStat = cudaFunc;                                          \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                         \
-                                    << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc)                                         \
+  do {                                                               \
+    cudaError_t cudaStat = cudaFunc;                                 \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
+                                    << cudaGetErrorString(cudaStat); \
   } while (0)
 
 /**
@@ -284,13 +210,13 @@ void hl_fini() {
       tmp_stream = (char *)t_device[dev]->stream;
     }
     for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
+      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
     }
 
     /* free device memory */
     hl_free_mem_device(t_device[dev]->gpu_mem);
     hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event));
+    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
   }
 
   free(tmp);
@@ -308,7 +234,7 @@ void hl_set_device(int device) {
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
       << "Device: " << device << " is not specified in startup.";
 
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
 
   /* switch thread stream */
   for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
@@ -336,7 +262,7 @@ void hl_set_device(int device) {
 
 int hl_get_device() {
   int device;
-  CHECK_CUDA(dynload::cudaGetDevice(&device));
+  CHECK_CUDA(cudaGetDevice(&device));
   return device;
 }
 
@@ -344,7 +270,7 @@ void *hl_malloc_device(size_t size) {
   void *dest_d;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
+  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
 
   return dest_d;
 }
@@ -352,7 +278,7 @@ void *hl_malloc_device(size_t size) {
 void hl_free_mem_device(void *dest_d) {
   CHECK_NOTNULL(dest_d);
 
-  cudaError_t err = dynload::cudaFree(dest_d);
+  cudaError_t err = cudaFree(dest_d);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -361,8 +287,7 @@ void *hl_malloc_host(size_t size) {
   void *dest_h;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(
-      dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -370,7 +295,7 @@ void *hl_malloc_host(size_t size) {
 void hl_free_mem_host(void *dest_h) {
   CHECK_NOTNULL(dest_h);
 
-  cudaError_t err = dynload::cudaFreeHost(dest_h);
+  cudaError_t err = cudaFreeHost(dest_h);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -381,11 +306,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
   }
   CHECK_NOTNULL(dst);
   CHECK_NOTNULL(src);
-  CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 }
 
 void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(dynload::cudaMemset(dest_d, value, size));
+  CHECK_CUDA(cudaMemset(dest_d, value, size));
 }
 
 void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
@@ -394,7 +319,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
   }
   CHECK_NOTNULL(src_h);
   CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
 }
 
 void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -403,7 +328,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_h);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
 }
 
 void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -412,8 +337,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_d);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(
-      dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
 }
 
 void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -427,8 +351,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
   CHECK_LT(stream, HPPL_STREAM_END);
   cu_stream = t_resource.stream[stream];
 
-  CHECK_CUDA(
-      dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
+  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
 }
 
 void hl_start() {
@@ -439,8 +362,7 @@ void hl_start() {
 
 bool hl_device_can_access_peer(int device, int peerDevice) {
   int canAccessPeer;
-  CHECK_CUDA(
-      dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
+  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
 
   if (canAccessPeer == 1) {
     return true;
@@ -450,9 +372,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
 }
 
 void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0);
+  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
   if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    dynload::cudaGetLastError();
+    cudaGetLastError();
   } else {
     CHECK_CUDA(err);
   }
@@ -463,9 +385,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
   int device = device_prop->device;
   global_device_resources device_res = device_prop->device_resources;
 
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
   /* device properties */
-  CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device));
+  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
 
   device_prop->major = cu_prop.major;
   device_prop->minor = cu_prop.minor;
@@ -474,7 +396,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
 
   /* create device stream */
   for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
   }
 
   /* cublas init */
@@ -501,18 +423,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
   device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
   pthread_mutex_init(device_res->gen_mutex, NULL);
 
-  CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
+  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
 }
 
 int hl_get_cuda_version() { return g_cuda_lib_version; }
 
 void hl_create_thread_resources(int device,
                                 thread_device_resources device_res) {
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
 
   /* create thread stream */
   for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
   }
 
   /* allocation device memory */
@@ -521,14 +443,14 @@ void hl_create_thread_resources(int device,
   /* allocation host memory */
   device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
 
-  CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
+  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
 }
 
 void hl_specify_devices_start(int *device, int number) {
   if (hl_start_flag) return;
 
   /* 1. get the number of devices */
-  CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num));
+  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
   CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
   if (device == NULL) {
     number = g_system_device_num;
@@ -640,7 +562,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
+  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
 }
 
 void hl_create_event(hl_event_t *event) {
@@ -649,7 +571,7 @@ void hl_create_event(hl_event_t *event) {
   struct _hl_event_st *st_event =
       (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
 
-  CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
+  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
 
   *event = st_event;
 }
@@ -659,8 +581,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
   CHECK_NOTNULL(start);
   CHECK_NOTNULL(end);
 
-  CHECK_CUDA(
-      dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
+  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
   return time;
 }
 
@@ -672,7 +593,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
+  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
 }
 
 void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
@@ -683,12 +604,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
 }
 
 void hl_destroy_event(hl_event_t event) {
   CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event));
+  CHECK_CUDA(cudaEventDestroy(event->cu_event));
 
   free(event);
   event = NULL;
@@ -696,7 +617,7 @@ void hl_destroy_event(hl_event_t event) {
 
 void hl_event_synchronize(hl_event_t event) {
   CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event));
+  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
 }
 
 void hl_get_device_name(char *name, int len, int device) {
@@ -725,24 +646,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
   *minor = g_device[device]->minor;
 }
 
-int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
 
 const char *hl_get_device_error_string() {
-  cudaError_t err = dynload::cudaGetLastError();
-  return dynload::cudaGetErrorString(err);
+  cudaError_t err = cudaGetLastError();
+  return cudaGetErrorString(err);
 }
 
 const char *hl_get_device_error_string(size_t err) {
-  return dynload::cudaGetErrorString((cudaError_t)err);
+  return cudaGetErrorString((cudaError_t)err);
 }
 
-void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
 void hl_set_device_flags_block() {
-  CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 }
 
 bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
+  cudaError_t err = cudaEventQuery(event->cu_event);
   CHECK(cudaSuccess == err || cudaErrorNotReady == err);
 
   if (cudaErrorNotReady == err) {
@@ -751,6 +672,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
   return true;
 }
 
-void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
 
-void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 2b4c6f7c39cff78c0e76cc1dfd41e1c7ef334f11..9bcc7fb7de44b2211db450fb164655f7947dcad9 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d,
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }
 
-template<int blockSize>
-__global__ void KeMatrixClassificationError(real* in_A,
-                                            int* in_B,
-                                            real* out_C,
-                                            int dimN) {
-  __shared__ real max_s[blockSize];
-  __shared__ int max_l[blockSize];
-  const int tid = threadIdx.x;
-  const int rowId = blockIdx.x;
-
-  max_s[tid] = -1e30f;
-  in_A += rowId * dimN;
-  real tmp;
-  for (int colId = tid; colId < dimN; colId += blockSize) {
-    tmp = in_A[colId];
-    if (max_s[tid] < tmp) {
-      max_s[tid] = tmp;
-      max_l[tid] = colId;
-    }
-  }
-  __syncthreads();
-
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
-    if (tid < stride) {
-      if (max_s[tid] < max_s[tid + stride]) {
-        max_s[tid] = max_s[tid + stride];
-        max_l[tid] = max_l[tid + stride];
-      }
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
-  }
-}
-
-void hl_matrix_classification_error(real* A_d,
-                                    int* B_d,
-                                    real* C_d,
-                                    int dimM,
-                                    int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  // each sample is calculated by one block
-  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
-    (A_d, B_d, C_d, dimN);
-  CHECK_SYNC("hl_matrix_classification_error");
-}
-
 __global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
                                                 real* entropy,
                                                 int* row,
@@ -584,177 +531,6 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }
 
-template<int blockSize>
-__global__ void KeCosSim(real* output,
-                         real* input1,
-                         real* input2,
-                         int width,
-                         int input1_height,
-                         int input2_height,
-                         real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  input1 += ty * width;
-  if (input2_height > 1) {
-    input2 += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = input1[index];
-    real y = input2[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
-  }
-}
-
-void hl_cossim(real* output,
-               real* input1,
-               real* input2,
-               int width,
-               int input1_height,
-               int input2_height,
-               real scale) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input1);
-  CHECK_NOTNULL(input2);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-
-  KeCosSim<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim failed");
-}
-
-template<int blockSize>
-__global__ void KeCosSimDerivative(real* grad,
-                                   real* output,
-                                   real* prevOutX,
-                                   real* prevOutY,
-                                   real* prevGradX,
-                                   real* prevGradY,
-                                   int width,
-                                   int input1_height,
-                                   int input2_height,
-                                   real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  prevOutX += ty * width;
-  prevGradX += ty * width;
-  if (input2_height > 1) {
-    prevOutY += ty * width;
-    prevGradY += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = prevOutX[index];
-    real y = prevOutY[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (xy[0] == 0) {
-    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] +=
-        scale * grad[ty] * prevOutY[index] * reciprocal;
-      if (input2_height > 1) {
-        prevGradY[index] +=
-          scale * grad[ty] * prevOutX[index] * reciprocal;
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index,
-          scale * grad[ty] * prevOutX[index] * reciprocal);
-      }
-    }
-  } else {
-    real reciprocalXY = 1.0 / xy[0];
-    real reciprocalSquareSumX = 1.0 / xx[0];
-    real reciprocalSquareSumY = 1.0 / yy[0];
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] += output[ty] * grad[ty] *
-        (prevOutY[index] * reciprocalXY -
-         prevOutX[index] * reciprocalSquareSumX);
-      if (input2_height > 1) {
-        prevGradY[index] += output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY);
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY));
-      }
-    }
-  }
-}
-
-
-void hl_cossim_derivative(real* grad,
-                          real* output,
-                          real* prevOutX,
-                          real* prevOutY,
-                          real* prevGradX,
-                          real* prevGradY,
-                          int width,
-                          int input1_height,
-                          int input2_height,
-                          real scale) {
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(prevOutX);
-  CHECK_NOTNULL(prevOutY);
-  CHECK_NOTNULL(prevGradX);
-  CHECK_NOTNULL(prevGradY);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-  KeCosSimDerivative<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width,
-        input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim_derivate failed");
-}
-
 __global__ void KeMatrixAddSharedBias(real* A,
                                       real* B,
                                       const int channel,
@@ -840,3 +616,28 @@ void hl_matrix_collect_shared_bias(real* B_d,
       (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }
+
+// Rotate the row-major dimM x dimN matrix mat by 90 degrees into matRot
+// (row-major dimN x dimM); each thread fills one element of matRot.
+__global__ void keMatrixRotate(real* mat, real* matRot,
+                               int dimM, int dimN, bool clockWise) {
+    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= dimM * dimN) return;  // surplus threads of the last block
+    const int i = idx / dimN;
+    const int j = idx % dimN;
+    // matRot[j][i] = mat[dimM-1-i][j] (clockwise) or mat[i][dimN-1-j].
+    const int srcIdx = clockWise ? (dimM - i - 1) * dimN + j
+                                 : i * dimN + (dimN - j - 1);
+    matRot[j * dimM + i] = mat[srcIdx];
+}
+
+// Launch keMatrixRotate on STREAM_DEFAULT, one thread per matrix element.
+void hl_matrix_rotate(real *mat, real* matRot,
+                      int dimM, int dimN, bool clockWise) {
+    CHECK_NOTNULL(mat);
+    CHECK_NOTNULL(matRot);
+    const int threadsPerBlock = 512;
+    keMatrixRotate<<< DIVUP(dimM * dimN, threadsPerBlock), threadsPerBlock,
+            0, STREAM_DEFAULT >>>(mat, matRot, dimM, dimN, clockWise);
+    CHECK_SYNC("hl_matrix_rotate failed");
+}
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index ba823de2720336851bf9c49d8162360af93e8601..0fe2877f89f8d0fbc4db40c400037be30bb87ff7 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -325,12 +325,12 @@ __global__ void KeSequenceAvgForward(real* dst,
     int seqLength = end - start;
     if (seqLength == 0) return;
     real sum = 0.0;
-    for (int i = 0; i < seqLength; i++) {
-      sum += src[(start + i) * width + col];
+    for (int i = start; i < end; i++) {
+      sum += src[i * width + col];
     }
     sum = mode == 1 ? sum :
         (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength));
-    dst[row * width + col] = sum;
+    dst[gid] = sum;
   }
 }
 
@@ -354,3 +354,48 @@ void hl_sequence_avg_forward(real* dst,
            (dst, src, starts, height, width, mode);
   CHECK_SYNC("hl_sequence_avg_forward failed");
 }
+
+// Backward pass of sequence averaging. Thread gid owns src element (row, col)
+// and adds it to dst rows [starts[row], starts[row + 1]), scaled by mode:
+// 1 -> unchanged, 0 -> divided by length, otherwise * my_rsqrt(length).
+__global__ void KeSequenceAvgBackward(real* dst,
+                                      real* src,
+                                      const int* starts,
+                                      int height,
+                                      int width,
+                                      const int mode) {
+  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  if (gid >= height * width) return;
+  const int row = gid / width;
+  const int col = gid % width;
+  const int begin = starts[row];
+  const int seqLength = starts[row + 1] - begin;
+  if (seqLength == 0) return;
+  const real grad = src[gid];
+  const real scaled = mode == 1 ? grad :
+      (mode == 0 ? grad / seqLength : grad * my_rsqrt((real)seqLength));
+  for (int k = 0; k < seqLength; ++k) {
+    dst[(begin + k) * width + col] += scaled;
+  }
+}
+
+// Host entry for KeSequenceAvgBackward: validates the pointers and mode,
+// then launches one thread per (row, col) of the height x width gradient.
+void hl_sequence_avg_backward(real* dst,
+                              real* src,
+                              const int* starts,
+                              int height,
+                              int width,
+                              const int mode) {
+  CHECK_NOTNULL(dst);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(starts);
+  CHECK(mode == 0 || mode == 1 || mode == 2)
+    << "mode error in hl_sequence_avg_backward!";
+
+  const int threadsPerBlock = 512;
+  const int numBlocks = DIVUP(width * height, threadsPerBlock);
+  KeSequenceAvgBackward<<< numBlocks, threadsPerBlock, 0, STREAM_DEFAULT >>>
+           (dst, src, starts, height, width, mode);
+  CHECK_SYNC("hl_sequence_avg_backward failed");
+}
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
deleted file mode 100644
index ecc03a729dde2f2b4f8f004234a47d9272997a50..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_USE_DSO
-
-#include <cuda_runtime.h>
-#include <mutex>
-#include "hl_dso_loader.h"
-
-/**
- * cudart wrapper: for dynamic load libcudart.so.
- * When nvcc compile cuda kernels, it will insert
- * some build-in runtime routines, which must be
- * provided by us if PADDLE_USE_DSO is true. If
- * PADDLE_USE_DSO is false, all of them must be
- * ignored to avoid multiple definitions.
- */
-namespace dynload {
-
-extern std::once_flag cudart_dso_flag;
-extern void *cudart_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    __type operator()(Args... args) {                                          \
-      typedef __type (*cudartFunc)(Args...);                                   \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)          \
-  __macro(cudaLaunch, cudaError_t)          \
-  __macro(cudaSetupArgument, cudaError_t)   \
-  __macro(cudaConfigureCall, cudaError_t)   \
-  __macro(__cudaRegisterFatBinary, void**)  \
-  __macro(__cudaUnregisterFatBinary, void)  \
-  __macro(__cudaRegisterFunction, void)     \
-  __macro(__cudaRegisterVar, void)          \
-  __macro(__cudaRegisterManagedVar, void)   \
-  __macro(__cudaInitModule, char)           \
-  __macro(__cudaRegisterTexture, void)      \
-  __macro(__cudaRegisterSurface, void)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#if CUDART_VERSION >= 7000
-DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
-#endif
-
-#undef CUDA_ROUNTINE_EACH
-
-} /* namespace dynload */
-
-#if CUDART_VERSION >= 7000
-__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
-                                                dim3 gridDim,
-                                                dim3 blockDim,
-                                                void **args,
-                                                size_t sharedMem,
-                                                cudaStream_t stream) {
-  return dynload::cudaLaunchKernel(
-      func, gridDim, blockDim, args, sharedMem, stream);
-}
-#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  return dynload::cudaLaunch(func);
-}
-
-__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                 size_t size,
-                                                 size_t offset) {
-  return dynload::cudaSetupArgument(arg, size, offset);
-}
-
-__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
-                                                 dim3 blockDim,
-                                                 size_t sharedMem,
-                                                 cudaStream_t stream) {
-  return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
-}
-
-extern "C" {
-
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
-  return dynload::__cudaRegisterFatBinary(fatCubin);
-}
-
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
-  return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
-                                      const char *hostFun,
-                                      char *deviceFun,
-                                      const char *deviceName,
-                                      int thread_limit,
-                                      uint3 *tid,
-                                      uint3 *bid,
-                                      dim3 *bDim,
-                                      dim3 *gDim,
-                                      int *wSize) {
-  return dynload::__cudaRegisterFunction(fatCubinHandle,
-                                         hostFun,
-                                         deviceFun,
-                                         deviceName,
-                                         thread_limit,
-                                         tid,
-                                         bid,
-                                         bDim,
-                                         gDim,
-                                         wSize);
-}
-
-void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
-                                 char *hostVar,
-                                 char *deviceAddress,
-                                 const char *deviceName,
-                                 int ext,
-                                 int size,
-                                 int constant,
-                                 int global) {
-  return dynload::__cudaRegisterVar(fatCubinHandle,
-                                    hostVar,
-                                    deviceAddress,
-                                    deviceName,
-                                    ext,
-                                    size,
-                                    constant,
-                                    global);
-}
-
-extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
-                                               void **hostVarPtrAddress,
-                                               char *deviceAddress,
-                                               const char *deviceName,
-                                               int ext,
-                                               int size,
-                                               int constant,
-                                               int global) {
-  return dynload::__cudaRegisterManagedVar(fatCubinHandle,
-                                           hostVarPtrAddress,
-                                           deviceAddress,
-                                           deviceName,
-                                           ext,
-                                           size,
-                                           constant,
-                                           global);
-}
-
-char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
-  return dynload::__cudaInitModule(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
-                                     const struct textureReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int norm,
-                                     int ext) {
-  return dynload::__cudaRegisterTexture(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
-}
-
-void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
-                                     const struct surfaceReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int ext) {
-  return dynload::__cudaRegisterSurface(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
-}
-
-} /* extern "C" */
-
-#endif
diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu
index f0ef0cc3c51f9e7935dc3c40f630e4d70960802a..4f0bbfcf4e3aa51dd06acf254af65c62098a1df7 100644
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
   CHECK_SYNC("hl_sparse_matrix_top_k failed");
 }
 
+/**
+ * Each block computes one sample.
+ * In a block:
+ * 1. every thread gets its top maxLength values;
+ * 2. merge into shTopK, block-reduce and get the max value;
+ * 3. go to the second step, until one thread's topK value is null;
+ * 4. go to the first step, until the topK values are obtained.
+ * Thread 0 finally sets recResult to 0 if label is in the top ids, else 1.
+ */
+template<int maxLength, int blockSize>
+__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
+                                                int * topIds,
+                                                real* src, int lds,
+                                                int dim,
+                                                int beamSize,
+                                                int* label,
+                                                real* recResult) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  src += blockIdx.x * lds;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength]; // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+  int topkSize = beamSize;
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>
+      (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>
+      (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+
+  __syncthreads();
+  if (tid == 0) {
+    // NOTE(review): assumes blockReduce advanced topIds past the stored ids.
+    bool hit = false;
+    for (int i = 0; i < topkSize && !hit; i++) {
+      hit = (*--topIds == label[blockIdx.x]);
+    }
+    recResult[blockIdx.x] = hit ? 0.0f : 1.0f;  // also when topkSize is 0
+  }
+}
+
+// Launches KeMatrixTopKClassificationError with one 256-thread block per
+// sample; label and recResult are dereferenced by the kernel, so check them.
+void hl_matrix_classification_error(real* topVal, int ldv,
+                                   int* topIds,
+                                   real* src, int lds,
+                                   int dim,
+                                   int topkSize,
+                                   int numSamples,
+                                   int* label,
+                                   real* recResult) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(label);
+  CHECK_NOTNULL(recResult);
+  if (topkSize > dim) topkSize = dim;
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeMatrixTopKClassificationError<5, 256><<<grid, threads, 0, STREAM_DEFAULT>>>
+      (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
+  CHECK_SYNC("hl_matrix_top_k classification error failed");
+}
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 9ae8bc0f220e143a5c59d8c3ead012a20369e7b9..9f812dd0dead8b4b4e0a4ac58b12a81d1da00aee 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "hl_warpctc_wrap.h"
 #include <mutex>
-#include "hl_dso_loader.h"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
@@ -29,7 +29,6 @@ void* warpctc_dso_handle = nullptr;
  * false, you need to add the path of libwarp-ctc.so to
  * the linked-libs of paddle or to LD_PRELOAD.
  */
-#ifdef PADDLE_USE_DSO
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
   struct DynLoad__##__name {                                           \
     template <typename... Args>                                        \
@@ -41,15 +40,6 @@ void* warpctc_dso_handle = nullptr;
       return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
     }                                                                  \
   } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                        \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name;  // struct DynLoad__##__name
-#endif
 
 // include all needed warp-ctc functions
 DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
@@ -64,22 +54,26 @@ DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
 #define WARPCTC_GET_VERSION dynload::get_warpctc_version
 #define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
 
+static int g_warpctcVersion = -1;
 #ifndef PADDLE_TYPE_DOUBLE
 #define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
 #define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
 #else
-#define WARPCTC_LOG_FATAL                                \
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
-             << "] Error: not support double precision."
-#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
-#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+// Stand-in for the warp-ctc entry points when built with double precision.
+static hl_warpctc_status_t fatal(...) {
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
+             << "] Error: not support double precision.";
+  // compute_ctc_loss() and get_workspace_size() both return a ctcStatus value
+  return CTC_STATUS_EXECUTION_FAILED;
+}
+#define WARPCTC_COMPUTE_LOSS fatal
+#define WARPCTC_GET_WORKSPACE_SIZE fatal
 #endif
 
 /**
  * Check build-in warp-ctc function using glog and it also
  * support << operator for more details error info.
  */
-static int g_warpctcVersion = -1;
 #define CHECK_WARPCTC(warpctcStat)                \
   CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
       << "warp-ctc [version " << g_warpctcVersion \
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b70036e3ff7de9e8786bade03e220a4916db4c2
--- /dev/null
+++ b/paddle/function/BufferArg.cpp
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include "BufferArg.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+const SequenceArg& BufferArg::sequence() const {
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  return dynamic_cast<const SequenceArg&>(*this);
+}
+
+const SparseMatrixArg& BufferArg::sparse() const {
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  return dynamic_cast<const SparseMatrixArg&>(*this);
+}
+
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;
+}
+
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;
+}
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dc7792f646457c22ee4791f18814afaa3809f7b
--- /dev/null
+++ b/paddle/function/BufferArg.h
@@ -0,0 +1,364 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum BufferType {
+  TENSOR_UNKNOWN = 0,
+  TENSOR_NORMAL = 1,
+  TENSOR_SEQUENCE_ID = 2,
+  TENSOR_SEQUENCE_DATA = 3,
+  TENSOR_SPARSE = 4
+};
+
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
+
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,
+  ADD_TO = 2,
+};
+class BufferArg {
+public:
+  void setArgType(ArgType argType) { argType_ = argType; }
+
+  ArgType getArgType() const { return argType_; }
+
+public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
+  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(2),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, matrix.getHeight());
+    shape_.setDim(1, matrix.getWidth());
+  }
+
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(shape),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+  }
+
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(1),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, vector.getSize());
+  }
+
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(VALUE_TYPE_INT32),
+        shape_(1),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, vector.getSize());
+  }
+
+  template <DeviceType DType>
+  typename Tensor<real, DType>::Matrix matrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)2, shape_.ndims());
+    return typename Tensor<real, DType>::Matrix(
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+  }
+
+  template <typename VType, DeviceType DType>
+  typename Tensor<VType, DType>::Vector vector() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<VType>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)1, shape_.ndims());
+    return typename Tensor<VType, DType>::Vector(
+        shape_[0], reinterpret_cast<VType*>(buf_));
+  }
+
+  virtual ~BufferArg() {}
+
+  template <typename T>
+  T* data() const {
+    return reinterpret_cast<T*>(buf_);
+  }
+
+  void* data() const { return buf_; }
+  ValueType valueType() const { return valueType_; }
+  BufferType bufferType() const { return bufferType_; }
+  const TensorShape& shape() const { return shape_; }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
+  virtual size_t numElements() const { return shape_.getElements(); }
+
+  const SequenceArg& sequence() const;
+  const SparseMatrixArg& sparse() const;
+
+protected:
+  void* buf_;
+  ValueType valueType_;
+  TensorShape shape_;
+  BufferType bufferType_{TENSOR_UNKNOWN};
+  ArgType argType_{UNSPECIFIED};
+  // TODO(tianbing), add deviceType_
+  // leading dimensions. The size is dims_.size()
+  // Dims lds_;
+};
+
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1
+// valueType_ = int32
+// if a < b then value_.buf_[a] < value_.buf_[b]
+class SequenceIdArg : public BufferArg {
+public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
+    CHECK_GE(shape_[0], 1UL);  // guard the unsigned subtraction below
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
+    CHECK_GE(shape_[0], 1UL);  // guard the unsigned subtraction below
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_GE(shape_[0], 1UL);  // guard the unsigned subtraction below
+    numSeqs_ = shape_[0] - 1;
+  }
+
+  size_t numSeqs() const { return numSeqs_; }
+
+private:
+  size_t numSeqs_;  // shape_[0] - 1, set by every constructor
+};
+
+// Sequence data.
+// In mini-batch computation,
+// one batch can contain more than one sequence of data.
+// SequenceArg can represent a batch that holds multiple sequences of
+// unequal lengths.
+class SequenceArg : public BufferArg {
+public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        startPositions_(TensorShape({shape[0]})) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  SequenceArg(void* buf,
+              ValueType valueType,
+              const TensorShape& shape,
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  ~SequenceArg() {}
+
+  void* getIdBuf() const { return startPositions_.data(); }
+  size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
+
+private:
+  SequenceIdArg startPositions_;
+};
+
+// sparse matrix
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+public:
+  SparseMatrixArg(void* buf,
+                  ValueType valueType,
+                  const TensorShape& shape,
+                  const BufferArg& row,
+                  const BufferArg& col,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        row_(row),
+        col_(col),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+    CHECK_EQ(row_.shape().ndims(), 1UL);
+    CHECK_EQ(col_.shape().ndims(), 1UL);
+    if (format_ == T_SPARSE_CSR) {
+      CHECK_EQ(nnz, col.shape()[0]);
+    } else if (format_ == T_SPARSE_CSC) {
+      CHECK_EQ(nnz, row.shape()[0]);
+    }
+  }
+
+  SparseMatrixArg(ValueType valueType,
+                  const TensorShape& shape,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+
+    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
+    row_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
+    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
+    col_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
+  }
+
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  template <DeviceType DType>
+  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ(2UL, shape_.ndims());
+    return typename Tensor<real, DType>::SparseMatrix(
+        reinterpret_cast<real*>(buf_),
+        reinterpret_cast<int*>(row_.data()),
+        reinterpret_cast<int*>(col_.data()),
+        shape_[0],
+        shape_[1],
+        nnz_,
+        static_cast<SparseValueType>(type_),
+        static_cast<SparseFormat>(format_),
+        false);
+  }
+
+  ~SparseMatrixArg() {}
+
+  void* getRowBuf() const { return row_.data(); }
+
+  void* getColBuf() const { return col_.data(); }
+
+  size_t nnz() const { return nnz_; }
+
+  size_t numElements() const override { return nnz_; }
+
+  SparseDataFormat dataFormat() const { return format_; }
+
+  SparseDataType dataType() const { return type_; }
+
+private:
+  BufferArg row_;
+  BufferArg col_;
+  size_t nnz_;
+  SparseDataFormat format_;
+  SparseDataType type_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1744f377808f137dcda4a28acce336dc22be3d01
--- /dev/null
+++ b/paddle/function/BufferArgTest.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BufferArg.h"
+#include <gtest/gtest.h>
+#include "paddle/math/MemoryHandle.h"
+
+namespace paddle {
+
+TEST(BufferTest, BufferArg) {
+  TensorShape shape({8, 10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_FLOAT));
+  BufferArg buffer(memory.getBuf(), VALUE_TYPE_FLOAT, shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+}
+
+TEST(BufferTest, SequenceIdArg) {
+  TensorShape shape({10});
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_INT32));
+  SequenceIdArg buffer(memory.getBuf(), shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+  EXPECT_EQ(buffer.numSeqs(), 9UL);  // 10 start positions -> 9 sequences
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index de85eeca821742e1d39d5ce26f873238d4359cba..233a53709a80f06dd2a06995b159c1aef10e2788 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -3,6 +3,7 @@ file(GLOB cpp_files . *Op.cpp)
 
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
+list(APPEND cpp_files BufferArg.cpp)
 
 if(WITH_GPU)
     file(GLOB cu_files . *OpGpu.cu)
@@ -11,7 +12,7 @@ endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
-
+add_dependencies(paddle_function gen_proto_cpp)
 
 if(WITH_GPU)
 if(WITH_TESTING)
@@ -19,7 +20,14 @@ if(WITH_TESTING)
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
     add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(TensorShapeTest)
+    add_simple_unittest(TensorTypeTest)
+    add_simple_unittest(BufferArgTest)
+    add_simple_unittest(FunctionTest)
     add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(PadOpTest)
+    add_simple_unittest(MulOpTest)
+    add_simple_unittest(CosSimOpTest)
 endif()
 endif()
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index 07907fc1ba7973c728c3a882e4be6b1a7ef7a97a..b87750b74247bd0eb822340bc5a85d41b86ecec2 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -17,19 +17,20 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 
 namespace paddle {
-
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ *
+ */
 template <>
-void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
-                                               const CpuMatrix* input_mat,
-                                               const CpuMatrix* weight_mat,
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                               const CpuMatrix& input_mat,
+                                               const CpuMatrix& weight_mat,
                                                const CpuIVector& seq_vec,
                                                size_t context_length,
                                                int context_start,
                                                size_t begin_pad) {
   const int* starts = seq_vec.getData();
   const size_t num_sequences = seq_vec.getSize() - 1;
-  auto w_mat = const_cast<CpuMatrix*>(weight_mat);
-  auto in_mat = const_cast<CpuMatrix*>(input_mat);
   for (size_t i = 0; i < num_sequences; ++i) {
     for (size_t j = 0; j < context_length; ++j) {
       int begin = starts[i] + context_start + j;
@@ -39,10 +40,11 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
       if (begin < starts[i]) {
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
-        if (w_mat) {
-          MatrixPtr sub = w_mat->subMatrix(j, pad_size);
-          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
         }
         dst_begin = starts[i] + pad_size;
         begin = starts[i];
@@ -50,28 +52,51 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
       if (end > starts[i + 1]) {
         int64_t pad_size =
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
-        if (w_mat) {
-          MatrixPtr sub = w_mat->subMatrix(
-              begin_pad + context_start + j - pad_size, pad_size);
-          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (weight_mat) {
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat)
+                  .subMatrix(begin_pad + context_start + j - pad_size,
+                             pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
         }
         dst_end = starts[i + 1] - pad_size;
         end = starts[i + 1];
       }
       if (end <= begin) continue;
-      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
-      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
-      dst->addAtOffset(*src, j * in_mat->getWidth());
+      MatrixPtr src =
+          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * input_mat.getWidth());
     }
   }
 }
 
 /**
- * \param inputs[0] input value.
- * \param inputs[1] input weight.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * Paddle Function for Context Projection Forward.
+ * Calculate the output layer value sequence after context projection.
+ *
+ * What is Context Projection for a sequence?
+ * For example, assume the input (x) has 4 words and the dimension of each word
+ * representation is 2. If we pad with zeros instead of a learned weight,
+ * and the context_length is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -82,40 +107,39 @@ public:
     begin_pad_ = config.get<size_t>("begin_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(3, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(1UL == inputs.size() || 2UL == inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
     /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
+    if (2UL == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+      /// dim of input == dim of weight
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
+    }
 
-    auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    auto out_mat = out_seq.matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
     const auto w_mat =
-        !inputs[1].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
-
-    ContextProjectionForward<Device>(out_mat.get(),
-                                     in_mat.get(),
-                                     w_mat.get(),
+        (2UL == inputs.size() && inputs[1].data())
+            ? inputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
+
+    ContextProjectionForward<Device>(out_mat,
+                                     in_mat,
+                                     w_mat,
                                      seq_vec,
                                      context_length_,
                                      context_start_,
@@ -128,19 +152,22 @@ private:
   size_t begin_pad_;
 };
 
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ *
+ */
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
-                                                CpuMatrix* in_grad_mat,
-                                                CpuMatrix* w_grad_mat,
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
+                                                CpuMatrix& in_grad_mat,
+                                                CpuMatrix& w_grad_mat,
                                                 const CpuIVector& seq_vec,
                                                 size_t context_length,
                                                 int context_start,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
-  CHECK(out_grad_mat);
-  size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
-                                 : w_grad_mat ? w_grad_mat->getWidth() : 0;
+  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
+                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
   const int* starts = seq_vec.getData();
   size_t num_sequences = seq_vec.getSize() - 1;
   for (size_t i = 0; i < num_sequences; ++i) {
@@ -153,8 +180,9 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
-          MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
         dst_begin = starts[i] + pad_size;
@@ -164,9 +192,9 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
         int64_t pad_size =
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat =
-              out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
-          MatrixPtr sub = w_grad_mat->subMatrix(
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(
               begin_pad + context_start + j - pad_size, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
@@ -175,18 +203,23 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
       }
       if (end <= begin) continue;
       if (!in_grad_mat) continue;
-      MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
       src->addAtOffset(*dst, j * input_dim);
     }
   }
 }
 
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] weight grad.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * Context Projection Backward Function.
+ * Update the weight gradient and input layer gradient with backprop
+ *
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -199,44 +232,44 @@ public:
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(3, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK(outputs[0].getData() && inputs[2].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
-
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
-    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK(1UL == outputs.size() || 2UL == outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
+
+    /// input and output grad has the same batch_size
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+
+    if (2UL == outputs.size()) {
+      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
+      /// dim of input grad == dim of weight
+      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
+      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    }
 
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
     auto in_grad_mat =
-        !inputs[0].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                        : out_seq.matrix<Device>();
     auto w_grad_mat =
-        !inputs[1].getData()
-            ? nullptr
-            : std::make_shared<typename MatrixT<Device>::type>(
-                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+        (2UL == outputs.size() && outputs[1].data())
+            ? outputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
 
-    ContextProjectionBackward<Device>(out_grad_mat.get(),
-                                      in_grad_mat ? in_grad_mat.get() : nullptr,
-                                      w_grad_mat ? w_grad_mat.get() : nullptr,
+    ContextProjectionBackward<Device>(out_grad_mat,
+                                      in_grad_mat,
+                                      w_grad_mat,
                                       seq_vec,
                                       context_length_,
                                       context_start_,
@@ -254,9 +287,15 @@ private:
 };
 
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * Context Projection Backward Data Function
+ * Update input layer grad
+ * input:  sequence of output layer grad
+ * output: sequence of input layer grad
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start position sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -266,32 +305,30 @@ public:
     context_start_ = config.get<int>("context_start");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    /// output layer grad width == input layer grad width * context_length_
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
 
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    auto in_grad_mat = out_seq.matrix<Device>();
 
-    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
-                                          in_grad_mat.get(),
-                                          seq_vec,
-                                          context_length_,
-                                          context_start_);
+    ContextProjectionBackwardData<Device>(
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
   }
 
 private:
@@ -300,9 +337,14 @@ private:
 };
 
 /**
- * \param inputs[0] weight grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * Context Projection Backward Weight Function
+ * Update weight grad by backprop
+ * input:  sequence of output layer grad
+ * output: weight grad
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vector             start position sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -314,28 +356,25 @@ public:
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
-
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
-
-    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
-                                            w_grad_mat.get(),
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
+    /// output layer grad dim == weight dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
                                             seq_vec,
                                             context_length_,
                                             context_start_,
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index 93eb050fde35f474750f3c2efa72b7471f654b75..6f7d936379a5378e6fd85dd86618d1b6094bd14f 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-
 #include "Function.h"
 
 namespace paddle {
@@ -21,24 +20,25 @@ namespace paddle {
 /**
  * \brief   Context Projection Forward.
  *
- * \param[out]  outputs           output data.
- * \param[in]   input             input data.
- * \param[in]   weight            input weight.
- * \param[in]   sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
+ * \param[in,out]  output            output data.
+ * \param[in]      input             input data.
+ * \param[in]      weight            input weight.
+ * \param[in]      sequence          input data.
+ * \param[in]      context_length    consecutive rows for concatenation.
+ * \param[in]      context_start     context start position.
+ * \param[in]      begin_pad         beginning pad position.
+ * \param[in]      is_padding        whether padding 0 or not.
  *
  */
-template <DeviceType Device>
-void ContextProjectionForward(typename MatrixT<Device>::type* output,
-                              const typename MatrixT<Device>::type* input,
-                              const typename MatrixT<Device>::type* weight,
-                              const typename SequenceT<Device>::type& sequence,
-                              size_t context_length,
-                              int context_start,
-                              size_t begin_pad);
+template <DeviceType DType>
+void ContextProjectionForward(
+    typename Tensor<real, DType>::Matrix& output,
+    const typename Tensor<real, DType>::Matrix& input,
+    const typename Tensor<real, DType>::Matrix& weight,
+    const typename Tensor<int, DType>::Vector& sequence,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad);
 
 /**
  * \brief   Context Projection Backward.
@@ -53,30 +53,31 @@ void ContextProjectionForward(typename MatrixT<Device>::type* output,
  * \param[in]   is_padding        whether padding 0 or not.
  *
  */
-template <DeviceType Device>
-void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
-                               typename MatrixT<Device>::type* in_grad,
-                               typename MatrixT<Device>::type* w_grad,
-                               const typename SequenceT<Device>::type& seq_vec,
-                               size_t context_length,
-                               int context_start,
-                               size_t begin_pad,
-                               bool is_padding,
-                               size_t total_pad);
+template <DeviceType DType>
+void ContextProjectionBackward(
+    const typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad,
+    bool is_padding,
+    size_t total_pad);
 
-template <DeviceType Device>
+template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename MatrixT<Device>::type* out_grad,
-    typename MatrixT<Device>::type* in_grad,
-    const typename SequenceT<Device>::type& sequence,
+    const typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    const typename Tensor<int, DType>::Vector& sequence,
     size_t context_length,
     int context_start);
 
-template <DeviceType Device>
+template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename MatrixT<Device>::type* out_grad,
-    typename MatrixT<Device>::type* w_grad,
-    const typename SequenceT<Device>::type& seq_vec,
+    const typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
     size_t context_length,
     int context_start,
     size_t total_pad,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 1ec7058f96c8200728e5add051d5fa6a77a97e36..1a5b4042402df3081a493962a5e080d72b7f40b2 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -120,29 +120,28 @@ void hl_context_projection_forward(const real* input,
 }
 
 template <>
-void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
-                                               const GpuMatrix* input,
-                                               const GpuMatrix* weight,
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
+                                               const GpuMatrix& input,
+                                               const GpuMatrix& weight,
                                                const GpuIVector& sequence,
                                                size_t context_length,
                                                int context_start,
                                                size_t begin_pad) {
-  CHECK(input && output);
-  hl_context_projection_forward(input->getData(),
+  hl_context_projection_forward(input.getData(),
                                 sequence.getData(),
-                                weight ? weight->getData() : nullptr,
-                                output->getData(),
+                                weight ? weight.getData() : nullptr,
+                                output.getData(),
                                 sequence.getSize() - 1,
-                                input->getWidth(),
+                                input.getWidth(),
                                 context_length,
                                 context_start,
                                 begin_pad);
 }
 
-__global__ void KeContextProjectionBackwardData(real* out_grad,
+__global__ void KeContextProjectionBackwardData(const real* out_grad,
                                                 const int* sequence,
                                                 real* in_grad,
-                                                int input_dim,
+                                                size_t input_dim,
                                                 int context_length,
                                                 int context_start) {
   int idx = threadIdx.x;
@@ -153,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
   real value = 0;
 
   int instances = seq_end - seq_start + context_length - 1;
-  out_grad += seq_start * input_dim * context_length;
+  auto out = const_cast<real*>(out_grad);
+  out += seq_start * input_dim * context_length;
   in_grad += seq_start * input_dim;
   for (int k = 0; k <= input_dim / block_size; k++) {
     if (idx < input_dim) {
@@ -170,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
         int outx = (i - context_length) < 0 ? i : (context_length - 1);
         int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
         real* output_r =
-          out_grad + outy * input_dim * context_length + outx * input_dim;
+          out + outy * input_dim * context_length + outx * input_dim;
         for (int j = outy; j < seq_end - seq_start; j++) {
           value += output_r[idx];
           if (j - outy == outx) break;
@@ -195,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
  * @param[in]   context_start    context start.
  *
  */
-void hl_context_projection_backward_data(real* out_grad,
+void hl_context_projection_backward_data(const real* out_grad,
                                          const int* sequence,
                                          real* input_grad,
                                          size_t num_sequences,
@@ -217,23 +217,22 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
-                                                    GpuMatrix* in_grad,
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                    GpuMatrix& in_grad,
                                                     const GpuIVector& sequence,
                                                     size_t context_length,
                                                     int context_start) {
-  CHECK(in_grad && out_grad);
-  hl_context_projection_backward_data(out_grad->getData(),
+  hl_context_projection_backward_data(out_grad.getData(),
                                       sequence.getData(),
-                                      in_grad->getData(),
+                                      in_grad.getData(),
                                       sequence.getSize() - 1,
-                                      in_grad->getWidth(),
+                                      in_grad.getWidth(),
                                       context_length,
                                       context_start);
 }
 
 template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,
                                                   int num_sequences,
@@ -256,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
       int seq_end = sequence[seqId+1];
-      output_r = out_grad + seq_start * w_dim * context_length;
+      output_r = const_cast<real*>(out_grad)
+                    + seq_start * w_dim * context_length;
 
       if (context_start < 0) {
         if (padId + context_start < 0) {
@@ -320,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
  * beginning.
  *
  */
-void hl_context_projection_backward_weight(real* out_grad,
+void hl_context_projection_backward_weight(const real* out_grad,
                                            const int* sequence,
                                            real* w_grad,
                                            size_t num_sequences,
@@ -348,19 +348,18 @@ void hl_context_projection_backward_weight(real* out_grad,
 
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix* out_grad,
-        GpuMatrix* w_grad,
+        const GpuMatrix& out_grad,
+        GpuMatrix& w_grad,
         const GpuIVector& seq_vec,
         size_t context_length,
         int context_start,
         size_t total_pad,
         size_t begin_pad) {
-  CHECK(out_grad && w_grad);
-  hl_context_projection_backward_weight(out_grad->getData(),
+  hl_context_projection_backward_weight(out_grad.getData(),
                                         seq_vec.getData(),
-                                        w_grad->getData(),
+                                        w_grad.getData(),
                                         seq_vec.getSize() - 1,
-                                        w_grad->getWidth(),
+                                        w_grad.getWidth(),
                                         total_pad,
                                         context_length,
                                         context_start,
@@ -368,16 +367,15 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
-                                                GpuMatrix* in_grad,
-                                                GpuMatrix* w_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                GpuMatrix& in_grad,
+                                                GpuMatrix& w_grad,
                                                 const GpuIVector& sequence,
                                                 size_t context_length,
                                                 int context_start,
                                                 size_t begin_pad,
                                                 bool is_padding,
                                                 size_t total_pad) {
-    CHECK(out_grad);
     if (in_grad) {
         ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
                 out_grad,
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 6223d2fd23ac3bbb4fbcf51d37d22feaf3b1330b..1b25172ca5c0c4e64db01806fb8239af7e06d90d 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -28,56 +28,31 @@ void testMatrixProjectionForward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare compare("ContextProjectionForward",
-                          FuncConfig()
-                              .set("context_length", context_length)
-                              .set("context_start", context_start)
-                              .set("begin_pad", std::max(0, -context_start)));
-
-  CpuMatrix cpu_in(batch_size, input_dim);
-  cpu_in.randomizeUniform();
-  GpuMatrix gpu_in(batch_size, input_dim);
-  gpu_in.copyFrom(cpu_in);
-  auto cpu_weight =
-      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
-  auto gpu_weight =
-      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
-  if (is_padding) {
-    cpu_weight->randomizeUniform();
-    gpu_weight->copyFrom(*cpu_weight);
+  FunctionCompare test(
+      "ContextProjectionForward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start)));
+
+  // prepare input arguments
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
+  if (is_padding) {  // weight
+    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
   }
-  IVectorPtr cpu_seq;
-  generateSequenceStartPositions(batch_size, cpu_seq);
-  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
-  gpu_seq->copyFrom(*cpu_seq);
-
-  CpuMatrix cpu_out(batch_size, input_dim * context_length);
-  GpuMatrix gpu_out(batch_size, input_dim * context_length);
-  cpu_out.randomizeUniform();
-  gpu_out.copyFrom(cpu_out);
-
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT,
+                  TensorShape{batch_size, input_dim * context_length}),
+      ADD_TO);
 
-  autotest::TensorCheckEqual(cpu_out, gpu_out);
+  // run Function
+  test.run();
 }
 
 void testMatrixProjectionBackward(int context_start,
-                                  int context_length,
+                                  size_t context_length,
                                   bool is_padding,
                                   size_t batch_size,
                                   size_t input_dim) {
@@ -85,65 +60,32 @@ void testMatrixProjectionBackward(int context_start,
                std::max(0, (int)(context_start + context_length - 1));
   if (pad == 0) is_padding = false;
 
-  FunctionCompare compare("ContextProjectionBackward",
-                          FuncConfig()
-                              .set("context_length", context_length)
-                              .set("context_start", context_start)
-                              .set("begin_pad", std::max(0, -context_start))
-                              .set("is_padding", is_padding)
-                              .set("total_pad", pad));
-
-  CpuMatrix cpu_in_grad(batch_size, input_dim);
-  cpu_in_grad.randomizeUniform();
-  GpuMatrix gpu_in_grad(batch_size, input_dim);
-  gpu_in_grad.copyFrom(cpu_in_grad);
-
-  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
-  cpu_out_grad.randomizeUniform();
-  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
-  gpu_out_grad.copyFrom(cpu_out_grad);
-
-  IVectorPtr cpu_seq;
-  generateSequenceStartPositions(batch_size, cpu_seq);
-  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
-  gpu_seq->copyFrom(*cpu_seq);
-
-  auto cpu_w_grad =
-      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
-  auto gpu_w_grad =
-      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
-  if (is_padding) {
-    cpu_w_grad->randomizeUniform();
-    gpu_w_grad->copyFrom(*cpu_w_grad);
+  FunctionCompare test(
+      "ContextProjectionBackward",
+      FuncConfig()
+          .set("context_length", context_length)
+          .set("context_start", context_start)
+          .set("begin_pad", (size_t)std::max(0, -context_start))
+          .set("is_padding", is_padding)
+          .set("total_pad", pad));
+
+  // prepare input arguments
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(SequenceArg(
+      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
+      ADD_TO);
+  if (is_padding) {  // weight
+    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
+                    ADD_TO);
   }
 
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
-
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
-
-  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
-  if (is_padding) {
-    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
-  }
+  // run Function
+  test.run();
 }
 
-TEST(ContextProjection, projection) {
+TEST(ContextProjection, Projection) {
   for (auto context_start : {-5, -3, -1, 0, 3}) {
     for (auto context_length : {1, 2, 5, 7}) {
       for (auto trainable_padding : {false, true}) {
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ece7b2dfedaf460741c97b5a700eb632d85cabc
--- /dev/null
+++ b/paddle/function/CosSimOp.cpp
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CosSimOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+/**
+ * Cosine similarity for CpuMatrix: out[i] = scale * cos(in1[i], in2[i]),
+ * where in2[0] is reused for every row when in2 has a single row.
+ * \param out_mat  output value, size: nSamples * 1.
+ * \param in1_mat  input value 1, size: nSamples * dim.
+ * \param in2_mat  input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale    factor multiplied into every cosine value.
+ * CHECK-fails on null data, an unsupported n2, or an all-zero input row.
+ */
+template <>
+void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                    const CpuMatrix& in1_mat,
+                                    const CpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  size_t num_samples = out_mat.getHeight();  // nSamples is taken from out
+  size_t dim = in1_mat.getWidth();
+  /// column vector [nSamples, 1]
+  real* out = out_mat.getData();
+  const real* x = in1_mat.getData();
+  const real* y = in2_mat.getData();
+
+  /// in2 might only have one row or full rows
+  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
+  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;  // 0: stay on row 0
+  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
+    real square_sum_x = 0;  // |x|^2
+    real square_sum_y = 0;  // |y|^2
+    real xy = 0;            // <x, y>
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += x[j] * x[j];
+      square_sum_y += y[j] * y[j];
+      xy += x[j] * y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);  // cosine undefined for 0
+    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+  }
+}
+
+/**
+ * Cosine Similarity
+ * for each row i,
+ *   out[i] = scale * cos(input1[i], input2[i])
+ *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
+ * when input2 only has one row, then for each row i,
+ *   out[i] = scale * cos(input1[i], input2[0])
+ *
+ * \param inputs[0] input matrix 1, size: nSamples * dim.
+ * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param outputs[0] output matrix, size : nSamples * 1.
+ */
+
+template <DeviceType Device>
+class CosSimForwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");  // the only configuration key read
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 2UL);
+    CHECK_EQ(outputs.size(), 1UL);
+    /// every argument has to be a 2-D matrix
+    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+    /// rows: in1 == out, columns: in1 == in2, out is one column wide
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], 1UL);
+    /// the row count n2 of inputs[1] is not validated in this method
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
+    /// the result is assigned to outputs[0], so only ASSIGN_TO is accepted
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    auto out_mat = outputs[0].matrix<Device>();
+    const auto in1_mat = inputs[0].matrix<Device>();
+    const auto in2_mat = inputs[1].matrix<Device>();
+
+    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
+  }
+
+private:
+  real scale_;  // multiplier from the "scale" config entry
+};
+
+/**
+ * Cosine Similarity Derivative for CpuMatrix
+ * Adds (+=) the gradient of scale * cos(in1[i], in2[i]) to both input
+ * gradients; with a single-row in2 every sample adds to in2_grad row 0.
+ *
+ * \param out_grad  backward loss output grad, size : nSamples * 1.
+ * \param out_val   forward output value, size: nSamples * 1.
+ * \param in1_val   forward input value 1, size: nSamples * dim.
+ * \param in2_val   forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param in1_grad  forward input grad 1, size: nSamples * dim.
+ * \param in2_grad  forward input grad 2, size: n2 * dim.
+ * \param scale     factor the forward pass multiplies into the cosine.
+ */
+template <>
+void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
+                                     const CpuMatrix& out_val,
+                                     const CpuMatrix& in1_val,
+                                     const CpuMatrix& in2_val,
+                                     CpuMatrix& in1_grad,
+                                     CpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
+
+  const real* grad = out_grad.getData();
+  const real* out = out_val.getData();
+  const real* prev_out_x = in1_val.getData();
+  const real* prev_out_y = in2_val.getData();
+  real* prev_grad_x = in1_grad.getData();
+  real* prev_grad_y = in2_grad.getData();
+  /// the sample count comes from out_grad, the row width from in1_val
+  size_t num_samples = out_grad.getHeight();
+  size_t dim = in1_val.getWidth();
+  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
+  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
+  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;  // 0: stay on row 0
+  for (size_t i = 0; i < num_samples; ++i,
+              prev_out_x += dim,
+              prev_out_y += inc,
+              prev_grad_x += dim,
+              prev_grad_y += inc) {
+    real square_sum_x = 0;  // |x|^2
+    real square_sum_y = 0;  // |y|^2
+    real xy = 0;            // <x, y>
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += prev_out_x[j] * prev_out_x[j];
+      square_sum_y += prev_out_y[j] * prev_out_y[j];
+      xy += prev_out_x[j] * prev_out_y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);  // cosine undefined for 0
+    if (xy == 0) {  // orthogonal rows: this branch avoids dividing by xy
+      real reciprocal =
+          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
+        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
+      }
+    } else {  // d(out)/dx = out * (y / <x, y> - x / |x|^2), same form for y
+      real reciprocal_xy = 1.0f / xy;
+      real reciprocal_square_sum_x = 1.0f / square_sum_x;
+      real reciprocal_square_sum_y = 1.0f / square_sum_y;
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] +=
+            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
+                                prev_out_x[j] * reciprocal_square_sum_x);
+        prev_grad_y[j] +=
+            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
+                                prev_out_y[j] * reciprocal_square_sum_y);
+      }
+    }
+  }
+}
+
+/**
+ * Cosine Similarity backward Derivative: checks the argument shapes, then
+ * accumulates the input gradients through CosSimBackward<Device>.
+ *
+ * \param outputs[0] forward input grad 1 (ADD_TO), size: nSamples * dim.
+ * \param outputs[1] forward input grad 2 (ADD_TO),
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param inputs[0] backward loss output grad, size : nSamples * 1.
+ * \param inputs[1] forward output value, size: nSamples * 1.
+ * \param inputs[2] forward input value 1, size: nSamples * dim.
+ * \param inputs[3] forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ */
+template <DeviceType Device>
+class CosSimBackwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");  // the only configuration key read
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 4UL);
+    CHECK_EQ(outputs.size(), 2UL);
+    /// dim of out_grad and out_val == 1, column vector
+    CHECK_EQ(inputs[0].shape()[1], 1UL);
+    CHECK_EQ(inputs[1].shape()[1], 1UL);
+    /// nSamples of out_grad == out_val == in_val1 == in_grad1
+    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(inputs[2].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
+    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
+    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
+
+    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
+          inputs[3].data() && outputs[0].data() && outputs[1].data());
+    /// gradients are added to the outputs, so both must be ADD_TO
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+
+    const auto out_grad = inputs[0].matrix<Device>();
+    const auto out_val = inputs[1].matrix<Device>();
+    const auto in1_val = inputs[2].matrix<Device>();
+    const auto in2_val = inputs[3].matrix<Device>();
+    auto in1_grad = outputs[0].matrix<Device>();
+    auto in2_grad = outputs[1].matrix<Device>();
+
+    CosSimBackward<Device>(
+        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
+  }
+
+private:
+  real scale_;  // multiplier from the "scale" config entry
+};
+
+REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);  // CPU variants
+REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
+#ifndef PADDLE_ONLY_CPU  // GPU variants are left out of CPU-only builds
+REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
+#endif  // PADDLE_ONLY_CPU
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.h b/paddle/function/CosSimOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..be73064e6375bf1e6c6a7ca6de52e9b9b755880b
--- /dev/null
+++ b/paddle/function/CosSimOp.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Cosine Similarity Forward.
+ * for each row i,
+ * out[i] = scale * cos(in1[i], in2[i])
+ *        = scale * \sum_j (in1[i][j] * in2[i][j]) /
+ *                  sqrt(\sum_j (in1[i][j]^2) * \sum_j (in2[i][j]^2))
+ *
+ * \param[out]  output            output value, one entry per row of input1.
+ * \param[in]   input1            input value 1.
+ * \param[in]   input2            input value 2.
+ * \param[in]   scale             factor multiplied into the cosine value.
+ * Only declared here; each device type supplies its own specialization.
+ */
+template <DeviceType Device>
+void CosSimForward(typename Tensor<real, Device>::Matrix& output,
+                   const typename Tensor<real, Device>::Matrix& input1,
+                   const typename Tensor<real, Device>::Matrix& input2,
+                   real scale);
+
+/**
+ * \brief   Cosine Similarity BackWard for Derivative.
+ *
+ * \param[in]       out_grad      backward loss output grad.
+ * \param[in]       out_value     forward-output value.
+ * \param[in]       in1_value     forward input value 1.
+ * \param[in]       in2_value     forward input value 2.
+ * \param[in/out]   in1_grad      forward input grad 1, accumulated into.
+ * \param[in/out]   in2_grad      forward input grad 2, accumulated into.
+ * \param[in]       scale         factor used by the forward computation.
+ *
+ */
+template <DeviceType Device>
+void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
+                    const typename Tensor<real, Device>::Matrix& out_value,
+                    const typename Tensor<real, Device>::Matrix& in1_value,
+                    const typename Tensor<real, Device>::Matrix& in2_value,
+                    typename Tensor<real, Device>::Matrix& in1_grad,
+                    typename Tensor<real, Device>::Matrix& in2_grad,
+                    real scale);
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c62ab39551f02288618244871ae31c6800df5b42
--- /dev/null
+++ b/paddle/function/CosSimOpGpu.cu
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "CosSimOp.h"
+
+namespace paddle {
+
+template<int block_size>  // threads per block; reduction assumes a power of 2
+__global__ void KeCosSim(real* output,
+                         const real* input1,
+                         const real* input2,
+                         int width,
+                         int input1_height,
+                         int input2_height,
+                         real scale) {
+  const int ty = blockIdx.y;  // row (sample) handled by this block
+  int tid = threadIdx.x;
+  // Computes output[ty] = scale * cos(input1 row ty, input2 row ty or row 0).
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+  // Zero this thread's partial sums of x*x, y*y and x*y.
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+  // Widen before multiplying: int ty * int width overflows on big matrices.
+  input1 += static_cast<size_t>(ty) * width;
+  if (input2_height > 1) {  // a single-row input2 is shared by every row
+    input2 += static_cast<size_t>(ty) * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = input1[index];
+    real y = input2[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+  // Shared-memory reduction: afterwards xx[0], yy[0], xy[0] are row totals.
+  for (int s = block_size / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {  // one thread writes the row's result
+    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
+  }
+}
+
+void hlCossim(real* output,  // host-side launcher for the KeCosSim kernel
+              const real* input1,
+              const real* input2,
+              size_t width,
+              size_t input1_height,
+              size_t input2_height,
+              real scale) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(input1);
+  CHECK_NOTNULL(input2);
+  const int block_size = 256;  // threads cooperating on one row
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);  // one block per row of input1
+  // Launch on STREAM_DEFAULT; the size_t sizes narrow to the kernel's ints.
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (output, input1, input2, width, input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossim failed");
+}
+
+template <>  // GPU forward: out[i] = scale * cos(in1[i], in2[i] or in2[0])
+void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
+                                    const GpuMatrix& in1_mat,
+                                    const GpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
+      << "Matrix type are not GPU";
+  /// in2 needs one shared row or one row per sample (same rule as the CPU
+  /// version); any other height lets the kernel index in2 out of bounds.
+  const size_t in2_height = in2_mat.getHeight();
+  CHECK(in2_height == 1LU || in2_height == in1_mat.getHeight());
+  hlCossim(out_mat.getData(), in1_mat.getData(), in2_mat.getData(),
+           in1_mat.getWidth(), in1_mat.getHeight(), in2_height, scale);
+}
+
+template<int block_size>  // threads per block; reduction assumes a power of 2
+__global__ void KeCosSimDerivative(const real* grad,
+                                   const real* output,
+                                   const real* prev_out_x,
+                                   const real* prev_out_y,
+                                   real* prev_grad_x,
+                                   real* prev_grad_y,
+                                   size_t width,
+                                   size_t input1_height,
+                                   size_t input2_height,
+                                   real scale) {
+  const int ty = blockIdx.y;  // row (sample) handled by this block
+  int tid = threadIdx.x;
+  // Adds the cosine-similarity gradient of row ty to prev_grad_x/prev_grad_y.
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+  // Zero this thread's partial sums of x*x, y*y and x*y.
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+  // Move to row ty; a single-row y (input2_height == 1) is shared by all rows.
+  prev_out_x += ty * width;
+  prev_grad_x += ty * width;
+  if (input2_height > 1) {
+    prev_out_y += ty * width;
+    prev_grad_y += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = prev_out_x[index];
+    real y = prev_out_y[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+  // Shared-memory reduction: afterwards xx[0], yy[0], xy[0] are row totals.
+  for (int s = block_size / 2; s > 0; s >>= 1) {
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (xy[0] == 0) {  // orthogonal rows: this branch avoids dividing by xy
+    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] +=
+        scale * grad[ty] * prev_out_y[index] * reciprocal;
+      if (input2_height > 1) {
+        prev_grad_y[index] +=
+          scale * grad[ty] * prev_out_x[index] * reciprocal;
+      } else {  // all blocks add into the same y-grad row, so add atomically
+        paddle::paddleAtomicAdd(prev_grad_y + index,
+          scale * grad[ty] * prev_out_x[index] * reciprocal);
+      }
+    }
+  } else {  // d(out)/dx = out * (y / <x, y> - x / |x|^2), same form for y
+    real reciprocalXY = 1.0 / xy[0];
+    real reciprocalSquareSumX = 1.0 / xx[0];
+    real reciprocalSquareSumY = 1.0 / yy[0];
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] += output[ty] * grad[ty] *
+        (prev_out_y[index] * reciprocalXY -
+         prev_out_x[index] * reciprocalSquareSumX);
+      if (input2_height > 1) {
+        prev_grad_y[index] += output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY);
+      } else {  // all blocks add into the same y-grad row, so add atomically
+        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY));
+      }
+    }
+  }
+}
+
+void hlCossimDerivative(const real* grad,  // launcher for KeCosSimDerivative
+                        const real* output,
+                        const real* prev_out_x,
+                        const real* prev_out_y,
+                        real* prev_grad_x,
+                        real* prev_grad_y,
+                        size_t width,
+                        size_t input1_height,
+                        size_t input2_height,
+                        real scale) {
+  CHECK_NOTNULL(grad);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(prev_out_x);
+  CHECK_NOTNULL(prev_out_y);
+  CHECK_NOTNULL(prev_grad_x);
+  CHECK_NOTNULL(prev_grad_y);
+  const int block_size = 256;  // threads cooperating on one row
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);  // one block per row of input 1
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
+        input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossimDerivative failed");  // message names this function
+}
+
+template <>  // GPU backward: validates the matrices, then launches the kernel
+void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                     const GpuMatrix& out_val,
+                                     const GpuMatrix& in1_val,
+                                     const GpuMatrix& in2_val,
+                                     GpuMatrix& in1_grad,
+                                     GpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
+        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+        << "Matrix types are not equally GPU";
+  /// in2 holds one shared row or one row per sample (same rule as the CPU
+  /// version); any other height lets the kernel index in2 out of bounds.
+  const size_t num_samples = in1_val.getHeight();
+  const size_t in2_height = in2_val.getHeight();
+  CHECK(in2_height == in2_grad.getHeight());
+  CHECK(in2_height == 1LU || in2_height == num_samples);
+
+  const size_t dim = in1_val.getWidth();
+  hlCossimDerivative(out_grad.getData(),
+                     out_val.getData(),
+                     in1_val.getData(),
+                     in2_val.getData(),
+                     in1_grad.getData(),
+                     in2_grad.getData(),
+                     dim,
+                     num_samples,
+                     in2_height,
+                     scale);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48c815f027161b48c17ce654ab819156fd856199
--- /dev/null
+++ b/paddle/function/CosSimOpTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void testCosSimForward(size_t height_x,
+                       size_t height_y,
+                       size_t width,
+                       real scale) {
+  FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  // Exercise "CosSimForward" with x: height_x * width, y: height_y * width.
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
+                  ASSIGN_TO);
+  // run the comparison (presumably CPU vs GPU output; see FunctionTest.h)
+  test.run();
+}
+
+void testCosSimBackward(size_t height_x,
+                        size_t height_y,
+                        size_t width,
+                        real scale) {
+  FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  // inputs in order: out_grad, out_val (columns), then in1_val and in2_val
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
+                  ADD_TO);  // in1_grad
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
+                  ADD_TO);  // in2_grad
+  // run the comparison (presumably CPU vs GPU output; see FunctionTest.h)
+  test.run();
+}
+
+TEST(Matrix, cosSim) {  // sweeps sizes, both input-2 layouts and two scales
+  for (auto height_x : {10, 100, 1000}) {
+    for (auto height_y : {1, height_x}) {  // one shared row, or one per sample
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSimForward(height_x, height_y, width, scale);
+          testCosSimBackward(height_x, height_y, width, scale);
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 96a7a30eebbf0f01fa89ea91110ddb826fd2f64b..ef878bfbba961bdd3d5212e19fb83bb1e285e47f 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -112,49 +112,114 @@ void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
 }
 
 /**
- * \param inputs[0] input value.
- * \param outputs[0] output value.
- * \param outputs[1] denoms.
+ * \brief Normalization with across maps.
+ *
+ * This Function comes from the paper
+ * "ImageNet Classification with Deep Convolutional Neural Networks".
+ *
+ * The original formula is:
+ *
+ *                                Input(i, x, y)
+ * Output(i, x, y) = ----------------------------------------------
+ *                                 -- upper
+ *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+ *                                 -- j = lower
+ *
+ * upper is `min(C, c + N/2)`
+ * lower is `max(0, c - N/2)`
+ *
+ * Function implementation:
+ *
+ * inputs and outputs are in NCHW format, so input.shape.ndims() equals 4.
+ * And the meaning of each dimension(0-3) is respectively batch size,
+ * feature maps, rows and columns.
+ *
+ * Input and Output in the above formula is for each map(i) of one image, and
+ * Input(i, x, y), Output(i, x, y) represents an element in an image.
+ *
+ * C is the number of feature maps of one image, and N is a hyper-parameter
+ * configured when the Function is initialized. The sum in the denominator
+ * is the sum of the same position in the neighboring maps.
+ *
+ * In the implementation of Function, k is equal to 1,
+ * so Function has no argument for k.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent Input
+ * \param outputs[0] represent Output
+ * \param outputs[1] represent The denominator in the formula(except beta)
+ *
+ * Note:
+ * Save output[1] is to simplify the backward calculation.
+ * TODO, if only consider the forward calculation, we can optimize to
+ * remove the output[1].
  */
 template <DeviceType Device>
 class CrossMapNormalFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
+    // function arguments
     size_ = config.get<size_t>("size");
     scale_ = config.get<real>("scale");
     pow_ = config.get<real>("pow");
+
+    // number of inputs and outputs
+    numInputs_ = 1;
+    numOutputs_ = 2;
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(1, static_cast<int>(inputs.size()));
-    CHECK_EQ(2, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
-    }
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    check(inputs, outputs);
+    // ArgType check still on here,
+    // not sure whether it is better to put inside the check.
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
 
-    size_t samples = inputs[0].dims_[0];
-    size_t channels = inputs[0].dims_[1];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
-
-    CrossMapNormal<Device>(outputs[0].getData(),
-                           outputs[1].getData(),
-                           inputs[0].getData(),
-                           samples,
-                           channels,
-                           height,
-                           width,
+    CrossMapNormal<Device>(outputs[0].data<real>(),
+                           outputs[1].data<real>(),
+                           inputs[0].data<real>(),
+                           batchSize,
+                           maps,
+                           rows,
+                           columns,
                            size_,
                            scale_,
                            pow_);
   }
 
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());  // expected counts are set in init()
+    CHECK_EQ(numOutputs_, outputs.size());
+    // the input must be 4-D and both outputs must have exactly its shape
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
+  }
+
+  // Estimates the floating-point operation count of one forward call;
+  // only the shape of inputs[0] is read, outputs is not used.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)numInputs_, inputs.size());
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // number of floating-point operations, an approximate value:
+    // (size_ * 2 + 3) operations for every element of the input
+    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
+
+    return ops;
+  }
+
 private:
   size_t size_;
   real scale_;
@@ -162,55 +227,107 @@ private:
 };
 
 /**
- * \param inputs[0] input value.
- * \param inputs[1] output value.
- * \param inputs[2] output grad.
- * \param inputs[3] denoms.
- * \param outputs[0] input grad.
+ * \brief Backward calculation for normalization with across maps.
+ *
+ * Function implementation:
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * denoms ^ (-beta)
+ *    -- upper
+ *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
+ *    -- lower
+ *
+ * The data of inputs/outputs format is the same as the forward interface
+ * and is NCHW.
+ *
+ * The upper and lower is the same as forward. The logic of the sum
+ * is also the same as forward.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
+ * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
+ * \param inputs[2]  represent OutputGrad
+ * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
+ *                   This is the intermediate result that is
+ *                   preserved in the forward calculation.
+ * \param outputs[0] represent InputGrad
  */
 template <DeviceType Device>
 class CrossMapNormalGradFunc : public FunctionBase {
 public:
   void init(const FuncConfig& config) override {
+    // function arguments
     size_ = config.get<size_t>("size");
     scale_ = config.get<real>("scale");
     pow_ = config.get<real>("pow");
+
+    // number of inputs and outputs
+    numInputs_ = 4;
+    numOutputs_ = 1;
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(4, static_cast<int>(inputs.size()));
-    CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    check(inputs, outputs);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
     }
 
-    size_t samples = inputs[0].dims_[0];
-    size_t channels = inputs[0].dims_[1];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
-
-    CrossMapNormalGrad<Device>(outputs[0].getData(),
-                               inputs[0].getData(),
-                               inputs[1].getData(),
-                               inputs[2].getData(),
-                               inputs[3].getData(),
-                               samples,
-                               channels,
-                               height,
-                               width,
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
+                               inputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               inputs[2].data<real>(),
+                               inputs[3].data<real>(),
+                               batchSize,
+                               maps,
+                               rows,
+                               columns,
                                size_,
                                scale_,
                                pow_);
   }
 
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());  // expected counts are set in init()
+    CHECK_EQ(numOutputs_, outputs.size());
+    // inputs[0] must be 4-D; every other argument must have exactly its shape
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
+    CHECK(inputs[0].shape() == inputs[3].shape());
+    CHECK(inputs[0].shape() == outputs[0].shape());
+  }
+
+  // Estimates the floating-point operation count of one backward call;
+  // only the shape of inputs[0] is read, outputs is not used.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_LT((size_t)1, inputs.size());  // requires more than one input
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // number of floating-point operations, an approximate value:
+    // (size_ * 4 + 2) operations for every element of the input
+    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
+
+    return ops;
+  }
+
 private:
   size_t size_;
   real scale_;
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index d65d9310affd7c9b7fee3118c79449870849c243..51f5da81bfc9ae870ac9949ba74da01a9449a04d 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -27,15 +27,19 @@ TEST(CrossMapNormal, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormal",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
-                               {});
+            // init Test object
+            FunctionCompare test("CrossMapNormal",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            // prepare input arguments
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
@@ -53,18 +57,19 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormalGrad",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims)},
-                               {});
+            FunctionCompare test("CrossMapNormalGrad",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 614e76b8ac0c9a9145a27f5b532ea63bef7f90f0..f71c0f681b3bc524ba96c55f1dcad30ef59478c8 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -16,64 +16,28 @@ limitations under the License. */
 
 namespace paddle {
 
-template <>
-size_t FuncConfig::get<size_t>(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.s;
+void BufferArgs::addArg(const Matrix& arg,
+                        const TensorShape& shape,
+                        ArgType argType) {
+  _args_.push_back(new BufferArg(arg, shape, argType));
+  addArg(*_args_.back());
 }
 
-template <>
-real FuncConfig::get<real>(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.r;
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
-template <>
-int FuncConfig::get<int>(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.i;
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
-template <>
-bool FuncConfig::get<bool>(const std::string& key) const {
-  auto it = valueMap_.find(key);
-  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
-  return it->second.b;
-}
-
-template <>
-FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].s = v;
-  return *this;
-}
-
-template <>
-FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].r = v;
-  return *this;
-}
-
-template <>
-FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].i = v;
-  return *this;
-}
-
-template <>
-FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
-  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
-                                                      << key;
-  valueMap_[key].b = v;
-  return *this;
+void BufferArgs::addArg(const Matrix& matrix,
+                        const IVector& vector,
+                        ArgType argType) {
+  _args_.push_back(new SequenceArg(matrix, vector, argType));
+  addArg(*_args_.back());
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 9e8cbb8e48c30e80c5057fc53c050b67d3957188..15eb35b7f7dac1b98f2d8694707d83b84bda0f2e 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -16,87 +16,190 @@ limitations under the License. */
 
 #include <map>
 #include <vector>
+#include "BufferArg.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Any.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
-enum DeviceType {
-  DEVICE_TYPE_UNSPECIFIED = 0,
-  DEVICE_TYPE_CPU = 1,
-  DEVICE_TYPE_GPU = 2,
-};
-
-template <DeviceType Device>
-struct MatrixT;
-
-template <>
-struct MatrixT<DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-
-template <>
-struct MatrixT<DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-
-template <DeviceType Device>
-struct SequenceT;
-
-template <>
-struct SequenceT<DEVICE_TYPE_CPU> {
-  using type = CpuIVector;
-};
-
-template <>
-struct SequenceT<DEVICE_TYPE_GPU> {
-  using type = GpuIVector;
-};
-
-typedef std::vector<size_t> Dims;
-
-class Tensor {
-public:
-  Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {}
-
-  real* getData() const { return buf_; }
-
-  real* buf_;
-  Dims dims_;
-};
-
-typedef std::vector<Tensor> Arguments;
-
+/**
+ * Function Configuration.
+ * The argument type of Function::init.
+ */
 class FuncConfig {
 public:
-  union value {
-    size_t s;
-    real r;
-    int i;
-    bool b;
-  };
-
   template <typename T>
-  T get(const std::string& key) const;
+  T get(const std::string& key, Error* err = nullptr) const {
+    try {
+      return any_cast<T>(valueMap_.at(key));
+    } catch (std::exception& e) {  // could be cast or out of range exception.
+      if (err) {
+        *err = Error(e.what());  // report through err when the caller gave one
+      } else {
+        LOG(FATAL) << "Cannot get key " << key << " with error " << e.what();
+      }
+      return T();  // value-initialized T is returned on failure
+    }
+  }
 
   template <typename T>
-  FuncConfig& set(const std::string& key, T v);
+  FuncConfig& set(const std::string& key, T v, Error* err = nullptr) {
+    auto it = valueMap_.find(key);
+    if (it != valueMap_.end()) {  // already contains key.
+      if (err) {
+        *err = Error("Key %s is already set in FuncConfig", key.c_str());
+      } else {
+        LOG(FATAL) << "Key " << key << " is already set in FuncConfig.";
+      }
+      return *this;  // the existing value is left unchanged
+    }
+    valueMap_[key] = any(v);  // store v wrapped in an any
+    return *this;  // returning *this lets set() calls be chained
+  }
 
 protected:
-  std::map<std::string, value> valueMap_;
+  mutable std::unordered_map<std::string, any> valueMap_;
+};
+
+/**
+ * Argument type for Function::calc().
+ * A BufferArgs contains a set of BufferArg,
+ * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with a Matrix object is used to adapt a Layer Argument.
+ * It creates a BufferArg object inside addArg(),
+ * which is freed in the destructor of BufferArgs.
+ *
+ * addArg() with a BufferArg object only saves the object's address,
+ * so the caller must guarantee the validity of that BufferArg object
+ * for the lifetime of the BufferArgs.
+ */
+class BufferArgs {
+public:
+  BufferArgs() {}
+
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
+
+  size_t size() const { return args_.size(); }
+
+  // add argument into BufferArgs
+  // Tensor can be Matrix, Vector, IVector.
+  // For inputs, do not need argType.
+  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  // Add arg into BufferArgs and reshape the arg.
+  //
+  // For example, arg represents an image buffer,
+  // but Matrix can only represent a two-dimensional Tensor.
+  // So need an extra argument to describe the shape of the image buffer.
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
+
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
+
+  // get argument
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
+
+private:
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
 };
 
+/**
+ * \brief Base class for Function.
+ * A basic Function implementation must override the init and calc interfaces.
+ *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
+ * Function inputs are read-only; Function outputs have two modes: ASSIGN_TO
+ * and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
+ * result of the Function is assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode, and the calculation
+ * result of the Function is added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO: output += Function(inputs)
+ * If Function has more than one output, each output can have different modes.
+ */
 class FunctionBase {
 public:
   virtual ~FunctionBase() {}
 
   virtual void init(const FuncConfig& config) {}
 
-  virtual void calc(const Arguments& inputs,
-                    const Arguments& outputs,
-                    const Arguments& inouts) {}
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // This member function is used to check whether the BufferType and shape of
+  // the input and output arguments of the Function are correct.
+  // In general, the calc function calls this check to validate its arguments.
+  // The caller can also use it to check its arguments before calling calc.
+  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // Calculate the number of floating-point operations of this Function.
+  // The inputs and outputs arguments do not need to contain the actual data,
+  // only the shape.
+  // Some Functions have the same input and output shapes,
+  // so you may not need to pass the complete set of arguments.
+  // Passing the full set of arguments is always correct for this interface.
+  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
+    return 0;
+  }
+
+  int getNumInputs() const { return numInputs_; }
+
+  int getNumOutputs() const { return numOutputs_; }
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
+
+protected:
+  // numInputs_ and numOutputs_ represent the maximum number of
+  // inputs and outputs supported by the Function.
+  // Some functions are optimized for input and output,
+  // so when comparing the number of arguments, for these functions
+  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_.
+  size_t numInputs_;
+  size_t numOutputs_;
 };
 
 #define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdf7e631e5ab8c67eb5cf906bd0af49740d60112
--- /dev/null
+++ b/paddle/function/FunctionTest.cpp
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+template <DeviceType DType>
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100);
+  EXPECT_EQ(output.getWidth(), 200);
+}
+
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10);
+  EXPECT_EQ(output.getWidth(), 20);
+}
+
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  const auto input = arguments[0].matrix<DType>();
+  auto output = arguments[1].matrix<DType>();
+  FunctionApi<DType>(output, input);
+}
+
+TEST(Function, BufferArgs) {
+  CpuMatrix cpuInput = CpuMatrix(100, 200);
+  CpuMatrix cpuOutput = CpuMatrix(100, 200);
+  BufferArgs cpuArgments;
+  cpuArgments.addArg(cpuInput);
+  cpuArgments.addArg(cpuOutput);
+  Function<DEVICE_TYPE_CPU>(cpuArgments);
+
+  GpuMatrix gpuInput = GpuMatrix(10, 20);
+  GpuMatrix gpuOutput = GpuMatrix(10, 20);
+  BufferArgs gpuArgments;
+  gpuArgments.addArg(gpuInput);
+  gpuArgments.addArg(gpuOutput);
+  Function<DEVICE_TYPE_GPU>(gpuArgments);
+}
+
+/**
+ * These test cases check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  ASSERT_EQ(inputs.size(), check.size());  // fatal: check[i] is indexed below
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);  // run the i-th checker on the i-th argument
+  }
+}
+
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  ASSERT_EQ(inputs.size(), 1UL);  // fatal: inputs[0] is accessed below
+  check(inputs[0]);  // run the single checker on the only argument
+}
+
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.data(), matrix->getData());
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.data(), vector->getData());
+
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[2], 3);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 32131037f6de4a9f7a3ebf8f5773eccd65dc2cdb..0cfafdb27f55a3e6617d31a968d2a05fc77f5b46 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -13,97 +13,336 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Function.h"
-#include "paddle/math/Vector.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 
 namespace paddle {
 
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare input and output arguments.
+ *  // Here the inputs and outputs need not contain real data;
+ *  // only the argument type and shape are required.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // According to the type and shape of the arguments (inputs/outputs),
+ *  // the arguments required by the cpu and gpu functions
+ *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_) are initialized.
+ *  // The CPU and GPU Functions are then called,
+ *  // and their calculation results are compared for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
-    gpu->init(config);
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpuFunc_->init(config);
+    gpuFunc_->init(config);
   }
 
-  void cmpWithArg(const Arguments& inputs,
-                  const Arguments& outputs,
-                  const Arguments& inouts) {
-    // init cpu and gpu arguments
-    auto initArgs = [=](
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (const auto arg : inArgs) {
-        size_t size = sizeof(real);
-        for (const auto dim : arg.dims_) {
-          size *= dim;
-        }
-        if (arg.getData()) {
-          // todo(tianbing), waste unnecessary mem here
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          // already init outside
-        } else {
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(
-              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-          gpuArgs.emplace_back(
-              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-          // will use an api to refactor this code.
-          CpuVector cpuVector(size / sizeof(real),
-                              (real*)cpuArgs.back().getData());
-          GpuVector gpuVector(size / sizeof(real),
-                              (real*)gpuArgs.back().getData());
-          cpuVector.uniform(0.001, 1);
-          gpuVector.copyFrom(cpuVector);
-        }
-      }
-    };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
+  ~FunctionCompare() {}
+
+  // The input only needs to contain the shape; it need not contain data.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // assume one copy of sequence is shared by different SequenceArgs
+  void addSequence(const SequenceIdArg& input) {
+    CHECK_EQ(input.shape().ndims(), 1UL);
+    size_t batchSize = input.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
+    cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
+                                              TensorShape{numSeqs + 1});
+    gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
+                                              TensorShape{numSeqs + 1});
+    /// init sequence Id
+    initArg(*cpuSeq_, batchSize);
+
+    // todo(tianbing), delete it
+    CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
+
+    CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
+    GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
+    gpuSeq.copyFrom(cpuSeq);
+  }
+
+  void addInputs(const SequenceArg& input) {
+    CHECK_EQ(input.shape().ndims(), 2UL);
+    size_t batchSize = input.shape()[0];
+    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    /// SequenceArg
+    cpuInputs_.emplace_back(
+        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+                                      input.valueType(),
+                                      input.shape(),
+                                      *cpuSeq_));
+    gpuInputs_.emplace_back(
+        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+                                      input.valueType(),
+                                      input.shape(),
+                                      *gpuSeq_));
+  }
+
+  // The output only needs to contain the shape; it need not contain data.
+  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    argType));
+  }
+
+  /// add and init output sparse matrix
+  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
+    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+        output.shape()[0],
+        output.shape()[1],
+        output.nnz(),
+        static_cast<SparseValueType>(output.dataType()),
+        static_cast<SparseFormat>(output.dataFormat()));
+
+    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+        output.shape()[0],
+        output.shape()[1],
+        output.nnz(),
+        static_cast<SparseValueType>(output.dataType()),
+        static_cast<SparseFormat>(output.dataFormat()));
+
+    /// init sparse matrix
+    hl_stream_t stream(HPPL_STREAM_1);
+    cpuSparse_->randomizeUniform();
+    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    hl_stream_synchronize(stream);
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
+  }
+
+  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
+    CHECK_EQ(output.shape().ndims(), 2UL);
+    size_t batchSize = output.shape()[0];
+
+    if (!cpuSeq_ || !gpuSeq_) {  // sequence not exist
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    /// SequenceArg
+    cpuOutputs_.emplace_back(
+        std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
+                                      output.valueType(),
+                                      output.shape(),
+                                      *cpuSeq_,
+                                      argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
+                                      output.valueType(),
+                                      output.shape(),
+                                      *gpuSeq_,
+                                      argType));
+  }
+
+  void addInputs(const SparseMatrixArg& input) {
+    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+        input.shape()[0],
+        input.shape()[1],
+        input.nnz(),
+        static_cast<SparseValueType>(input.dataType()),
+        static_cast<SparseFormat>(input.dataFormat()));
+
+    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+        input.shape()[0],
+        input.shape()[1],
+        input.nnz(),
+        static_cast<SparseValueType>(input.dataType()),
+        static_cast<SparseFormat>(input.dataFormat()));
+
+    /// init sparse matrix
+    hl_stream_t stream(HPPL_STREAM_1);
+    cpuSparse_->randomizeUniform();
+    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    hl_stream_synchronize(stream);
 
+    cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
+    gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
+  }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    initInputs();
+
+    initOutputs();
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
-
-    // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
-          size *= dim;
-        }
-        CpuVector cpuVector(size, (real*)cpu.getData());
-        GpuVector gpuVector(size, (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
+      }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
       }
+      function->calc(inArgs, outArgs);
     };
-    checkArgs(cpuOutputs, gpuOutputs);
-    checkArgs(cpuInouts, gpuInouts);
+
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
+
+    // check outputs
+    compareOutputs();
+  }
+
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+
+protected:
+  // only init cpu argument, gpu argument copy from cpu argument.
+  void initArg(BufferArg& arg) {
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceArg& arg) {
+    /// init only matrix
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
+
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = 1 + uniformRandom(std::min<int64_t>(
+                        maxLen, batchSize - pos - numSeqs + i));
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
+
+  void initInputs() {
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      if (cpuInputs_[i]->isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (cpuInputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*cpuInputs_[i]));
+      } else {
+        initArg(*cpuInputs_[i]);
+      }
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
+                          (real*)cpuInputs_[i]->data());
+      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
+                          (real*)gpuInputs_[i]->data());
+
+      gpuVector.copyFrom(cpuVector);
+    }
   }
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  void initOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      if (cpuOutputs_[i]->isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (cpuOutputs_[i]->isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(*cpuOutputs_[i]));
+      } else {
+        initArg(*cpuOutputs_[i]);
+      }
+
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
+                          (real*)cpuOutputs_[i]->data());
+      GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
+                          (real*)gpuOutputs_[i]->data());
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+      gpuVector.copyFrom(cpuVector);
+    }
+  }
+
+  void compareOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      const auto cpu = cpuOutputs_[i];
+      const auto gpu = gpuOutputs_[i];
+      CHECK_EQ(cpu->numElements(), gpu->numElements());
+      CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
+      GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
+  }
 
 protected:
-  std::shared_ptr<FunctionBase> cpu;
-  std::shared_ptr<FunctionBase> gpu;
-  std::vector<CpuMemHandlePtr> cpuMemory;
-  std::vector<GpuMemHandlePtr> gpuMemory;
-  Arguments cpuInputs;
-  Arguments cpuOutputs;
-  Arguments cpuInouts;
-  Arguments gpuInputs;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
+  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> gpuFunc_;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
+  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> gpuOutputs_;
+  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
+  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
+  std::shared_ptr<SequenceIdArg> cpuSeq_;
+  std::shared_ptr<SequenceIdArg> gpuSeq_;
 };
 
 }  // namespace paddle
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91b4b8ed91b6055babcfbab8f7adb2c55e2747d0
--- /dev/null
+++ b/paddle/function/MulOp.cpp
@@ -0,0 +1,354 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MulOp.h"
+/// todo(tianbing), delete it
+#include <iostream>
+#include "paddle/math/MathFunctions.h"
+#include "paddle/math/SIMDFunctions.h"
+#include "paddle/utils/ThreadLocal.h"
+
+#ifndef PADDLE_TYPE_DOUBLE
+#define GEMM paddle::gemm<float>
+#else
+#define GEMM paddle::gemm<double>
+#endif
+
+namespace {
+inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
+  for (size_t i = 0; i < len; ++i) {  // a[i] += scaleB * b[i], i in [0, len)
+    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
+  }
+}
+
+inline void colVecAddTo(
+    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
+  for (size_t i = 0; i < len; ++i) {  // strided: a and b step by their widths
+    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
+  }
+}
+}  // namespace
+
+namespace paddle {
+/// sparse out = scaleAB * op(a) * op(b) + scaleT * out, at stored entries only
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* A = a.getData();
+  const real* B = b.getData();
+  real* C = out.getValue();
+  int* rows = out.getRows();
+  int* cols = out.getCols();
+  size_t width = out.getWidth();
+  size_t height = out.getHeight();
+
+  /// SPARSE_CSC, {a any, b not trans}
+  if (out.getFormat() == SPARSE_CSC) {
+    /// b must not be transposed; m is the inner (reduction) dimension
+    CHECK(!bTrans);
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
+    for (size_t i = 0; i < width; i++) {
+      size_t start = out.getColStartIdx(i);
+      size_t end = out.getColStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t rowIdx = rows[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
+                 B[k * width + i];
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
+  if (out.getFormat() == SPARSE_CSR) {
+    /// a and b can not both transpose; m is the inner dimension of op(a)
+    CHECK(!(aTrans && bTrans));
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
+    for (size_t i = 0; i < height; i++) {
+      size_t start = out.getRowStartIdx(i);
+      size_t end = out.getRowStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t colIdx = cols[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
+                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+}
+
+/// dense matrix (+)= dense matrix * dense matrix; delegates to GEMM
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  GEMM(aTrans ? CblasTrans : CblasNoTrans,
+       bTrans ? CblasTrans : CblasNoTrans,
+       out.getHeight(),  // rows of out
+       out.getWidth(),   // columns of out
+       !aTrans ? a.getWidth() : a.getHeight(),  // columns of op(a)
+       scaleAB,
+       a.getData(),
+       a.getStride(),
+       b.getData(),
+       b.getStride(),
+       scaleT,
+       out.getData(),
+       out.getStride());
+}
+
+/// dense out (+)= sparse a * dense b; scaleAB and bTrans are not read here
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuSparseMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {  // any nonzero scaleT accumulates into out unscaled
+    out.zeroMem();
+  }
+  const real* B = b.getData();
+  real* C = out.getData();
+  if (out.getWidth() % 32 == 0) {  // alignment is only asserted in this case
+    CHECK_EQ((size_t)B % 32, 0UL);
+    CHECK_EQ((size_t)C % 32, 0UL);
+  }
+
+  int* cols = a.getCols();
+  real* values = a.getValue();
+  for (size_t i = 0; i < a.getHeight(); ++i) {  // each stored row of a
+    const int start = a.getRowStartIdx(i);
+    const int end = a.getRowStartIdx(i + 1);
+    for (int j = start; j < end; ++j) {  // each stored entry (i, cols[j])
+      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
+               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
+                       : const_cast<CpuMatrix&>(b).getRow(i),
+               (a.getValueType() == FLOAT_VALUE) ? values[j] : (real)1.0,
+               out.getWidth());
+    }
+  }
+}
+
+/// dense out (+)= dense a * sparse b; scaleAB and aTrans are not read here
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {  // any nonzero scaleT accumulates into out unscaled
+    out.zeroMem();
+  }
+  real* A = const_cast<real*>(a.getData());
+  real* B = const_cast<real*>(b.getValue());
+  real* C = out.getData();
+  int* rows = b.getRows();
+  int* cols = b.getCols();
+
+  /// SPARSE_CSC format: walk b column by column
+  if (b.getFormat() == SPARSE_CSC) {
+    for (size_t j = 0; j < b.getWidth(); ++j) {
+      int start = b.getColStartIdx(j);
+      int end = b.getColStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {  // stored entry (rows[i], j) of b
+        colVecAddTo(!bTrans ? C + j : C + rows[i],
+                    !bTrans ? A + rows[i] : A + j,
+                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR format: walk b row by row
+  if (b.getFormat() == SPARSE_CSR) {
+    for (size_t j = 0; j < b.getHeight(); ++j) {
+      int start = b.getRowStartIdx(j);
+      int end = b.getRowStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {  // stored entry (j, cols[i]) of b
+        colVecAddTo(!bTrans ? C + cols[i] : C + j,
+                    !bTrans ? A + j : A + cols[i],
+                    (b.getValueType() == NO_VALUE) ? (real)1.0 : B[i],
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+}
+
+/**
+ * mul operator
+ * out = scaleT * out + scaleAB * (A * B)
+ * here, scaleT in {0, 1}, scaleAB == 1,
+ * out = A * B, ASSIGN_TO
+ * out += A * B, ADD_TO
+ *
+ * Config: bool "aTrans" / "bTrans" (read in init); both true is rejected.
+ * \param outputs[0]      output matrix (out), M * N,
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, N is num of columns
+ * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, K is num of columns
+ * \param inputs[1]       second input matrix (B), K * N (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        K is num of rows, N is num of columns
+ *
+ * Support eight Mul operators, with both GPU and CPU devices
+ * For each device, four Mul operators are supported:
+ * 1. dense (out) = dense (A) * dense (B)
+ * 2. dense (out) = sparse (A) * dense (B)
+ *    sparse matrix only support SPARSE_CSR format
+ * 3. dense (out) = dense (A) * sparse (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ * 4. sparse (out) = dense (A) * dense (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ *
+ */
+template <DeviceType Device>
+class MulFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    aTrans_ = config.get<bool>("aTrans");
+    bTrans_ = config.get<bool>("bTrans");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(!aTrans_ || !bTrans_)
+        << "Not support both a and b are transpose matrices";
+
+    CHECK_EQ((size_t)2, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+
+    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
+    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
+    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
+    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
+    /// C = A * B, or C += A * B, for matrix format
+    CHECK_EQ(aCol, bRow);
+    CHECK_EQ(aRow, outputs[0].shape()[0]);
+    CHECK_EQ(bCol, outputs[0].shape()[1]);
+
+    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
+    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
+
+    /// supported: a dense output with at most one sparse input,
+    /// or a sparse output with two dense inputs
+    CHECK((!outputs[0].isSparseArg() &&
+           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
+          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
+           !inputs[1].isSparseArg()));
+
+    auto outMat = outputs[0].matrix<Device>();
+    /// dense matrix = dense matrix * dense matrix
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = dense matrix * sparse matrix
+    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!aTrans_) << "Not supported a transpose";
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].sparse().SparseMatrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = sparse matrix * dense matrix
+    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!bTrans_) << "Not supported b transpose";
+      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
+          << "Only supported SPARSE_CSR format for sparse matrix a";
+      MulOp<Device>(outMat,
+                    inputs[0].sparse().SparseMatrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// sparse matrix = dense matrix * dense matrix (the only case left)
+    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        outputs[0].isSparseArg()) {
+      MulOp<Device>(outSparseMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+  }
+
+private:
+  bool aTrans_;  // config key "aTrans", assigned in init()
+  bool bTrans_;  // config key "bTrans", assigned in init()
+};
+
+REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6016a6ab6e9d6549b359573ecc2b33900a58365
--- /dev/null
+++ b/paddle/function/MulOp.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+/// CPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuSparseMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuSparseMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuSparseMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuSparseMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dcfcb2325d7dae22e0e0e78fc0bddf061fc0940c
--- /dev/null
+++ b/paddle/function/MulOpGpu.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "MulOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+/// dense matrix (+)= dense matrix * dense matrix; delegates to hl_matrix_mul
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_matrix_mul(const_cast<real*>(a.getData()),
+                !aTrans ? HPPL_OP_N : HPPL_OP_T,
+                const_cast<real*>(b.getData()),
+                !bTrans ? HPPL_OP_N : HPPL_OP_T,
+                const_cast<real*>(out.getData()),
+                out.getHeight(),  // rows of out
+                out.getWidth(),   // columns of out
+                !aTrans ? a.getWidth() : a.getHeight(),  // columns of op(a)
+                scaleAB,
+                scaleT,
+                a.getStride(),
+                b.getStride(),
+                out.getStride());
+}
+
+/// dense matrix (+)= sparse matrix * dense matrix (a is handed over as CSR)
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuSparseMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(b.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
+                          aTrans ? HPPL_OP_T : HPPL_OP_N,
+                          const_cast<real*>(b.getData()),
+                          HPPL_OP_N,  // fixed: bTrans is not read here
+                          const_cast<real*>(out.getData()),
+                          out.getHeight(),
+                          out.getWidth(),
+                          b.getHeight(),  // rows of b
+                          scaleAB,
+                          scaleT);
+}
+
+/// dense matrix (+)= dense matrix * sparse matrix (b in CSC or CSR format)
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(a.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+
+  if (b.format_ == SPARSE_CSC) {
+    hl_matrix_dense_mul_csc(const_cast<real*>(a.getData()),
+                            HPPL_OP_N,  // fixed: aTrans is not read here
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            const_cast<real*>(out.getData()),
+                            out.getHeight(),
+                            out.getWidth(),
+                            a.getWidth(),  // columns of a
+                            scaleAB,
+                            scaleT);
+  } else {  // every format other than SPARSE_CSC takes the CSR routine
+    hl_matrix_dense_mul_csr(const_cast<real*>(a.getData()),
+                            HPPL_OP_N,  // fixed: aTrans is not read here
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            const_cast<real*>(out.getData()),
+                            out.getHeight(),
+                            out.getWidth(),
+                            a.getWidth(),  // columns of a
+                            scaleAB,
+                            scaleT);
+  }
+}
+
+/// sparse matrix (+)= dense matrix * dense matrix; uses hl_sparse_matrix_mul
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  hl_sparse_matrix_mul(const_cast<real*>(a.getData()),
+                       aTrans ? HPPL_OP_T : HPPL_OP_N,
+                       const_cast<real*>(b.getData()),
+                       bTrans ? HPPL_OP_T : HPPL_OP_N,
+                       out.sMatrix_.get(),
+                       out.getHeight(),
+                       out.getWidth(),
+                       !bTrans ? b.getHeight() : b.getWidth(),  // rows of op(b)
+                       scaleAB,
+                       scaleT);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8753057ebf73c99336b2f5d9c610e4aaf293f845
--- /dev/null
+++ b/paddle/function/MulOpTest.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/math/tests/test_matrixUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+/**
+ *  Dense case: C (M x N) += op(A) * op(B), with A, B and C all dense.
+ *  A is stored K x M when transa is set, B is stored N x K when transb is set.
+ */
+void testFuncDDDMatrix(
+    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
+  const real scaleT = 1.0;
+  // Stored extents; a transposed operand swaps its two dimensions.
+  const size_t rowsA = transa ? dimK : dimM;
+  const size_t colsA = transa ? dimM : dimK;
+  const size_t rowsB = transb ? dimN : dimK;
+  const size_t colsB = transb ? dimK : dimN;
+
+  // Function under test, configured with the requested transposes.
+  FunctionCompare test(
+      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
+
+  // Inputs: A first, then B.
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{rowsA, colsA}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{rowsB, colsB}));
+
+  // Output C is M x N; ADD_TO is selected because scaleT is 1.
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDDMatrixMul) {
+  LOG(INFO) << "function test for dense = dense * dense matrix";
+  for (const auto transa : {false, true}) {
+    for (const auto transb : {false, true}) {
+      for (const auto dimM : {1, 10, 100}) {
+        for (const auto dimN : {1, 10}) {
+          for (const auto dimK : {8}) {
+            if (transa && transb) {
+              continue;
+            }
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK;
+            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * C += A * B, B, C dense, A sparse
+ * dense = sparse * dense
+ */
+void testFuncDSparseDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// sparse matrix A : M * K
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
+  /// matrix B: K * N
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  /// output matrix C: M * N
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DSparseDMul) {  // suite name spelled like the other MulOp tests
+  LOG(INFO) << "function test for dense = sparse * dense matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * C += A * B, A, C dense, B sparse
+ * dense = dense * sparse
+ */
+void testFuncDDSparseMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// matrix A : M * K
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  /// matrix B: K * N
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
+
+  /// output matrix C: M * N
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDSparseMul) {
+  LOG(INFO) << "function test for dense = dense * sparse matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSR, SPARSE_CSC}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncDDSparseMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * C += A * B, C sparse, A, B dense
+ * sparse = dense * dense
+ */
+void testFuncSparseDDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // init Test object
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // prepare input arguments
+  /// matrix A : M * K
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  /// matrix B: K * N
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  /// output sparse matrix C: M * N, with nnz stored entries in FORMAT
+  test.addOutputs(
+      SparseMatrixArg(
+          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
+      scaleT == 1.0 ? ADD_TO : ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, SparseDDMul) {
+  LOG(INFO) << "function test for sparse = dense * dense matrix";
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          for (const auto FORMAT : {SPARSE_CSC, SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK
+                    << " nnz=" << std::setw(5) << nnz
+                    << " format=" << std::setw(5) << FORMAT;
+            testFuncSparseDDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..adba7c92ece505eecc74edce6b393cf27fa10ccc
--- /dev/null
+++ b/paddle/function/PadOp.cpp
@@ -0,0 +1,215 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Pad<DEVICE_TYPE_CPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  // Only the input rows are copied; padded cells of outputs are not written.
+  const int padC = pad.channel[0], padH = pad.height[0], padW = pad.width[0];
+  const int outC = inC + padC + static_cast<int>(pad.channel[1]);
+  const int outH = inH + padH + static_cast<int>(pad.height[1]);
+  const int outW = inW + padW + static_cast<int>(pad.width[1]);
+  const size_t rowBytes = inW * sizeof(real);
+  for (int n = 0; n < num; ++n) {
+    for (int ch = 0; ch < inC; ++ch) {
+      for (int row = 0; row < inH; ++row) {
+        const int src = ((n * inC + ch) * inH + row) * inW;
+        const int dst =
+            ((n * outC + ch + padC) * outH + row + padH) * outW + padW;
+        memcpy(outputs + dst, inputs + src, rowBytes);
+      }
+    }
+  }
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  // Adds the un-padded window of outGrad onto inGrad, one row at a time.
+  const int padC = pad.channel[0], padH = pad.height[0], padW = pad.width[0];
+  const int outC = inC + padC + static_cast<int>(pad.channel[1]);
+  const int outH = inH + padH + static_cast<int>(pad.height[1]);
+  const int outW = inW + padW + static_cast<int>(pad.width[1]);
+  for (int n = 0; n < num; ++n) {
+    for (int ch = 0; ch < inC; ++ch) {
+      for (int row = 0; row < inH; ++row) {
+        const int dst = ((n * inC + ch) * inH + row) * inW;
+        const int src =
+            ((n * outC + ch + padC) * outH + row + padH) * outW + padW;
+        // The cast only satisfies CpuVector's non-const pointer parameter.
+        CpuVector dstGrad(inW, inGrad + dst);
+        CpuVector srcGrad(inW, const_cast<real*>(outGrad + src));
+        dstGrad += srcGrad;
+      }
+    }
+  }
+}
+
+static inline PadConf castToPadConf(const FuncConfig& conf) {
+  using Pads = std::vector<uint32_t>;  // {before, after} sizes of one axis
+  auto read = [&conf](const char* key) { return conf.get<Pads>(key); };
+  return PadConf{read("channel"), read("height"), read("width")};
+}
+
+/**
+ * \brief Pads the input with zeros along the specified dimensions.
+ *        The struct pad_ contains the padding size in each dimension.
+ *        The input and output are 4D tensors. PadFunc only pads
+ *        the 2nd to 4th dimensions with zeros.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the padding size in each dimension.
+ *                It has six integers. The channelStart and channelEnd indicate
+ *                how many zeros to add before and after the input in channel
+ *                dimension. And the heightStart and heightEnd indicate padding
+ *                in height dimension. The widthStart and widthEnd indicate the
+ *                padding in width dimension.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after padding.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the shape is (2,2,2,3)
+ *
+ * pad_: if channelStart = channelEnd = 1, others are 0.
+ * Output(2,4,2,3) = [
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]],
+ *                      [[0,0,0], [0,0,0]] ],
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]],
+ *                      [[0,0,0], [0,0,0]] ]
+ *                   ] # the shape is (2,4,2,3)
+ *
+ * pad_: if widthStart = 1, widthEnd = 2, others are 0.
+ * Output(2,2,2,6) = [
+ *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
+ *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
+ *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
+ *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
+ *                   ] # the shape is (2,2,2,6)
+ *
+ * pad_: if heightStart = 1, heightEnd = 1, others are 0.
+ * Output(2,2,4,3) = [
+ *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
+ *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
+ *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
+ *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
+ *                   ] # the shape is (2,2,4,3)
+ */
+
+template <DeviceType Device>
+class PadFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
+
+  /// Zero-fills outputs[0], then copies inputs[0] into its padded position.
+  /// Aborts (CHECK) unless outputs[0] has exactly the padded 4D shape.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    const auto& in = inputs[0].shape();
+    const auto& out = outputs[0].shape();
+    CHECK(in.ndims() == 4 && out.ndims() == 4) << "Pad expects 4D tensors";
+    // A mismatched output would make Pad<Device> write out of bounds.
+    CHECK_EQ(out[0], in[0]);
+    CHECK_EQ(out[1], in[1] + pad_.channel[0] + pad_.channel[1]);
+    CHECK_EQ(out[2], in[2] + pad_.height[0] + pad_.height[1]);
+    CHECK_EQ(out[3], in[3] + pad_.width[0] + pad_.width[1]);
+
+    typename Tensor<real, Device>::Vector vec(out.getElements(),
+                                              outputs[0].data<real>());
+    vec.zero();
+    Pad<Device>(outputs[0].data<real>(), inputs[0].data<real>(),
+                in[0], in[1], in[2], in[3], pad_);
+  }
+
+private:
+  PadConf pad_;
+};
+
+/**
+ * \brief The backward propagation of padding Function. Remove the elements
+ *        in the padding positions of forward.
+ *
+ * Argument in this Function:
+ * \param pad_    The same meaning as it in PadFunc.
+ * \param inputs  The gradient with respect to the output value of PadFunc.
+ * \param outputs The gradient with respect to the input value of PadFunc.
+ */
+
+template <DeviceType Device>
+class PadGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override { pad_ = castToPadConf(config); }
+
+  /// Accumulates the un-padded region of inputs[0] (gradient w.r.t. the
+  /// padded output) into outputs[0] (gradient w.r.t. the original input).
+  /// Aborts (CHECK) unless inputs[0] has exactly the padded 4D shape.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    const auto& plain = outputs[0].shape();  // shape of the input gradient
+    const auto& padded = inputs[0].shape();  // shape of the output gradient
+    CHECK(plain.ndims() == 4 && padded.ndims() == 4) << "PadGrad needs 4D";
+    // A mismatched shape would make PadGrad<Device> read out of bounds.
+    CHECK_EQ(padded[0], plain[0]);
+    CHECK_EQ(padded[1], plain[1] + pad_.channel[0] + pad_.channel[1]);
+    CHECK_EQ(padded[2], plain[2] + pad_.height[0] + pad_.height[1]);
+    CHECK_EQ(padded[3], plain[3] + pad_.width[0] + pad_.width[1]);
+
+    if (outputs[0].getArgType() != ADD_TO) {
+      // for unit test: start from zero when not accumulating
+      typename Tensor<real, Device>::Vector tmp(plain.getElements(),
+                                                outputs[0].data<real>());
+      tmp.zero();
+    }
+    PadGrad<Device>(outputs[0].data<real>(), inputs[0].data<real>(),
+                    plain[0], plain[1], plain[2], plain[3], pad_);
+  }
+
+private:
+  PadConf pad_;
+};
+
+REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..0e226ec7370b9897ebdc697ee528b90a37e4ec56
--- /dev/null
+++ b/paddle/function/PadOp.h
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+struct PadConf {
+  /// {before, after}: number of values added on each side of the channel axis.
+  std::vector<uint32_t> channel;
+  /// {before, after}: number of values added on each side of the height axis.
+  std::vector<uint32_t> height;
+  /// {before, after}: number of values added on each side of the width axis.
+  std::vector<uint32_t> width;
+};
+
+/**
+ * \brief  This function zero-pads the inputs along the specified dimensions.
+ *         The input and output are 4D tensors. Zeros are padded from the 2nd
+ *         to the 4th dimension according to the pad argument.
+ *
+ * \param[out] outputs buffer that receives the padded result.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  pad     the padding config; holds the {before, after} sizes
+ *                     for the channel, height and width dimensions.
+ */
+template <DeviceType Device>
+void Pad(real* outputs,
+         const real* inputs,
+         const int num,
+         const int inC,
+         const int inH,
+         const int inW,
+         const PadConf& pad);
+
+/**
+ * \brief   Backward pass of the padding operation.
+ *
+ * \param[out] inGrad  gradients of the previous layer (un-padded shape).
+ * \param[in]  outGrad output gradients (padded shape).
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  pad     the padding config; holds the {before, after} sizes
+ *                     for the channel, height and width dimensions.
+ */
+template <DeviceType Device>
+void PadGrad(real* inGrad,
+             const real* outGrad,
+             const int num,
+             const int inC,
+             const int inH,
+             const int inW,
+             const PadConf& pad);
+}  // namespace paddle
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9094f1528433fdcaad3397a991aa8ac6fa04bc01
--- /dev/null
+++ b/paddle/function/PadOpGpu.cu
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "PadOp.h"
+
+namespace paddle {
+
+__global__ void KePad(real* outputs, const real* inputs,
+                      int inC, int inH, int inW,
+                      int padc, int padh, int padw,
+                      int outC, int outH, int outW, int nthreads) {
+  // Thread idx handles flat input element idx: decode (n, c, h, w), copy it.
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nthreads) return;
+  const int w = idx % inW;
+  const int hcn = idx / inW;
+  const int h = hcn % inH;
+  const int cn = hcn / inH;
+  const int c = cn % inC, n = cn / inC;
+  const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
+  outputs[off] = inputs[idx];
+}
+
+template <>
+void Pad<DEVICE_TYPE_GPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  // Launches KePad with one thread per input element.
+  const int nth = num * inC * inH * inW;
+  // A zero-block grid is an invalid launch configuration: skip empty input.
+  if (nth <= 0) return;
+  const int blockSize = 1024;
+  const int gridSize = (nth - 1) / blockSize + 1;  // ceil, cannot overflow
+  const int padC = pad.channel[0], padH = pad.height[0], padW = pad.width[0];
+  const int outC = inC + padC + static_cast<int>(pad.channel[1]);
+  const int outH = inH + padH + static_cast<int>(pad.height[1]);
+  const int outW = inW + padW + static_cast<int>(pad.width[1]);
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inC, inH, inW, padC, padH, padW, outC, outH, outW, nth);
+  CHECK_SYNC("Pad");
+}
+
+__global__ void KePadDiff(real* inGrad, const real* outGrad,
+                          int inC, int inH, int inW,
+                          int padc, int padh, int padw,
+                          int outC, int outH, int outW, int nthreads) {
+  // Thread idx handles flat inGrad element idx: decode (n, c, h, w), add.
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nthreads) return;
+  const int w = idx % inW;
+  const int hcn = idx / inW;
+  const int h = hcn % inH;
+  const int cn = hcn / inH;
+  const int c = cn % inC, n = cn / inC;
+  const int off = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
+  inGrad[idx] += outGrad[off];
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  // Launches KePadDiff with one thread per inGrad element.
+  const int nth = num * inC * inH * inW;
+  // A zero-block grid is an invalid launch configuration: skip empty input.
+  if (nth <= 0) return;
+  const int blockSize = 1024;
+  const int gridSize = (nth - 1) / blockSize + 1;  // ceil, cannot overflow
+  const int padC = pad.channel[0], padH = pad.height[0], padW = pad.width[0];
+  const int outC = inC + padC + static_cast<int>(pad.channel[1]);
+  const int outH = inH + padH + static_cast<int>(pad.height[1]);
+  const int outW = inW + padW + static_cast<int>(pad.width[1]);
+  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, inC, inH, inW, padC, padH, padW, outC, outH, outW, nth);
+  CHECK_SYNC("PadGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f77ac2a8c49c83f2d6c64c2a30b6a2f2eb09ac10
--- /dev/null
+++ b/paddle/function/PadOpTest.cpp
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Pad, real) {  // forward: plain -> padded; backward: padded -> plain
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          const TensorShape plain{numSamples, channels, imgSizeH, imgSizeW};
+          const TensorShape padded{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          for (bool test_grad : {false, true}) {
+            const TensorShape& src = test_grad ? padded : plain;
+            const TensorShape& dst = test_grad ? plain : padded;
+            FunctionCompare compare(
+                test_grad ? "PadGrad" : "Pad",
+                FuncConfig()
+                    .set<std::vector<uint32_t>>("channel", {2, 3})
+                    .set<std::vector<uint32_t>>("height", {1, 2})
+                    .set<std::vector<uint32_t>>("width", {3, 2}));
+            compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, src));
+            compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, dst, ASSIGN_TO));
+            compare.run();
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
new file mode 100644
index 0000000000000000000000000000000000000000..cda58f19dfa4a8b80efc97570c83ca38fd7adf27
--- /dev/null
+++ b/paddle/function/TensorShape.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+namespace paddle {
+
+/**
+ * Shape of a dense tensor: the number of dimensions, the size of each one and
+ * the cached product of those sizes. dims_ always holds at least kMinDims
+ * entries; entries at index >= ndims() are padding and are set to 1.
+ */
+class TensorShape {
+public:
+  /// Empty shape: zero dimensions and, by this class's choice, zero elements.
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+
+  /// ndims dimensions, each of size 1 (hence one element in total).
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); }
+
+  /// Dimensions taken from dims, in order.
+  TensorShape(std::initializer_list<size_t> dims) { reshape(dims); }
+
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_), dims_(t.dims_) {}
+
+  /// Size of dimension dim; aborts (CHECK) when dim >= ndims().
+  size_t operator[](size_t dim) const {
+    CHECK_LT(dim, ndims_);  // dim is unsigned, so no lower-bound check needed
+    return dims_[dim];
+  }
+
+  /// Sets the size of dimension dim and refreshes the element count.
+  void setDim(size_t dim, size_t size) {
+    CHECK_LT(dim, ndims_);
+    dims_[dim] = size;
+    numElements();
+  }
+
+  /// Replaces the whole shape with dims.
+  void reshape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    // Copy element-wise instead of dims_.assign(dims): assign() would shrink
+    // dims_ to dims.size() and throw away the kMinDims padding from initDims.
+    initDims(ndims_);
+    size_t index = 0;
+    for (size_t size : dims) {
+      dims_[index++] = size;
+    }
+    numElements();
+  }
+
+  /// Number of dimensions of the tensor.
+  size_t ndims() const { return ndims_; }
+
+  /// Product of all dimension sizes (0 for a default-constructed shape).
+  size_t getElements() const { return nelements_; }
+
+  /// Shapes are equal when they have the same ndims() and the same sizes.
+  bool operator==(const TensorShape& t) const {
+    if (ndims() != t.ndims()) return false;
+    for (size_t i = 0; i < ndims(); i++) {
+      if (dims_[i] != t.dims_[i]) return false;
+    }
+    return true;
+  }
+
+  bool operator!=(const TensorShape& t) const { return !(*this == t); }
+
+private:
+  /// Recomputes nelements_ as the product of the first ndims_ sizes.
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+
+  /// Resets dims_ to max(ndims, kMinDims) entries, all equal to 1.
+  void initDims(size_t ndims) {
+    size_t count = ndims < kMinDims ? kMinDims : ndims;
+    dims_.assign(count, 1);
+  }
+
+  // Number of dimensions; may be smaller than dims_.size() because of padding.
+  size_t ndims_;
+  // Cached number of elements.
+  size_t nelements_;
+  // Dimension sizes, padded with 1 up to kMinDims entries.
+  std::vector<size_t> dims_;
+  static const size_t kMinDims = 4;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..45a2e106e7fc3f0e9e57cf8c2bb549d747f4f49b
--- /dev/null
+++ b/paddle/function/TensorShapeTest.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorShape.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorShape, Constructor) {
+  TensorShape empty;
+  EXPECT_EQ(0UL, empty.ndims());
+  EXPECT_EQ(0UL, empty.getElements());
+  // Rank-only construction: every dimension has size 1.
+  TensorShape rank3(3);
+  EXPECT_EQ(3UL, rank3.ndims());
+  EXPECT_EQ(1UL, rank3.getElements());
+  // Fewer than four dimensions.
+  TensorShape matrix({8, 10});
+  EXPECT_EQ(2UL, matrix.ndims());
+  EXPECT_EQ(80UL, matrix.getElements());
+  // Copy construction preserves rank and element count.
+  TensorShape copy(matrix);
+  EXPECT_EQ(matrix.ndims(), copy.ndims());
+  EXPECT_EQ(matrix.getElements(), copy.getElements());
+  // More than four dimensions.
+  TensorShape rank5({1, 2, 3, 4, 5});
+  EXPECT_EQ(5UL, rank5.ndims());
+  EXPECT_EQ(120UL, rank5.getElements());
+}
+
+TEST(TensorShape, GetAndSet) {
+  TensorShape shape({1, 2, 3});
+  EXPECT_EQ(3UL, shape.ndims());
+  EXPECT_EQ(6UL, shape.getElements());
+  // Changing one dimension must also refresh the cached element count.
+  EXPECT_EQ(2UL, shape[1]);
+  shape.setDim(1, 100);
+  EXPECT_EQ(300UL, shape.getElements());
+  EXPECT_EQ(100UL, shape[1]);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
new file mode 100644
index 0000000000000000000000000000000000000000..8308bbd8ad4fe1b97b35b779f27d2bf4534f0fa6
--- /dev/null
+++ b/paddle/function/TensorType.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum ValueType {  // element type tag; byte sizes come from sizeOfValuType()
+  VALUE_TYPE_INT32 = 0,
+  VALUE_TYPE_FLOAT = 1,
+  VALUE_TYPE_DOUBLE = 2,
+  VALUE_TYPE_BYTE = 3
+};
+// Selects between the Cpu* and Gpu* implementation types in the traits below.
+enum DeviceType {
+  DEVICE_TYPE_UNSPECIFIED = 0,
+  DEVICE_TYPE_CPU = 1,
+  DEVICE_TYPE_GPU = 2
+};
+// NOTE(review): presumably whether a sparse matrix stores values - confirm.
+enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
+// Sparse storage layout tag (CSR or CSC, going by the enumerator names).
+enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
+
+/// Size in bytes of one element of valueType; LOG(FATAL) for unknown tags.
+inline int sizeOfValuType(ValueType valueType) {
+  switch (valueType) {
+    case VALUE_TYPE_INT32:
+    case VALUE_TYPE_FLOAT: return 4;
+    case VALUE_TYPE_DOUBLE: return 8;
+    case VALUE_TYPE_BYTE: return 1;  // was missing and hit the FATAL path
+    default: break;
+  }
+  LOG(FATAL) << "Unknown type: " << valueType;
+  return 0;
+}
+
+template <typename T>
+struct DataType;  // maps a C++ element type T to its ValueType tag
+
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;
+};
+
+namespace detail {  // type-selection traits used by Tensor<VType, DType>
+
+template <typename VType, DeviceType Device>
+struct MatrixT;  // dense matrix class for (VType, Device)
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuMatrix;
+};
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuMatrix;
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct SparseMatrixT;  // sparse matrix class for (VType, Device)
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct VectorT;  // vector class for (VType, Device)
+
+template <>
+struct VectorT<real, DEVICE_TYPE_CPU> {
+  using type = CpuVector;
+};
+
+template <>
+struct VectorT<real, DEVICE_TYPE_GPU> {
+  using type = GpuVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
+}  // namespace detail
+
+template <typename VType, DeviceType DType>
+struct Tensor {  // container types for one (value type, device) pair
+  typedef typename detail::VectorT<VType, DType>::type Vector;
+  typedef typename detail::MatrixT<VType, DType>::type Matrix;
+  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e50e46f3e99111731d9587f3e4ddfd4b26ae27e9
--- /dev/null
+++ b/paddle/function/TensorTypeTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorType.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorType, Matrix) {
+  using CpuMat = Tensor<real, DEVICE_TYPE_CPU>::Matrix;
+  using GpuMat = Tensor<real, DEVICE_TYPE_GPU>::Matrix;
+  CpuMat onCpu(100, 200);
+  EXPECT_EQ(onCpu.getHeight(), 100);
+  EXPECT_EQ(onCpu.getWidth(), 200);
+  EXPECT_EQ(onCpu.getElementCnt(), 100 * 200);
+  EXPECT_FALSE(onCpu.useGpu());
+  EXPECT_TRUE(GpuMat(100, 200).useGpu());  // GPU alias yields a GPU matrix
+}
+
+TEST(TensorType, Vector) {  // real vectors first, then integer vectors
+  Tensor<real, DEVICE_TYPE_CPU>::Vector realOnCpu(100);
+  Tensor<real, DEVICE_TYPE_GPU>::Vector realOnGpu(100);
+  EXPECT_FALSE(realOnCpu.useGpu());
+  EXPECT_TRUE(realOnGpu.useGpu());
+  EXPECT_EQ(realOnCpu.getSize(), 100);
+  EXPECT_EQ(realOnGpu.getSize(), 100);
+
+  Tensor<int, DEVICE_TYPE_CPU>::Vector intOnCpu(100);
+  Tensor<int, DEVICE_TYPE_GPU>::Vector intOnGpu(100);
+  EXPECT_FALSE(intOnCpu.useGpu());
+  EXPECT_TRUE(intOnGpu.useGpu());
+  EXPECT_EQ(intOnCpu.getSize(), 100);
+  EXPECT_EQ(intOnGpu.getSize(), 100);
+}
+
+TEST(TensorType, EmptyMatrix) {
+  CpuMatrix noData(nullptr, 0, 0);
+  CpuMatrix allocated(10, 10);
+  EXPECT_TRUE(noData.isEmpty());
+  EXPECT_FALSE(allocated.isEmpty());
+  CHECK(allocated);
+  auto checkData = [](const CpuMatrix& m) {  // bool(m) <=> data is non-null
+    if (!m) {
+      EXPECT_EQ(m.getData(), nullptr);
+      return;
+    }
+    EXPECT_NE(m.getData(), nullptr);
+  };
+  checkData(noData);
+  checkData(allocated);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 4f92150ec84d637c5b75cba09d7e98501a5a5f5d..93a6a99848aa13bb36c9c5c7091fbaa891fc9823 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -25,12 +25,16 @@ filter_test(GSERVER_HEADER)
 filter_test(GSERVER_SOURCES)
 if(NOT WITH_GPU)
     list(REMOVE_ITEM GSERVER_HEADER
+        layers/CudnnConvBaseLayer.h
         layers/CudnnConvLayer.h
+        layers/CudnnConvTransLayer.h
         layers/CudnnPoolLayer.h
         layers/CudnnBatchNormLayer.h)
 
     list(REMOVE_ITEM GSERVER_SOURCES
+        layers/CudnnConvBaseLayer.cpp
         layers/CudnnConvLayer.cpp
+        layers/CudnnConvTransLayer.cpp
         layers/CudnnPoolLayer.cpp
         layers/CudnnBatchNormLayer.cpp)
     compile_cu_as_cpp(layers/LstmCompute.cu)
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f8c4bcac2f8eb41400659dc24ba81768e7ae3640..c541b72e104bf2b81e2ac222d4af13ea2f90d289 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -69,8 +69,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
 class IdentityActivation : public ActivationFunction {
 public:
   static const std::string name;
-  void forward(Argument& act) { (void)act; }
-  void backward(Argument& act) { (void)act; }
+  Error __must_check forward(Argument& act) {
+    (void)act;
+    return Error();
+  }
+  Error __must_check backward(Argument& act) {
+    (void)act;
+    return Error();
+  }
   const std::string& getName() const { return name; }
 };
 const std::string IdentityActivation::name = "";
@@ -86,8 +92,14 @@ static InitFunction __reg_activation__identity([] {
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(sigmoid)
-void forward(Argument& act) { act.value->sigmoid(*act.value); }
-void backward(Argument& act) { act.grad->sigmoidDerivative(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->sigmoid(*act.value);
+  return Error();
+}
+Error __must_check backward(Argument& act) {
+  act.grad->sigmoidDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(sigmoid)
 
 /**
@@ -103,9 +115,12 @@ MatrixPtr sftMaxDot_;
 MatrixPtr one_;
 
 public:
-void forward(Argument& act) { act.value->softmax(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->softmax(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   MatrixPtr outputV = act.value;
   MatrixPtr outputG = act.grad;
 
@@ -137,6 +152,7 @@ void backward(Argument& act) {
 
     act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
   }
+  return Error();
 }
 END_DEFINE_ACTIVATION(softmax)
 
@@ -151,8 +167,11 @@ ACTIVATION_CLASS_NAME(softmax) softmax_;
 Argument argument_;
 
 public:
-void forward(Argument& act) {
-  CHECK_EQ(act.value->getWidth(), 1UL);
+Error __must_check forward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   if (!argument_.value) {
     argument_.value = Matrix::create(nullptr,
@@ -169,10 +188,14 @@ void forward(Argument& act) {
 
   auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
+  return Error();
 }
 
-void backward(Argument& act) {
-  CHECK_EQ(act.grad->getWidth(), 1UL);
+Error __must_check backward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   size_t numSequences = act.getNumSequences();
   const int* starts = act.sequenceStartPositions->getData(false);
@@ -184,8 +207,10 @@ void backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    softmax_.backward(argument_);
+    Error status = softmax_.backward(argument_);
+    if (!status) return status;
   }
+  return Error();
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
@@ -200,9 +225,15 @@ END_DEFINE_ACTIVATION(sequence_softmax)
  *    0 otherwise.
  */
 BEGIN_DEFINE_ACTIVATION(relu)
-void forward(Argument& act) { act.value->relu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->relu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->reluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->reluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(relu)
 
 /**
@@ -219,9 +250,15 @@ END_DEFINE_ACTIVATION(relu)
  * TODO(yuyang18): Remove magic number 24 or make it configuable.
  */
 BEGIN_DEFINE_ACTIVATION(brelu)
-void forward(Argument& act) { act.value->brelu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->brelu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->breluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->breluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(brelu)
 
 /**
@@ -231,9 +268,15 @@ END_DEFINE_ACTIVATION(brelu)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(tanh)
-void forward(Argument& act) { act.value->tanh(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->tanh(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->tanhDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->tanhDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(tanh)
 
 /**
@@ -248,10 +291,14 @@ real a, b;
 
 public:
 ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-void forward(Argument& act) { act.value->scaledTanh(*act.value, a, b); }
+Error __must_check forward(Argument& act) {
+  act.value->scaledTanh(*act.value, a, b);
+  return Error();
+}
 
-void backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->scaledTanhDerivative(*act.value, a, b);
+  return Error();
 }
 END_DEFINE_ACTIVATION(stanh)
 
@@ -262,9 +309,15 @@ END_DEFINE_ACTIVATION(stanh)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(softrelu)
-void forward(Argument& act) { act.value->softrelu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->softrelu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->softreluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->softreluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(softrelu)
 
 /**
@@ -280,7 +333,7 @@ END_DEFINE_ACTIVATION(softrelu)
  *     0   if z=0
  */
 BEGIN_DEFINE_ACTIVATION(abs)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -290,9 +343,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->abs2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->absDerivative(*act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->absDerivative(*act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(abs)
 
 /**
@@ -302,7 +359,7 @@ END_DEFINE_ACTIVATION(abs)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(square)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -312,9 +369,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->square2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->squareDerivative(*act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(square)
 
 /**
@@ -324,9 +385,15 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-void forward(Argument& act) { act.value->exp2(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->exp2(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->expDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->expDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(exponential)
 
 /**
@@ -336,7 +403,7 @@ END_DEFINE_ACTIVATION(exponential)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(log)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -346,9 +413,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->log2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(log)
 
 ActivationFunction* ActivationFunction::create(const std::string& type) {
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 601e3b6c0cd401ec007e8cf51e44416f82832e58..f208224e304a79125679c6f3a5c0be09552465ef 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -48,7 +49,7 @@ public:
    *
    * Usually, act is Layer::output_
    */
-  virtual void forward(Argument& act) = 0;
+  virtual Error __must_check forward(Argument& act) = 0;
 
   /**
    * @brief Backward propagaion
@@ -57,7 +58,7 @@ public:
    * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
    * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
    */
-  virtual void backward(Argument& act) = 0;
+  virtual Error __must_check backward(Argument& act) = 0;
 
   virtual const std::string& getName() const = 0;
 };
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 9a2ad7567f0dc93d0a8e396fd88b2488afe9d049..40036762179ebb1495b90907f16b97e3c60c50d8 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -164,15 +164,6 @@ public:
     argu.value = value;
     data_.push_back(argu);
   }
-  /**
-   * @brief Append user defined data
-   * @param[in]  ptr     user defined data
-   */
-  void appendUserDefinedPtr(UserDefinedVectorPtr ptr) {
-    Argument argu;
-    argu.udp = ptr;
-    data_.push_back(argu);
-  }
 
   /*
    * @brief Append argument
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index c26e242534f2afcff396762adb085bf99303e2b5..b8079dc0796d0e300e65ac6b6b8d3bc826b1e504 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -647,7 +647,7 @@ public:
       DataBatch& gpuBatch = *batch;
       std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
       gpuArguments.resize(cpuArguments.size());
-      gpuBatch.setSize(size);
+      gpuBatch.setSize(bsize);
       for (size_t i = 0; i < headers_.size(); ++i) {
         gpuArguments[i].resizeAndCopyFrom(
             cpuArguments[i], useGpu_, HPPL_STREAM_1);
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 05aa6c012ae2bc0afcbaf23f8ff78b3c782d050c..132119015f967c6e8d055792de8afe8450df5ec6 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
  * calculate sequence-to-sequence edit distance
  */
-class CTCErrorEvaluator : public Evaluator {
+class CTCErrorEvaluator : public NotGetableEvaluator {
 private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index ae7508e2bb117a60492e0c28230f2fbb4b14915e..9db6d252d97bfeee3fe376bcda431fe94c65a678 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/utils/Stat.h"
-
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
 
 DECLARE_int32(trainer_id);
 
@@ -39,6 +39,14 @@ void Evaluator::eval(const NeuralNetwork& nn) {
  */
 class ClassificationErrorEvaluator : public Evaluator {
 public:
+  /*
+  ClassificationErrorEvaluator() : totalScore2_(0) {}
+
+  virtual void start() {
+    Evaluator::start();
+    totalScore2_ = 0;
+    } */
+
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
     if (3 == arguments.size()) {
       numSamples_ += arguments[2].value->getSum();
@@ -76,9 +84,11 @@ public:
                                               1,
                                               /* trans= */ false,
                                               useGpu(arguments[0].deviceId));
+
     errorMat->zeroMem();
+
     if (label != nullptr) {
-      errorMat->classificationError(*output, *label);
+      errorMat->classificationError(*output, *label, config_.top_k());
     } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
                dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
       errorMat->classificationErrorMulti(
@@ -94,6 +104,16 @@ public:
     return errorMat;
   }
 
+  void printStats(std::ostream& os) const {
+    if (config_.top_k() == 1) {
+      os << config_.name() << "="
+         << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    } else {
+      os << " top_" << config_.top_k()
+         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    }
+  }
+
   virtual real evalImp(std::vector<Argument>& arguments) {
     MatrixPtr errorMat = calcError(arguments);
     return errorMat->getSum();
@@ -102,6 +122,10 @@ public:
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "classification_error"; }
 };
 
 /**
@@ -140,6 +164,10 @@ public:
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "seq_classification_error"; }
 };
 REGISTER_EVALUATOR(seq_classification_error,
                    SequenceClassificationErrorEvaluator);
@@ -230,6 +258,10 @@ public:
 private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "sum"; }
 };
 /**
  * @brief column sum Evaluator
@@ -337,10 +369,18 @@ public:
   }
 
 private:
-  ColumnSumEvaluator() {}
   int32_t colIdx_;
   size_t colNum_;
   MatrixPtr sum_; /* cpu matrix */
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const {
+    if (colIdx_ == -1)
+      return "last-column-sum";
+    else
+      return "column-sum";
+  }
 };
 
 void AucEvaluator::start() {
@@ -449,6 +489,16 @@ double AucEvaluator::calcAuc() const {
   }
 }
 
+real AucEvaluator::getValueImpl() const { return calcAuc(); }
+
+std::string AucEvaluator::getTypeImpl() const {
+  if (colIdx_ == -1) {
+    return "last-column-auc";
+  } else {
+    return "auc";
+  }
+}
+
 // class RankAucEvaluator
 REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
 
@@ -528,12 +578,15 @@ double RankAucEvaluator::calcRankAuc(real* outputData,
                                         : aucTmp / (clickSum * noClickSum);
 }
 
+std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
+
 // class PrecisionRecallEvaluator
 REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
 
 void PrecisionRecallEvaluator::start() {
   Evaluator::start();
   statsInfo_.clear();
+  values_.clear();
 }
 
 real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
@@ -594,52 +647,23 @@ real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
 }
 
 void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    double precision =
-        calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    double recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    os << "positive_label=" << label << " precision=" << precision
-       << " recall=" << recall
-       << " F1-score=" << calcF1Score(precision, recall);
-    return;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  double macroAvgPrecision = 0;
-  double macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    macroAvgPrecision += calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  macroAvgPrecision /= numLabels;
-  macroAvgRecall /= numLabels;
-  double macroAvgF1Score = calcF1Score(macroAvgPrecision, macroAvgRecall);
-  os << "macro-average-precision=" << macroAvgPrecision
-     << " macro-average-recall=" << macroAvgRecall
-     << " macro-average-F1-score=" << macroAvgF1Score;
-
-  double microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  double microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  double microAvgF1Score = calcF1Score(microAvgPrecision, microAvgRecall);
-  if (!isMultiBinaryLabel_) {
-    // precision and recall are equal in this case
-    os << " micro-average-precision=" << microAvgPrecision;
-  } else {
-    os << " micro-average-precision=" << microAvgPrecision
-       << " micro-average-recall=" << microAvgRecall
-       << " micro-average-F1-score=" << microAvgF1Score;
+  PrintStatsInfo info;
+  if (!getStatsInfo(&info)) {  // a positive label is set: per-label stats only
+    os << "positive_label=" << config_.positive_label()
+       << " precision=" << info.precision << " recall=" << info.recall
+       << " F1-score=" << info.f1;
+    return;
+  }
+  os << "macro-average-precision=" << info.macroAvgPrecision
+     << " macro-average-recall=" << info.macroAvgRecall
+     << " macro-average-F1-score=" << info.macroAvgF1Score;
+  if (!isMultiBinaryLabel_) {
+    // precision and recall are equal in this case
+    os << " micro-average-precision=" << info.microAvgPrecision;
+  } else {
+    os << " micro-average-precision=" << info.microAvgPrecision
+       << " micro-average-recall=" << info.microAvgRecall
+       << " micro-average-F1-score=" << info.microAvgF1Score;
   }
 }
 
@@ -721,6 +745,60 @@ void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
   }
 }
 
+// Caches the fields that getStatsInfo() filled in, keyed by field name.
+// NOTE(review): key "recal" (sic) is part of the exposed names; kept as is.
+void PrecisionRecallEvaluator::storeLocalValues() const {
+  if (!this->values_.empty()) return;  // cache is cleared again in start()
+  PrintStatsInfo info;
+  if (!getStatsInfo(&info)) {
+    // positive_label is configured: only the per-label fields were set.
+    values_["precision"] = info.precision;
+    values_["recal"] = info.recall;
+    values_["F1-score"] = info.f1;
+    return;
+  }
+  values_["macro-average-precision"] = info.macroAvgPrecision;
+  values_["macro-average-recall"] = info.macroAvgRecall;
+  values_["macro-average-F1-score"] = info.macroAvgF1Score;
+  values_["micro-average-precision"] = info.microAvgPrecision;
+  if (isMultiBinaryLabel_) {
+    // Otherwise precision and recall are equal, so precision alone is kept.
+    values_["micro-average-recall"] = info.microAvgRecall;
+    values_["micro-average-F1-score"] = info.microAvgF1Score;
+  }
+}
+
+void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
+  this->storeLocalValues();
+  names->reserve(this->values_.size());
+  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
+    names->push_back(this->config_.name() + "." + it->first);
+  }
+}
+
+real PrecisionRecallEvaluator::getValue(const std::string& name,
+                                        Error* err) const {
+  this->storeLocalValues();
+  std::vector<std::string> buffers;  // guarded below: split may yield nothing
+  paddle::str::split(name, '.', &buffers);  // the field is the last component
+  auto it = buffers.empty() ? this->values_.end()
+                            : this->values_.find(buffers.back());
+  if (it == this->values_.end()) {  // not found
+    *err = Error("No such key %s", name.c_str());
+    return .0f;
+  }
+  return it->second;
+}
+
+std::string PrecisionRecallEvaluator::getType(const std::string& name,
+                                              Error* err) const {
+  this->getValue(name, err);
+  if (!err->isOK()) {
+    return "";
+  }
+  return "precision_recall";
+}
+
 void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   size_t size = 4 * statsInfo_.size();
   double* buf = new double[size];
@@ -740,6 +818,47 @@ void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   delete[] buf;
 }
 
+bool PrecisionRecallEvaluator::getStatsInfo(
+    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
+  int label = config_.positive_label();
+  if (label != -1) {
+    CHECK(label >= 0 && label < (int)statsInfo_.size())
+        << "positive_label [" << label << "] should be in range [0, "
+        << statsInfo_.size() << ")";
+    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
+    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
+    info->f1 = calcF1Score(info->precision, info->recall);
+    return false;
+  }
+
+  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
+  // macro average method: precision = (precision1+precision2)/2
+  double microTotalTP = 0;
+  double microTotalFP = 0;
+  double microTotalFN = 0;
+  info->macroAvgPrecision = 0;
+  info->macroAvgRecall = 0;
+  size_t numLabels = statsInfo_.size();
+  for (size_t i = 0; i < numLabels; ++i) {
+    microTotalTP += statsInfo_[i].TP;
+    microTotalFP += statsInfo_[i].FP;
+    microTotalFN += statsInfo_[i].FN;
+    info->macroAvgPrecision +=
+        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
+    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
+  }
+  info->macroAvgPrecision /= numLabels;
+  info->macroAvgRecall /= numLabels;
+  info->macroAvgF1Score =
+      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
+
+  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
+  info->microAvgRecall = calcRecall(microTotalTP, microTotalFN);
+  info->microAvgF1Score =
+      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
+  return true;
+}
+
 REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
 void PnpairEvaluator::start() {
   Evaluator::start();
@@ -864,56 +983,35 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
             << " calc total special pair: " << special;
 }
 
+std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
+
 ClassRegistrar<Evaluator> Evaluator::registrar_;
 Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = nullptr;
-  if (config.type() == "classification_error") {
-    evaluator = new ClassificationErrorEvaluator();
-  } else if (config.type() == "sum") {
-    evaluator = new SumEvaluator();
-  } else if (config.type() == "last-column-sum") {
-    evaluator = new ColumnSumEvaluator(-1);
-  } else if (config.type() == "last-column-auc") {
-    evaluator = new AucEvaluator(-1);
-  } else {
-    evaluator = registrar_.createByType(config.type());
-  }
+  Evaluator* evaluator = registrar_.createByType(config.type());
   evaluator->init(config);
   return evaluator;
 }
+
+REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
+REGISTER_EVALUATOR(sum, SumEvaluator);
+static InitFunction __reg_type_auc_sum__([]() {
+  Evaluator::registrar_.registerClass(
+      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
+  Evaluator::registrar_.registerClass("last-column-auc",
+                                      [] { return new AucEvaluator(-1); });
+});
+
 /**
  * @brief print value of each layer.
  *
  * The config file api is value_printer_evaluator.
  */
-class ValuePrinter : public Evaluator {
+class ValuePrinter : public NotGetableEvaluator {
 public:
-  ValuePrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        std::ostringstream os;
-        argu.value->print(os);
-        LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-      }
-      if (argu.ids) {
-        std::ostringstream os;
-        argu.ids->print(os, argu.ids->getSize());
-        LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-      }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
-      if (auto subStartPos = argu.subSequenceStartPositions) {
-        std::ostringstream os;
-        subStartPos->getVector(false)->print(os, subStartPos->getSize());
-        LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                  << os.str();
-      }
+      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
+                                                      "layer=" + name + " ");
     }
   }
 
@@ -922,15 +1020,14 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
 };
 REGISTER_EVALUATOR(value_printer, ValuePrinter);
+
 /**
  * @brief print gradient of each layer.
  *
  * The config file api is gradient_printer_evaluator.
  */
-class GradientPrinter : public Evaluator {
+class GradientPrinter : public NotGetableEvaluator {
 public:
-  GradientPrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       const Argument& argu = nn.getLayer(name)->getOutput();
@@ -939,11 +1036,6 @@ public:
         argu.grad->print(os);
         LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
       }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
     }
   }
 
@@ -957,7 +1049,7 @@ REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
  *
  * The config file api is maxid_printer_evaluator.
  */
-class MaxIdPrinter : public Evaluator {
+class MaxIdPrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -999,7 +1091,7 @@ REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
  *
  * The config file api is maxframe_printer_evaluator.
  */
-class MaxFramePrinter : public Evaluator {
+class MaxFramePrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -1086,7 +1178,7 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
  * The config file api is seqtext_printer_evaluator.
  *
  */
-class SequenceTextPrinter : public Evaluator {
+class SequenceTextPrinter : public NotGetableEvaluator {
 private:
   /// dict_file, which contains a list of tokens
   std::vector<std::string> dict_;
@@ -1253,4 +1345,6 @@ public:
 };
 REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
 
+std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
+
 }  // namespace paddle
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 5770847309670ef1856cfb9255fa847c24513b56..b114500e2b7c1e460a02c78b99b5f1a8fb63b8c3 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/parameter/Argument.h"
 #include "paddle/pserver/ParameterClient2.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -117,12 +118,105 @@ public:
 
   static ClassRegistrar<Evaluator> registrar_;
 
+  /**
+   * @brief getNames appends all field names of the current evaluator.
+   *
+   * This base implementation appends the configured evaluator name only.
+   * An evaluator with multiple fields reports `evaluator_name.field`
+   * instead. For example, a PrecisionRecallEvaluator that is named
+   * `precision_recall_evaluator` appends names such as
+   * `precision_recall_evaluator.precision`, `precision_recall_evaluator.recal`.
+   *
+   * Also, if the current Evaluator is a combined evaluator, getNames will
+   * return all names of all evaluators inside the combined evaluator.
+   *
+   * @param names [out]: receives the field names of the current evaluator.
+   * @note Never clear the names parameter inside getNames.
+   */
+  virtual void getNames(std::vector<std::string>* names) {
+    names->push_back(config_.name());
+  }
+
+  /**
+   * @brief getValue will return the current evaluate value of one field.
+   *
+   * @param name: The field name of current evaluator.
+   * @param err [out]: The error state.
+   *
+   * @return The evaluate value(metric).
+   */
+  virtual real getValue(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return .0f;
+    }
+    return this->getValueImpl();
+  }
+
+  /**
+   * @brief getType will return the evaluator type by field name.
+   *
+   * The evaluate type is the type of the evaluator as a string, such as 'auc'
+   * or 'precision_recall'. In a combined evaluator, different names may map to
+   * different types because different inner evaluators may evaluate them.
+   *
+   * @param name: The field name of current Evaluator.
+   * @param err [out]: Set when name is unknown. Must not be nullptr.
+   * @return the evaluator type string; empty when name is unknown.
+   */
+  virtual std::string getType(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return std::string();
+    }
+    return this->getTypeImpl();
+  }
+
+protected:
+  /**
+   * @brief getValueImpl The simplest way to define getValue result. If this
+   * evaluator doesn't contain multiple fields, and do not throw any error, just
+   * implemented this method to get the evaluate result(metric).
+   * @return Evaluate result(metric).
+   */
+  virtual real getValueImpl() const {
+    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
+  }
+
+  /**
+   * @brief getTypeImpl The simplest way to define getType result. If this
+   * evaluator doesn't combine many evaluators, the get type should only return
+   * itself type.
+   * @return Evaluator type.
+   */
+  virtual std::string getTypeImpl() const { return "base"; }
+
 protected:
   EvaluatorConfig config_;
   double numSamples_;
   double totalScore_;
 };
 
+/**
+ * @brief NotGetableEvaluator is the base class of evaluators whose values
+ * cannot be queried at runtime. Most of them are printer evaluators, which
+ * are only used to debug a network configuration.
+ */
+class NotGetableEvaluator : public Evaluator {
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names) {}  // exposes no fields
+
+  real getValue(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");  // err is written unconditionally
+    return .0f;
+  }
+  std::string getType(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");  // err is written unconditionally
+    return "";
+  }
+};
+
 class DummyEvaluator : public Evaluator {
 public:
   DummyEvaluator() {}
@@ -135,6 +229,10 @@ public:
   }
   virtual void finish() {}
   virtual void printStats(std::ostream&) const {}
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief evaluate AUC using colIdx-th column as prediction.
@@ -191,6 +289,11 @@ private:
   }
 
   double calcAuc() const;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const;
+  std::string getTypeImpl() const;
 };
 
 /**
@@ -223,6 +326,10 @@ private:
                      real* clickData,
                      real* pvData,
                      size_t size);
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief precision, recall and f1 score Evaluator
@@ -272,6 +379,20 @@ private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
 
+  struct PrintStatsInfo {
+    double precision;
+    double recall;
+    double f1;
+    double macroAvgPrecision;
+    double macroAvgRecall;
+    double macroAvgF1Score;
+    double microAvgPrecision;
+    double microAvgRecall;
+    double microAvgF1Score;
+  };
+
+  bool getStatsInfo(PrintStatsInfo* info) const;
+
   void calcStatsInfo(const MatrixPtr& output,
                      const IVectorPtr& label,
                      const MatrixPtr& weight);
@@ -303,6 +424,15 @@ private:
       return 0;
     }
   }
+
+  mutable std::unordered_map<std::string, real> values_;
+
+  void storeLocalValues() const;
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names);
+  real getValue(const std::string& name, Error* err) const;
+  std::string getType(const std::string& name, Error* err) const;
 };
 
 /*
@@ -349,8 +479,7 @@ public:
   virtual void finish() { calc(predictArray_); }
 
   virtual void printStats(std::ostream& os) const {
-    os << " pos/neg"
-       << "=" << pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+    os << " pos/neg=" << this->getValueImpl();
   }
 
   virtual void distributeEval(ParameterClient2* client) {
@@ -366,6 +495,13 @@ private:
   IVectorPtr cpuLabel_;
   IVectorPtr cpuInfo_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const {
+    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+  }
+  std::string getTypeImpl() const;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index 36ca05b919b136c162105cf4f1fb7705ae7ca7f3..b44e4dc202f01956ed21c175aa897ced8e92546b 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -21,7 +21,6 @@ limitations under the License. */
 #include "MultiGradientMachine.h"
 #include "MultiNetwork.h"
 #include "NeuralNetwork.h"
-#include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
 #include "hl_gpu.h"
 
@@ -60,55 +59,6 @@ GradientMachine* GradientMachine::create(
   return nullptr;
 }
 
-GradientMachine* GradientMachine::create(const std::string& modelFile,
-                                         DataConfig* dataConfig) {
-  std::ifstream is(modelFile);
-  CHECK(is) << "Fail to open " << modelFile;
-  return create(is, dataConfig);
-}
-
-GradientMachine* GradientMachine::create(std::istream& is,
-                                         DataConfig* dataConfig) {
-  TrainerConfig trainerConfig;
-  GradientMachine* ret = create(is, &trainerConfig);
-  if (dataConfig && trainerConfig.has_data_config()) {
-    *dataConfig = trainerConfig.data_config();
-  }
-  return ret;
-}
-
-GradientMachine* GradientMachine::create(const std::string& modelFile,
-                                         TrainerConfig* trainerConfig) {
-  std::ifstream is(modelFile);
-  CHECK(is) << "Fail to open " << modelFile;
-  return create(is, trainerConfig);
-}
-
-GradientMachine* GradientMachine::create(std::istream& is,
-                                         TrainerConfig* trainerConfig) {
-  TrainerConfig trainerConfigTemp;
-  int64_t size;
-  CHECK(is.read((char*)&size, sizeof(size))) << "Fail to read ";
-  std::string buf;
-  buf.resize(size);
-  CHECK(is.read(&buf[0], size)) << "Fail to read ";
-  CHECK(trainerConfigTemp.ParseFromString(buf)) << "Fail to parse config";
-  std::unique_ptr<GradientMachine> machine(
-      create(trainerConfigTemp.model_config()));
-  std::vector<ParameterPtr>& parameters = machine->getParameters();
-  for (auto& para : parameters) {
-    para->load(is);
-  }
-
-  machine->onLoadParameter();
-
-  if (trainerConfig) {
-    *trainerConfig = trainerConfigTemp;
-  }
-
-  return machine.release();
-}
-
 void GradientMachine::saveParameters(const std::string& dir) const {
   LOG(INFO) << "Saving parameters to " << dir;
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 1e35c7e2b8d185e45f33f6287ad4e32ccad2d5a6..f9c82a2bef82b4e6bcbf0c73583505d2692f3926 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -89,39 +89,6 @@ public:
           std::vector<ParameterType>{
               PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
 
-  /**
-   * Create a gradient machine from the merged model file.
-   * The merged model file can be generated using tools/merge_model
-   * If dataConfig is not null, it will be filled with the DataConfig
-   * from the TrainerConfig
-   */
-  static GradientMachine* create(const std::string& modelFile,
-                                 DataConfig* dataConfig);
-
-  /**
-   * Create a gradient machine from a stream which contains the merged
-   * model file. The merged model file can be generated using tools/merge_model
-   * If dataConfig is not null, it will be filled with the DataConfig
-   * from the TrainerConfig
-   */
-  static GradientMachine* create(std::istream& is, DataConfig* dataConfig);
-
-  /**
-   * Create a gradient machine from the merged model file.
-   * The merged model file can be generated using tools/merge_model
-   * If trainerConfig is not null, it will be filled with the TrainerConfig
-   */
-  static GradientMachine* create(const std::string& modelFile,
-                                 TrainerConfig* trainerConfig);
-
-  /**
-   * Create a gradient machine from a stream which contains the merged
-   * model file. The merged model file can be generated using tools/merge_model
-   * If trainerConfig is not null, it will be filled with the TrainerConfig
-   */
-  static GradientMachine* create(std::istream& is,
-                                 TrainerConfig* trainerConfig);
-
   virtual ~GradientMachine() {}
 
   /**
@@ -167,6 +134,8 @@ public:
     backward(callback);
   }
 
+  virtual Argument getLayerOutput(const std::string& layerName) = 0;
+
   // see comment in Layer.h for the function with the same name
   virtual void resetState() {}
 
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 80f223824d8dccfb0e9386f4c076b28f9332a958..3159026e6b92355ba7480b09535388c969a504e2 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -24,9 +24,6 @@ limitations under the License. */
 DEFINE_bool(allow_only_one_model_on_one_gpu,
             true,
             "If true, do not allow multiple models on one GPU device");
-#ifdef PADDLE_METRIC_LEARNING
-DECLARE_bool(external);
-#endif
 
 namespace paddle {
 
@@ -45,11 +42,7 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
       trainerBarrier_(FLAGS_trainer_count),
       allBarrier_(FLAGS_trainer_count + 1),
       inArgsCopied_(false) {
-#ifdef PADDLE_METRIC_LEARNING
-  isPassGrad_ = FLAGS_external;
-#else
   isPassGrad_ = false;
-#endif
   numThreads_ = FLAGS_trainer_count;
   if (useGpu) {
     //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu,
@@ -282,6 +275,18 @@ void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
   backwardImp(callback);
 }
 
+// Concatenates layerName's output from each thread that received input.
+Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
+  std::vector<Argument> args;
+  args.reserve(threads_.size());
+  for (auto& thread : threads_) {
+    if (!thread->hasInputData()) continue;  // forward() was skipped: stale
+    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
+  }
+  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
+  return outLayerArgs_;
+}
+
 void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
   for (size_t i = 0; i < parameters_.size(); i++) {
     if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
@@ -334,7 +339,9 @@ Evaluator* MultiGradientMachine::makeEvaluator() const {
 void MultiGradientMachine::eval(Evaluator* evaluator) const {
   for (auto& thread : threads_) {
     SetDevice device(thread->getDeviceId());
-    thread->getGradientMachine()->eval(evaluator);
+    if (thread->hasInputData()) {
+      thread->getGradientMachine()->eval(evaluator);
+    }
   }
 }
 
@@ -344,14 +351,19 @@ void MultiGradientMachine::getOutArgs(std::vector<Argument>* outArgs,
     REGISTER_TIMER("waitOutArgs");
     thread->waitOutArgsReady();
   }
-  outArgs_.resize(threads_[0]->getOutArgs().size());
+
+  outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size());
 
   REGISTER_TIMER("copyOutArgs");
   for (size_t i = 0; i < outArgs_.size(); ++i) {
     std::vector<Argument> args;
     args.reserve(threads_.size());
     for (auto& thread : threads_) {
-      args.push_back(thread->getOutArgs()[i]);
+      // If the thread input is empty, then the output is empty.
+      auto tmp = thread->getOutArgs();
+      if (tmp.size() > 0) {
+        args.push_back(tmp[i]);
+      }
     }
     outArgs_[i].concat(args, useGpu_, outArgStream_, passType);
   }
@@ -506,7 +518,7 @@ void TrainerThread::computeThread() {
         backward();
         break;
       case MultiGradientMachine::TASK_COPY_IN_ARGS:
-        copyInArgs();
+        batchSize_ = copyInArgs();
         inArgsCopied_ = true;
         multiMachine_->waitForCopyInArgs();
         break;
@@ -522,7 +534,7 @@ void TrainerThread::prefetch() {
 void TrainerThread::forward() {
   if (!inArgsCopied_) {
     REGISTER_TIMER("copyInArgs");
-    copyInArgs();
+    batchSize_ = copyInArgs();
   } else {
     inArgsCopied_ = false;
   }
@@ -552,7 +564,12 @@ void TrainerThread::forward() {
 
   {
     REGISTER_TIMER("thread_forward");
-    gradientMachine_->forward(inArgs_, &outArgs_, multiMachine_->getPassType());
+    if (batchSize_ > 0) {
+      gradientMachine_->forward(
+          inArgs_, &outArgs_, multiMachine_->getPassType());
+    } else {
+      outArgs_.clear();
+    }
   }
   outArgsReadySem_.post();
 }
@@ -562,7 +579,13 @@ void TrainerThread::backward() {
   if (multiMachine_->isPassGrad()) {
     copyOutputGrad();
   }
-  gradientMachine_->backward(backwardCallback_);
+  if (batchSize_ > 0) {
+    gradientMachine_->backward(backwardCallback_);
+  } else {
+    for (size_t i = parameters_.size(); i > 0; i--) {
+      backwardCallback(parameters_[i - 1].get());
+    }
+  }
   if (multiMachine_->hasNonstaticCpuParamters()) {
     mergeCpuGradients();
   }
@@ -720,7 +743,7 @@ void TrainerThread::notifyValueReady(int paramId) {
   notifyValueDispatch(paramId);
 }
 
-void TrainerThread::copyInArgs() {
+int TrainerThread::copyInArgs() {
   const std::vector<Argument>& fullInArgs = multiMachine_->getInArgs();
   int numThreads = multiMachine_->getAllThreads().size();
   int32_t numSequences = fullInArgs[0].getNumSequences();
@@ -736,7 +759,7 @@ void TrainerThread::copyInArgs() {
   }
 
   if (copySize == 0) {
-    return;
+    return 0;
   }
 
   for (size_t i = 0; i < fullInArgs.size(); i++) {
@@ -746,6 +769,7 @@ void TrainerThread::copyInArgs() {
         copySize,
         FLAGS_parallel_nn ? false : multiMachine_->useGpu());
   }
+  return copySize;
 }
 
 void TrainerThread::mergeCpuGradients() {
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 9be15ef4bcf34f26b7eceb9047252e537f20a4a8..70203bbb97fe79d72fbc6bd2b5d427cb1de7b61f 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -189,6 +189,8 @@ public:
                        PassType passType,
                        const UpdateCallback& callback);
 
+  virtual Argument getLayerOutput(const std::string& layerName);
+
   virtual void onPassEnd();
 
   virtual void finish();
@@ -314,6 +316,8 @@ protected:
   std::vector<Argument> outArgs_;
   hl_stream_t outArgStream_;
 
+  Argument outLayerArgs_;
+
   /// ParameterType which needs to be merged from each GPU
   std::vector<ParameterType> mergeTypes_;
   int numDevices_;         /* number of gpu devices */
@@ -383,6 +387,9 @@ public:
   /// copy the output gradient from the main GradientMachine.
   void copyOutputGrad();
 
+  /// Whether the last copyInArgs() gave this thread a non-empty input slice.
+  bool hasInputData() const { return batchSize_ != 0; }
+
 protected:
   void mergeCpuGradients();
 
@@ -403,7 +410,7 @@ protected:
   void copyGradToBufferThread();
   void gradCollectThread();
 
-  void copyInArgs();
+  int copyInArgs();
   void forward();
   void backward();
   void backwardCallback(Parameter* para);
@@ -463,6 +470,7 @@ protected:
 
   /// indicate whether inArgs is copied before forward()
   bool inArgsCopied_;
+  int batchSize_;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 22051e07ee0026bc3c44a8767e265a56b415b8e4..4512aacc81f86bf87fc9ea30adcf081327663f16 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -293,11 +293,10 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
   }
 }
 
-MatrixPtr NeuralNetwork::getLayerOutput(const std::string& layerName) {
-  auto it = layerMap_.find(layerName);
-  CHECK(it != layerMap_.end()) << "Cannot find layer: " << layerName;
-  return it->second->getOutputValue();
+Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
+  return getLayer(layerName)->getOutput();
 }
+
 void NeuralNetwork::onPassEnd() {
   for (auto& layer : layers_) {
     layer->onPassEnd();
@@ -306,7 +305,6 @@ void NeuralNetwork::onPassEnd() {
 
 class CombinedEvaluator : public Evaluator {
 public:
-  CombinedEvaluator() {}
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
@@ -346,6 +344,55 @@ public:
 
 protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
+
+  // Evaluator interface
+public:
+  /**
+   * @brief getNames will return all inside evaluators' names.
+   * @param names [out]: return names.
+   */
+  void getNames(std::vector<std::string>* names) {
+    for (auto& eval : evaluators_) {
+      eval->getNames(names);
+    }
+  }
+
+  /**
+   * @brief getValue could get all inside evaluators' value.
+   */
+  real getValue(const std::string& name, Error* err) const {
+    return this->getMethodHelper<real>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getValue(name, err);
+        });
+  }
+
+  /**
+   * @brief getType could get all inside evaluators' type.
+   */
+  std::string getType(const std::string& name, Error* err) const {
+    return this->getMethodHelper<std::string>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getType(name, err);
+        });
+  }
+
+private:
+  template <typename T>
+  T getMethodHelper(const std::string& name,
+                    Error* err,
+                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
+                        callback) const {
+    for (auto& eval : evaluators_) {
+      std::vector<std::string> names;
+      eval->getNames(&names);
+      if (std::find(names.begin(), names.end(), name) != names.end()) {
+        return callback(eval);
+      }
+    }
+    *err = Error("No such key %s", name.c_str());
+    return T();
+  }
 };
 
 Evaluator* NeuralNetwork::makeEvaluator() const {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 25af4abcf81700e200feea806fa3daed19df1275..e7b6c438407e7eab6eab1f6ed496f35caa9f2177 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -87,7 +87,8 @@ public:
 
   virtual void backward(const UpdateCallback& callback = nullptr);
 
-  MatrixPtr getLayerOutput(const std::string& layerName);
+  virtual Argument getLayerOutput(const std::string& layerName);
+
   const LayerPtr& getLayer(const std::string& layerName) const {
     auto it = layerMap_.find(layerName);
     CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index a9a9f4f903e305bfe0ee3dd089a85ba524022faa..01158d1dce8d711c67b1ecf29bb644e42ccf6ff5 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -155,7 +155,8 @@ protected:
 public:
   explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     if (!Layer::init(layerMap, parameterMap)) return false;
 
     if (biasParameter_) {
@@ -174,7 +175,7 @@ public:
     }
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     if (biases_) {
       MatrixPtr outV = getOutputValue();
       outV->addBias(*(biases_->getW()), 1);
@@ -182,7 +183,7 @@ public:
     }
   }
 
-  virtual void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     if (biases_) {
       backwardActivation();
       biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
@@ -636,7 +637,7 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() {
 /* create scattered id infomation for all realLayer of inFrameLines one time.
  * If hasSubseq, will also create scattered sequenceStartPositions infomation
  * for all realLayer of inFrameLines one time.
-*/
+ */
 
 void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
                                                  const Argument& input,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 910ca4376bedeac31674c71b9ea1205ef769cda9..c2bc52709ab42bbe21dcc3951f23f2e0b5e6793d 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -107,18 +107,18 @@ public:
       DropCallback;
 
   /**
-    * @brief NormOrDropNodeCallback
-    *
-    * Normalize a path's probabilities or just drop it by modifying path.logProb
-    *
-    * The first parameter is sequence index in a batch
-    *
-    * The second parameter is path.ids
-    *
-    * The third parameter is probabilites for each node in this path.
-    *
-    * The fourth parameter is the probability of the whole path.
-    */
+   * @brief NormOrDropNodeCallback
+   *
+   * Normalize a path's probabilities or just drop it by modifying path.logProb
+   *
+   * The first parameter is sequence index in a batch
+   *
+   * The second parameter is path.ids
+   *
+   * The third parameter is probabilities for each node in this path.
+   *
+   * The fourth parameter is the probability of the whole path.
+   */
   typedef std::function<void(
       int seqId, const std::vector<int>&, std::vector<real>&, real*)>
       NormOrDropNodeCallback;
@@ -348,9 +348,9 @@ protected:
   int targetInfoInlinkId_;
 
   /* create scattered id infomation for all realLayer of inFrameLines one time.
-  *  If hasSubseq, will also create scattered sequenceStartPositions infomation
-  *  for all realLayer of inFrameLines one time.
-  */
+   *  If hasSubseq, will also create scattered sequenceStartPositions infomation
+   *  for all realLayer of inFrameLines one time.
+   */
   void createInFrameInfo(int inlinks_id,
                          const Argument& input,
                          PassType passType);
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 53d3f99cdd3439a1ba85f54526ca65005986c634..4e98c174b462763d3c2714770f66951981afa9f8 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -44,19 +44,20 @@ public:
   /**
    * Intialization of AddtoLayer.
    */
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * Forward propagation.
    * @note There is no weight matrix for each input,
    *       because it just a simple add operation.
    */
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
   /**
    * Backward propagation.
    */
-  void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.cpp b/paddle/gserver/layers/AgentLayer.cpp
index 2d300290279d6aafc162f11dbc809537a308ca79..7b1b99b135e35e5fe41dbb3d053a96e3e31e5cf1 100644
--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -42,7 +42,8 @@ void AgentLayer::forward(PassType passType) {
   // get Arguments from real layers
   if (numSamples_ > 0 && numSamples_ < realHeight) {
     if (realOutput.ids) {
-      output_.ids->subVecFrom(*realOutput.ids, 0, numSamples_);
+      output_.ids =
+          IVector::create(realOutput.ids->getData(), numSamples_, useGpu_);
     } else {
       output_.subArgFrom(
           realOutput, /* offset */ 0, numSamples_, getSize(), useGpu_);
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index 41683ad6712d5df710737cf71c600790fcc8786f..b6dac7ae6fec2d61c60c9548d466233efe9febd5 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -35,7 +35,8 @@ public:
 
   ~AgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   // if *numSamples* set,
   // real layer output will only use first *numSamples* rows
@@ -44,8 +45,8 @@ public:
     numSamples_ = numSamples;
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 /**
@@ -56,8 +57,8 @@ public:
   explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
   ~SequenceAgentLayer() {}
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 /**
@@ -78,7 +79,8 @@ public:
 
   virtual ~GatherAgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   // call before addRealLayer
   void copyIdAndSequenceInfo(const Argument& input,
@@ -88,8 +90,8 @@ public:
   // add one real layer, can call many times
   void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 /**
@@ -133,7 +135,8 @@ public:
 
   virtual ~ScatterAgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * @brief set real layer in generation
@@ -182,8 +185,8 @@ public:
     numSequences_ = numSequences;
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 /**
diff --git a/paddle/gserver/layers/AverageLayer.cpp b/paddle/gserver/layers/AverageLayer.cpp
index b8955ab04f209629c855ed66f8e8e9701b7224a3..96cc4288c6faad4b80c790ed2ce6f5128ea83b6d 100644
--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -26,8 +26,6 @@ bool AverageLayer::init(const LayerMap& layerMap,
                         const ParameterMap& parameterMap) {
   SequencePoolLayer::init(layerMap, parameterMap);
 
-  dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
-  outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
   // average strategy
   if (config_.average_strategy() == "average") {
     mode_ = kAverage;
@@ -60,43 +58,9 @@ void AverageLayer::forward(PassType passType) {
 void AverageLayer::backward(const UpdateCallback& callback) {
   SequencePoolLayer::backward(callback);
 
-  const int* starts = startPositions_->getData(false);
-  MatrixPtr grad = getInputGrad(0);
-
-  if (grad) {
-    size_t dim = getSize();
-    real* gradientData = getInputGrad(0)->getData();
-    real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions_->getSize() - 1;
-    for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
-      // TODO(Dangqingqing) optimization for GPU
-      int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
-      if (0 == sequenceLength) {
-        // empty sequence
-        continue;
-      }
-      dataMtx_->setData(
-          gradientData + starts[sequenceId] * dim, sequenceLength, dim);
-      outMtx_->setData(gradient + sequenceId * dim);
-      switch (mode_) {
-        case kAverage: {
-          // plain average
-          dataMtx_->addBias(*outMtx_, 1.0f / sequenceLength);
-          break;
-        }
-        case kSum: {
-          // sum instead of average
-          dataMtx_->addBias(*outMtx_, 1.0f);
-          break;
-        }
-        case kAverageSquareRootN: {
-          // divide by square root of sequenceLength
-          dataMtx_->addBias(*outMtx_, 1.0f / sqrt(sequenceLength));
-          break;
-        }
-        default: { LOG(FATAL) << "should not reach here"; }
-      }
-    }
+  if (getInputGrad(0)) {
+    getInputGrad(0)->sequenceAvgBackward(
+        *getOutputGrad(), *startPositions_->getVector(useGpu_), mode_);
   }
 }
 
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index b3c4ecec8bc6f56b4563ee9f1ada91e4d8f2cbb5..332552a30479a368c24db10e5ef3a9d59408c8ef 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -38,16 +38,13 @@ public:
   explicit AverageLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
-  ~AverageLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  MatrixPtr outMtx_;
-  MatrixPtr dataMtx_;
   int mode_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index 75bda95de1472b08538b48072ddf9ea607b83299..230bafc31d96bbd49481a7ed135be6888688627e 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -52,7 +52,8 @@ public:
    */
   static Layer* create(const LayerConfig& config);
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * @brief Calculate feature map size. Some input uses frameHeight and
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 195acbbfc58db8368f6db1c1595dd6b04801ee26..f6115801fc6b341c0718f8851617de43bdeeec09 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -33,9 +33,10 @@ public:
 
   ~BatchNormalizationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   /// Epsilon value used in the batch normalization formula.
@@ -58,7 +59,7 @@ protected:
   /// to batch, channels* imagePixels.
   void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
 
-  void onPassEnd() { firstTest_ = true; }
+  void onPassEnd() override { firstTest_ = true; }
 
   MatrixPtr tmpMat_, tmpGrad_;
   MatrixPtr expandedIn_, expandedOut_;
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
index 4ff4b0ea793dc901d099bf73d55aa15463e62094..27c269f2781c99e4f166ef1052cbf03a773ad57e 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ b/paddle/gserver/layers/BilinearInterpLayer.h
@@ -38,9 +38,10 @@ public:
   virtual ~BilinearInterpLayer() {}
 
   size_t getSize();
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index cc96fdd03fcac6925a16f0fb91045f065f74e803..8f347400e60ec84fc1b5fdbc1c911a8768b306d0 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -58,10 +58,11 @@ public:
 
   ~BlockExpandLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp
index fdb46aba68e924480a6595b02c04ff4d1edd914d..191176ce985a8e12e33562f0cab73da6bbe667e6 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.cpp
+++ b/paddle/gserver/layers/CRFDecodingLayer.cpp
@@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap,
     return false;
   }
   crf_.reset(new LinearChainCRF(
-      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr));
+      numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData()));
   return true;
 }
 
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
index 1fd444ad10e71df2bb6d8bdb839e6f02b33d647f..3cbcac6cf62decd43844cc442fc5e4f973d0acfc 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -32,9 +32,10 @@ namespace paddle {
 class CRFDecodingLayer : public CRFLayer {
 public:
   explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   std::unique_ptr<LinearChainCRF> crf_;
diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp
index 02b7aaf17e89d889ca0030f9de2b5d7431a28fd3..0b544420097e9150f8489731b6379dea633e992c 100644
--- a/paddle/gserver/layers/CRFLayer.cpp
+++ b/paddle/gserver/layers/CRFLayer.cpp
@@ -42,6 +42,7 @@ bool CRFLayer::init(const LayerMap& layerMap,
   CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2));
 
   parameter_ = parameters_[0];
+  weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_));
 
   // We don't need sequenceStartPositions because each sample of output_ is
   // for the cost of one sequence.
@@ -69,11 +70,7 @@ void CRFLayer::forward(PassType passType) {
 
   for (size_t i = 0; i < numSequences; ++i) {
     if (i >= crfs_.size()) {
-      crfs_.emplace_back(numClasses_,
-                         parameter_->getBuf(PARAMETER_VALUE)->getData(),
-                         parameter_->getBuf(PARAMETER_GRADIENT)
-                             ? parameter_->getBuf(PARAMETER_GRADIENT)->getData()
-                             : nullptr);
+      crfs_.emplace_back(numClasses_, weight_->getW()->getData());
     }
     output_.value->getData()[i] =
         crfs_[i].forward(output.value->getData() + numClasses_ * starts[i],
@@ -93,22 +90,25 @@ void CRFLayer::backward(const UpdateCallback& callback) {
   const int* starts = label.sequenceStartPositions->getData(false);
   int numSequences = label.sequenceStartPositions->getSize() - 1;
 
+  bool needWGrad = weight_->getWGrad() ? true : false;
   for (int i = 0; i < numSequences; ++i) {
     crfs_[i].backward(output.value->getData() + numClasses_ * starts[i],
-                      output.grad->getData() + numClasses_ * starts[i],
                       label.ids->getData() + starts[i],
-                      starts[i + 1] - starts[i]);
-    if (weightLayer_) {
-      real weight = getInputValue(*weightLayer_)->getElement(i, 0);
-      MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
-      grad->mulScalar(weight);
+                      starts[i + 1] - starts[i],
+                      needWGrad);
+    real instanceWeight = weightLayer_
+                              ? getInputValue(*weightLayer_)->getElement(i, 0)
+                              : real(1.0f);
+    instanceWeight *= coeff_;
+
+    MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]);
+    grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight);
+    if (needWGrad) {
+      weight_->getWGrad()->add(
+          *crfs_[i].getWGrad(), real(1.0f), instanceWeight);
     }
   }
 
-  if (coeff_ != real(1.0f)) {
-    output.grad->mulScalar(coeff_);
-  }
-
   parameter_->incUpdate(callback);
 }
 
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index d21b32b68c1a40c814af3aa2c285612a5f938d79..00ec13cede97401b4c8a308df6fac27e47692146 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -29,16 +29,18 @@ namespace paddle {
 class CRFLayer : public Layer {
 public:
   explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   size_t numClasses_;
   ParameterPtr parameter_;
   std::vector<LinearChainCRF> crfs_;
-  LayerPtr weightLayer_;  // weight for each sequence
-  real coeff_;            // weight for the layer
+  LayerPtr weightLayer_;            // weight for each sequence
+  std::unique_ptr<Weight> weight_;  // parameters
+  real coeff_;                      // weight for the layer
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
index 70d429bad656ade3c05256472d799ae72e128be5..f7a515f312d075c54b4aab2557175c70fdbd9875 100644
--- a/paddle/gserver/layers/CTCLayer.h
+++ b/paddle/gserver/layers/CTCLayer.h
@@ -22,10 +22,11 @@ namespace paddle {
 class CTCLayer : public Layer {
 public:
   explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
   void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
-  virtual void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
   void backwardImp(const UpdateCallback& callback,
                    const Argument& softmaxSeqs,
                    const Argument& labelSeqs);
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index d19adace7d58af16736fc2b6e536f5fd69a19863..c5fc4cf4f81a55a4c57e92dce64c06acd404badd 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -28,10 +28,11 @@ public:
 
   ~ConcatenateLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(concat, ConcatenateLayer);
@@ -101,10 +102,11 @@ public:
 
   ~ConcatenateLayer2() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   std::vector<std::unique_ptr<Projection>> projections_;
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index ee4db219890a135d786c46827632d02d1db5b760..d7042af1c25e7432e5b1efbb89cd8fd3f63fb4ae 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -110,24 +110,24 @@ void ContextProjection::forward() {
   size_t input_dim = in_->value->getWidth();
   size_t dim = out_->value->getWidth();
   CHECK_EQ(dim, input_dim * config_.context_length());
-  size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(static_cast<int>(forward_.size()), 1)
-      << "Only one forward function here";
+  // size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
   bool is_padding = config_.trainable_padding();
   /// first use state_, otherwise use weight_(padding false === w nullptr)
   auto w_ptr =
       state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  auto start_pos = in_->sequenceStartPositions;
-  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
-                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
-                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
-                     Tensor(reinterpret_cast<real*>(
-                                const_cast<int*>(start_pos->getData(useGpu_))),
-                            Dims{start_pos->getSize()})},
-                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
-                    {});
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*in_->value, *start_pos);
+  if (w_ptr) {
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
+                  *start_pos);
+  }
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -162,15 +162,20 @@ void ContextProjection::backward(const UpdateCallback& callback) {
   bool is_padding = config_.trainable_padding();
   auto start_pos = in_->sequenceStartPositions;
   auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
-                             Dims{batch_size, input_dim}),
-                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
-                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
-                      Tensor(reinterpret_cast<real*>(
-                                 const_cast<int*>(start_pos->getData(useGpu_))),
-                             Dims{start_pos->getSize()})},
-                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
-                     {});
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(
+      CpuMatrix(
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      *in_->sequenceStartPositions->getVector(useGpu_),
+      ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
+  backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index aedf4100e32fa1294c361b6163c14eab7869b803..e9d15d94f806a5d2e6f11cbbfc29e291dfe8538f 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -80,7 +80,8 @@ protected:
 public:
   explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5c231986292d2cd26ee30ccc122142fccd5b4949
--- /dev/null
+++ b/paddle/gserver/layers/ConvBaseOperator.cpp
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvBaseOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu)
+    : Operator(config, useGpu) {
+  CHECK(useGpu);
+  CHECK_EQ(config_.input_indices_size(), 2L);
+
+  caffeMode_ = true;
+  getConvParams();
+  computeConvSizes();
+
+  // initialize all to default algorithms
+  fwdAlgo_ = 0;
+  bwdFilterAlgo_ = 0;
+  bwdDataAlgo_ = 0;
+  fwdLimitBytes_ = 0;
+  bwdDataLimitBytes_ = 0;
+  bwdFilterLimitBytes_ = 0;
+  workSpaceInBytes_ = 0;
+  workSpace_ = nullptr;
+
+  isSelectAlgo_ = false;
+}
+
+void ConvBaseOperator::allocConvWorkSpace() {
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_);
+
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+
+  if (maxWorkSpace > workSpaceInBytes_) {
+    if (workSpaceInBytes_ != 0) {
+      hl_free_mem_device(workSpace_);
+    }
+    // total amount of storage needed
+    workSpace_ = hl_malloc_device(maxWorkSpace);
+    workSpaceInBytes_ = maxWorkSpace;
+  }
+}
+
+void ConvBaseOperator::computeConvSizes() {
+  hl_create_filter_descriptor(
+      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
+  hl_create_tensor_descriptor(&imageDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+  hl_create_convolution_descriptor(&convDesc_,
+                                   imageDesc_,
+                                   filterDesc_,
+                                   paddingY_,
+                                   padding_,
+                                   strideY_,
+                                   stride_);
+}
+
+void ConvBaseOperator::reshapeImageDescriptors() {
+  hl_tensor_reshape(imageDesc_,
+                    1,
+                    channels_,
+                    imageH_,
+                    imageW_,
+                    channels_ * imageH_ * imageW_,
+                    imageH_ * imageW_,
+                    imageW_,
+                    1);
+  hl_tensor_reshape(outputDesc_,
+                    1,
+                    numFilters_,
+                    outputH_,
+                    outputW_,
+                    numFilters_ * outputH_ * outputW_,
+                    outputH_ * outputW_,
+                    outputW_,
+                    1);
+  hl_reset_convolution_descriptor(convDesc_,
+                                  imageDesc_,
+                                  filterDesc_,
+                                  paddingY_,
+                                  padding_,
+                                  strideY_,
+                                  stride_);
+}
+
+void ConvBaseOperator::getConvParams() {  // cache conv settings from config_
+  configNumFilters_ = config_.num_filters();
+  const ConvConfig &conf = config_.conv_conf();
+  padding_ = conf.padding();
+  stride_ = conf.stride();
+  filterSize_ = conf.filter_size();
+  paddingY_ = conf.padding_y();
+  strideY_ = conf.stride_y();
+  filterSizeY_ = conf.filter_size_y();
+  filterPixels_ = filterSize_ * filterSizeY_;
+  configChannels_ = conf.channels();
+  imgSize_ = conf.img_size();
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  imgPixels_ = imgSize_ * imgSizeY_;
+  CHECK_EQ(conf.groups(), 1U);  // only a single group is accepted here
+  filterChannels_ = conf.filter_channels();
+  outputX_ = conf.output_x();
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  outputs_ = outputX_ * outputY_;  // width * height (non-square safe)
+
+  isDeconv_ = config_.type() != "conv";  // any other type is deconv
+  if (isDeconv_) {  // deconv swaps the roles of channels and filters
+    channels_ = configNumFilters_;
+    numFilters_ = configChannels_;
+  } else {
+    channels_ = configChannels_;
+    numFilters_ = configNumFilters_;
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..2d42169cde2a80a26edcf98bc2d728e00b075728
--- /dev/null
+++ b/paddle/gserver/layers/ConvBaseOperator.h
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "Operator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvBaseOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvBaseOperator : public Operator {
+public:
+  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvBaseOperator() {
+    if (workSpaceInBytes_ != 0) {
+      hl_free_mem_device(workSpace_);
+      workSpaceInBytes_ = 0;
+    }
+
+    hl_destroy_tensor_descriptor(imageDesc_);
+    hl_destroy_tensor_descriptor(outputDesc_);
+    hl_destroy_filter_descriptor(filterDesc_);
+    hl_destroy_convolution_descriptor(convDesc_);
+  }
+
+protected:
+  /**
+   * Get convolution parameters from layer config and
+   * initialize member variables.
+   */
+  void getConvParams();
+
+  /**
+   * Query cudnn convolution algorithms and allocate their GPU workspace.
+   */
+  void allocConvWorkSpace();
+
+  /**
+   * Create cudnn tensor descriptor for convolution operation.
+   */
+  void computeConvSizes();
+
+  /**
+   * Reshape cudnn tensor descriptor.
+   */
+  void reshapeImageDescriptors();
+
+  /**
+   * Update sizes and descriptors for new input; implemented by subclasses.
+   */
+  virtual void reshape(int batchSize) = 0;
+
+  /**
+   * Check filter size is equal to the size calculated by parameters from
+   * layer config.
+   */
+  void checkFilterSize(const MatrixPtr &filter) {
+    CHECK_EQ(static_cast<int>(filter->getWidth()),
+             filterSize_ * filterSizeY_ * channels_ * numFilters_);
+  }
+
+  /// Most of member variables are same with CudnnConvLayer.
+  /// There is no explanation here.
+  bool isDeconv_;
+  int imageH_, imageW_, outputH_, outputW_;
+  hl_tensor_descriptor imageDesc_;
+  hl_tensor_descriptor outputDesc_;
+  hl_filter_descriptor filterDesc_;
+  hl_convolution_descriptor convDesc_;
+  bool caffeMode_;
+  int inputOffset_, outputOffset_, weightOffset_;
+  int numFilters_, channels_;
+
+  /// from parsing config
+  int configNumFilters_, configChannels_;
+  int padding_, stride_, filterSize_, imgSize_, imgSizeY_;
+  int paddingY_, strideY_, filterSizeY_;
+  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
+
+  /// Following member variables are same with CudnnConvLayer.
+  /// There is no explanation here.
+  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
+  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
+  size_t workSpaceInBytes_;
+  void *workSpace_;
+  bool isSelectAlgo_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d1e932ded595c90cbe6040c330c5c8663d81e2b4
--- /dev/null
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvBaseProjection.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+
+ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
+                                       ParameterPtr parameter,
+                                       bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  CHECK(useGpu);  // only support GPU
+  getConvParams();
+  initCudnn();
+
+  size_t height = filterH_ * filterW_ * channels_ / groups_;
+  size_t width = numFilters_;
+  weight_.reset(new Weight(height, width, parameter));
+  weightOffset_ = height * width / groups_;
+}
+
+void ConvBaseProjection::getConvParams() {
+  const ConvConfig &conf = config_.conv_conf();
+  paddingH_ = conf.padding_y();
+  paddingW_ = conf.padding();
+
+  strideH_ = conf.stride_y();
+  strideW_ = conf.stride();
+
+  filterH_ = conf.filter_size_y();
+  filterW_ = conf.filter_size();
+
+  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  configImgW_ = conf.img_size();
+
+  configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  configOutW_ = conf.output_x();
+
+  configChannels_ = conf.channels();
+  configNumFilters_ = config_.num_filters();
+
+  isDeconv_ = (config_.type() == "conv") ? false : true;
+
+  channels_ = (isDeconv_) ? configNumFilters_ : configChannels_;
+  numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_;
+
+  groups_ = conf.groups();
+  CHECK_EQ(channels_ % groups_, 0);
+  CHECK_EQ(numFilters_ % groups_, 0);
+}
+
+void ConvBaseProjection::initCudnn() {
+  hl_create_filter_descriptor(&filterDesc_,
+                              channels_ / groups_,
+                              numFilters_ / groups_,
+                              filterH_,
+                              filterW_);
+  hl_create_tensor_descriptor(&imageDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+  hl_create_convolution_descriptor(&convDesc_,
+                                   imageDesc_,
+                                   filterDesc_,
+                                   paddingH_,
+                                   paddingW_,
+                                   strideH_,
+                                   strideW_);
+
+  // initialize all to default algorithms
+  fwdAlgo_ = 0;
+  bwdFilterAlgo_ = 0;
+  bwdDataAlgo_ = 0;
+  fwdLimitBytes_ = 0;
+  bwdDataLimitBytes_ = 0;
+  bwdFilterLimitBytes_ = 0;
+  workSpaceInBytes_ = 0;
+
+  batchNum_ = 0;
+  isSelectAlgo_ = false;
+}
+
+void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
+  // The stride between two consecutive samples in the output of ConvProjection
+  // may not be numFilters_ * outputH_ * outputW_ (conv) or
+  // channels_ * imageH_ * imageW_ (deconv)
+  // for example, in the case of layer ConcatenateLayer2 with two
+  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
+  // So the calculation of nStride is different from CudnnConvLayer.
+  size_t nStrideImage, nStrideOutput;
+  if (isDeconv_) {
+    nStrideImage = out_->value->getStride();
+    nStrideOutput = numFilters_ * outputH_ * outputW_;
+  } else {
+    nStrideImage = channels_ * imageH_ * imageW_;
+    nStrideOutput = out_->value->getStride();
+  }
+
+  hl_tensor_reshape(imageDesc_,
+                    batchSize,
+                    channels_ / groups_,
+                    imageH_,
+                    imageW_,
+                    nStrideImage,
+                    imageH_ * imageW_,
+                    imageW_,
+                    1);
+
+  hl_tensor_reshape(outputDesc_,
+                    batchSize,
+                    numFilters_ / groups_,
+                    outputH_,
+                    outputW_,
+                    nStrideOutput,
+                    outputH_ * outputW_,
+                    outputW_,
+                    1);
+
+  hl_reset_convolution_descriptor(convDesc_,
+                                  imageDesc_,
+                                  filterDesc_,
+                                  paddingH_,
+                                  paddingW_,
+                                  strideH_,
+                                  strideW_);
+}
+
+void ConvBaseProjection::reshape(int batchSize) {
+  size_t width = calOutputSize();
+  CHECK_EQ(width, out_->value->getWidth());
+  CHECK_EQ(calInputSize(), in_->value->getWidth());
+
+  isSelectAlgo_ = (batchSize == batchNum_);
+  batchNum_ = batchSize;
+
+  if (!isSelectAlgo_) {
+    reshapeTensorDesc(batchSize);
+    hl_conv_workspace(imageDesc_,
+                      outputDesc_,
+                      filterDesc_,
+                      convDesc_,
+                      &fwdAlgo_,
+                      &fwdLimitBytes_,
+                      &bwdDataAlgo_,
+                      &bwdDataLimitBytes_,
+                      &bwdFilterAlgo_,
+                      &bwdFilterLimitBytes_);
+
+    size_t maxWorkSpace = 0;
+    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+    workSpaceInBytes_ = maxWorkSpace;
+
+    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
+            << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
+  }
+
+  isSelectAlgo_ = true;
+}
+
+void *ConvBaseProjection::getSpaceBytes(size_t size) {
+  std::vector<MemoryHandle *> &convMem = *convMem_;
+  if (convMem.empty()) {  // first use: create one (null) slot per device
+    int numDevices = hl_get_device_count();
+    convMem.resize(numDevices);
+  }
+
+  int devId = hl_get_device();
+  MemoryHandle **localMem = &(convMem[devId]);
+  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {  // grow only
+    *localMem = new GpuMemoryHandle(size);  // NOTE(review): old handle leaked?
+  }
+  return (*localMem)->getBuf();
+}
+
+ConvBaseProjection::~ConvBaseProjection() {
+  hl_destroy_tensor_descriptor(imageDesc_);
+  hl_destroy_tensor_descriptor(outputDesc_);
+  hl_destroy_filter_descriptor(filterDesc_);
+  hl_destroy_convolution_descriptor(convDesc_);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a33aa1837dfc36dbead60deaccbc6b772fe4754
--- /dev/null
+++ b/paddle/gserver/layers/ConvBaseProjection.h
@@ -0,0 +1,116 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Projection.h"
+#include "paddle/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Base class for ConvProjection and ConvTransProjection.
+ */
+class ConvBaseProjection : public Projection {
+public:
+  /**
+   * Constructor.
+   */
+  ConvBaseProjection(const ProjectionConfig& config,
+                     ParameterPtr parameter,
+                     bool useGpu);
+
+  ~ConvBaseProjection();
+
+protected:
+  void getConvParams();
+  void initCudnn();
+
+  void reshapeTensorDesc(int batchSize);
+  void reshape(int batchSize);
+
+  virtual size_t calOutputSize() = 0;
+  virtual size_t calInputSize() = 0;
+
+  static void* getSpaceBytes(size_t size);
+
+  /// True if it's deconv projection layer, false if it's ConvProjection layer
+  bool isDeconv_;
+  /// imageH_ and imageW_ / outputH_ and outputW_
+  /// are calculated from the input layer.
+  int imageH_, imageW_;
+  int outputH_, outputW_;
+  /// configImgH_ and configImgW_ / configOutH_ and configOutW_
+  /// are obtained from config.
+  int configImgH_, configImgW_;
+  int configOutH_, configOutW_;
+  /// channels_ and numFilters_ are defined in terms of convolution semantics
+  int channels_, numFilters_;
+  /// configChannels_ and configNumFilters_ are obtained from config
+  /// For Conv they are the same as channels_ and numFilters_
+  /// For ConvTrans they are opposite to channels_ and numFilters_
+  int configChannels_, configNumFilters_;
+  int paddingH_, paddingW_;
+  int strideH_, strideW_;
+  int filterH_, filterW_;
+  /// One group offset of input data.
+  int inputOffset_;
+  /// One group offset of output data.
+  int outputOffset_;
+  /// One group offset of weight.
+  int weightOffset_;
+  int groups_;
+
+  /// Cudnn tensor descriptor for input.
+  hl_tensor_descriptor imageDesc_;
+  /// Cudnn tensor descriptor for output.
+  hl_tensor_descriptor outputDesc_;
+  /// Cudnn filter descriptor.
+  hl_filter_descriptor filterDesc_;
+  /// Cudnn descriptor for a convolution operation.
+  hl_convolution_descriptor convDesc_;
+
+  /// Record the algorithm for forward convolution, which is obtained by cudnn
+  /// api to search the best suited algorithm.
+  int fwdAlgo_;
+  /// Record the algorithm for computing convolution gradient with respect to
+  /// filter coefficients.
+  int bwdFilterAlgo_;
+  /// Record the algorithm for computing convolution gradient with respect to
+  /// the output.
+  int bwdDataAlgo_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// forward convolution with the specified algo.
+  size_t fwdLimitBytes_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// backwardData with the specified algo.
+  size_t bwdDataLimitBytes_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// backwardFilter with the specified algo.
+  size_t bwdFilterLimitBytes_;
+  /// Size of total work space.
+  size_t workSpaceInBytes_;
+
+  /// Whether to call cuDNN api to choose conv algorithm.
+  bool isSelectAlgo_;
+  /// batchNum is used to record batch size. If the batch size is changed,
+  /// the selection algorithm will be called.
+  int batchNum_;
+  bool bias_;
+
+  std::unique_ptr<Weight> weight_;
+  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp
index f943410dee0dc2f3d356c9d7d8f61398fe2871c8..80932c8c509e3cb013c7e0051cbf4d8ccced0228 100644
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Operator.h"
+#include "ConvOperator.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
 
@@ -27,120 +27,8 @@ namespace paddle {
  * The config file api is conv_operator.
  */
 
-class ConvOperator : public Operator {
-public:
-  ConvOperator(const OperatorConfig &config, bool useGpu);
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvOperator() {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-      workSpaceInBytes_ = 0;
-    }
-
-    hl_destroy_tensor_descriptor(inputDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-    hl_destroy_filter_descriptor(filterDesc_);
-    hl_destroy_convolution_descriptor(convDesc_);
-  }
-  virtual void forward();
-  virtual void backward();
-
-private:
-  /**
-   * Get convolution parameters from layer config and
-   * initialize member variables.
-   */
-  void getConvParams();
-
-  /**
-   * Allocate Gpu Memory for cudnn convolution algorithms.
-   */
-  void allocConvWorkSpace(size_t maxWorkSpace);
-
-  /**
-   * Create cudnn tensor descriptor for convolution operation.
-   */
-  void computeConvSizes();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshapeImageDescriptors();
-
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshape(int batchSize);
-
-  /**
-   * Check filter size is equal to the size calculated by parameters from
-   * layer config.
-   */
-  void checkFilterSize(const MatrixPtr &filter) {
-    CHECK_EQ(static_cast<int>(filter->getWidth()),
-             filterSize_ * filterSizeY_ * channels_ * numFilters_);
-  }
-
-  /// Most of member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int imageH_, imageW_, outputH_, outputW_;
-  hl_tensor_descriptor inputDesc_;
-  hl_tensor_descriptor outputDesc_;
-  hl_filter_descriptor filterDesc_;
-  hl_convolution_descriptor convDesc_;
-  bool caffeMode_;
-  int inputOffset_, outputOffset_, weightOffset_;
-  int numFilters_;
-  int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_;
-  int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
-
-  /// Following member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
-  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
-  size_t workSpaceInBytes_;
-  void *workSpace_;
-  bool isSelectAlgo_;
-};
-
 REGISTER_OPERATOR(conv, ConvOperator);
 
-ConvOperator::ConvOperator(const OperatorConfig &config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK(useGpu);
-  CHECK_EQ(config_.input_indices_size(), 2L);
-
-  caffeMode_ = true;
-  getConvParams();
-  computeConvSizes();
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-  workSpace_ = nullptr;
-
-  isSelectAlgo_ = false;
-}
-
-void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) {
-  if (maxWorkSpace > workSpaceInBytes_) {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-    }
-    // total amount of storage needed
-    workSpace_ = hl_malloc_device(maxWorkSpace);
-    workSpaceInBytes_ = maxWorkSpace;
-  }
-}
-
 void ConvOperator::reshape(int batchSize) {
   imageH_ = ins_[0]->getFrameHeight();
   imageW_ = ins_[0]->getFrameWidth();
@@ -148,106 +36,25 @@ void ConvOperator::reshape(int batchSize) {
   if (imageW_ == 0) imageW_ = imgSize_;
   outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
   outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
-
+  /// Check that the outputSizes are consistent with config
+  CHECK_EQ(outputH_, outputY_);
+  CHECK_EQ(outputW_, outputX_);
   out_->setFrameHeight(outputH_);
   out_->setFrameWidth(outputW_);
 
   reshapeImageDescriptors();
 
-  if (!isSelectAlgo_) {
-    hl_conv_workspace(inputDesc_,
-                      outputDesc_,
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+  inputOffset_ = channels_ * imageH_ * imageW_;
+  outputOffset_ = numFilters_ * outputH_ * outputW_;
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
 
-    allocConvWorkSpace(maxWorkSpace);
+  if (!isSelectAlgo_) {
+    allocConvWorkSpace();
   }
 
   isSelectAlgo_ = true;
 }
 
-void ConvOperator::computeConvSizes() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
-  hl_create_tensor_descriptor(&inputDesc_);
-  int outputX =
-      outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_);
-  int outputY =
-      outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  CHECK_EQ(outputX, outputX_);
-  CHECK_EQ(outputY, outputY_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   inputDesc_,
-                                   filterDesc_,
-                                   paddingY_,
-                                   padding_,
-                                   strideY_,
-                                   stride_);
-}
-
-void ConvOperator::reshapeImageDescriptors() {
-  hl_tensor_reshape(inputDesc_,
-                    1,
-                    channels_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_tensor_reshape(outputDesc_,
-                    1,
-                    numFilters_,
-                    outputH_,
-                    outputW_,
-                    numFilters_ * outputH_ * outputW_,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  inputDesc_,
-                                  filterDesc_,
-                                  paddingY_,
-                                  padding_,
-                                  strideY_,
-                                  stride_);
-  inputOffset_ = channels_ * imageH_ * imageW_;
-  outputOffset_ = numFilters_ * outputH_ * outputW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSize_;
-}
-
-void ConvOperator::getConvParams() {
-  numFilters_ = config_.num_filters();
-  const ConvConfig &conf = config_.conv_conf();
-  padding_ = conf.padding();
-  stride_ = conf.stride();
-  filterSize_ = conf.filter_size();
-  paddingY_ = conf.padding_y();
-  strideY_ = conf.stride_y();
-  filterSizeY_ = conf.filter_size_y();
-  filterPixels_ = filterSize_ * filterSizeY_;
-  channels_ = conf.channels();
-  imgSize_ = conf.img_size();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  imgPixels_ = imgSize_ * imgSizeY_;
-  CHECK_EQ(conf.groups(), 1U);
-  filterChannels_ = conf.filter_channels();
-  outputX_ = conf.output_x();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  outputs_ = outputX_ * outputX_;
-}
-
 void ConvOperator::forward() {
   size_t batchSize = ins_[0]->value->getHeight();
   reshape(batchSize);
@@ -264,7 +71,7 @@ void ConvOperator::forward() {
       real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
       real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
       real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_forward(inputDesc_,
+      hl_convolution_forward(imageDesc_,
                              inputData,
                              outputDesc_,
                              outData,
@@ -287,7 +94,7 @@ void ConvOperator::backward() {
       if (ins_[1]->grad) {
         real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
         real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(inputDesc_,
+        hl_convolution_backward_filter(imageDesc_,
                                        inputData,
                                        outputDesc_,
                                        outGrad,
@@ -303,7 +110,7 @@ void ConvOperator::backward() {
       if (NULL != preGrad) {
         real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
         real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_data(inputDesc_,
+        hl_convolution_backward_data(imageDesc_,
                                      inputGrad,
                                      outputDesc_,
                                      outGrad,
diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f3546c67ac174628044d5fb6e5c7bce06f37995
--- /dev/null
+++ b/paddle/gserver/layers/ConvOperator.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The two inputs have the same height. Each row of the first input is
+ * convolved with the matching row of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvOperator : public ConvBaseOperator {
+public:
+  ConvOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Intentionally empty; resources are presumably released by ConvBaseOperator.
+   */
+  virtual ~ConvOperator() {}
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index 0281170bc59855f6f4d2f4212523275a92d202d5..5b7ecc5560c1e7431305b34a331fe1fbc96c6b06 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -19,149 +19,32 @@ namespace paddle {
 
 REGISTER_PROJECTION(conv, ConvProjection);
 
-ThreadLocalD<std::vector<MemoryHandle *>> ConvProjection::convMem_;
-
-ConvProjection::ConvProjection(const ProjectionConfig &config,
-                               ParameterPtr parameter,
-                               bool useGpu)
-    : Projection(config, parameter, useGpu) {
-  CHECK(useGpu);  // only support GPU
-  getConvParams();
-  initCudnn();
-
-  size_t height = filterH_ * filterW_ * channels_ / groups_;
-  size_t width = numFilters_;
-  weight_.reset(new Weight(height, width, parameter));
-  weightOffset_ = height * width / groups_;
-}
-
-void ConvProjection::getConvParams() {
-  const ConvConfig &conf = config_.conv_conf();
-  paddingH_ = conf.padding_y();
-  paddingW_ = conf.padding();
-
-  strideH_ = conf.stride_y();
-  strideW_ = conf.stride();
-
-  filterH_ = conf.filter_size_y();
-  filterW_ = conf.filter_size();
-
-  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  configImgW_ = conf.img_size();
-
-  channels_ = conf.channels();
-  numFilters_ = config_.num_filters();
-
-  groups_ = conf.groups();
-  CHECK_EQ(channels_ % groups_, 0);
-  CHECK_EQ(numFilters_ % groups_, 0);
-}
-
-void ConvProjection::initCudnn() {
-  hl_create_filter_descriptor(&filterDesc_,
-                              channels_ / groups_,
-                              numFilters_ / groups_,
-                              filterH_,
-                              filterW_);
-  hl_create_tensor_descriptor(&inputDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   inputDesc_,
-                                   filterDesc_,
-                                   paddingH_,
-                                   paddingW_,
-                                   strideH_,
-                                   strideW_);
-
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-
-  batchNum_ = 0;
-  isSelectAlgo_ = false;
-}
-
-void ConvProjection::reshapeTensorDesc(int batchSize) {
-  hl_tensor_reshape(inputDesc_,
-                    batchSize,
-                    channels_ / groups_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  inputDesc_,
-                                  filterDesc_,
-                                  paddingH_,
-                                  paddingW_,
-                                  strideH_,
-                                  strideW_);
-
-  // The stride between two consecutive images in ConvProjection may not be 1,
-  // for example, in the case of layer ConcatenateLayer2 with two
-  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
-  // So the calculation of nStride is different from CudnnConvLayer.
-  // In fact, only "nStride = out_->value->getStride()" is ok.
-  size_t nStride = numFilters_ * outputH_ * outputW_;
-  if (out_->value->isContiguous()) {
-    CHECK_EQ(nStride, out_->value->getWidth());
-  } else {
-    nStride = out_->value->getStride();
-  }
-
-  hl_tensor_reshape(outputDesc_,
-                    batchSize,
-                    numFilters_ / groups_,
-                    outputH_,
-                    outputW_,
-                    nStride,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
+size_t ConvProjection::calOutputSize() {
+  imageH_ = in_->getFrameHeight();
+  imageW_ = in_->getFrameWidth();
+  if (imageH_ == 0) imageH_ = configImgH_;  // 0 -> fall back to configImgH_
+  if (imageW_ == 0) imageW_ = configImgW_;  // 0 -> fall back to configImgW_
+  outputH_ = outputSize(imageH_,
+                        filterH_,
+                        paddingH_,
+                        strideH_,
+                        /* caffeMode */ true);
+  outputW_ = outputSize(imageW_,
+                        filterW_,
+                        paddingW_,
+                        strideW_,
+                        /* caffeMode */ true);
+
+  const_cast<Argument *>(out_)->setFrameHeight(outputH_);  // also updates out_
+  const_cast<Argument *>(out_)->setFrameWidth(outputW_);
+
+  inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_;  // per group
+  outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_;  // ditto
+  return outputH_ * outputW_ * configNumFilters_;  // all filters, not per group
 }
 
-void ConvProjection::reshape(int batchSize) {
-  size_t width = calOutputSize();
-  CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_),
-           in_->value->getWidth())
-      << "Wrong input size for convolution"
-      << " channels=" << channels_ << " imageH=" << imageH_
-      << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
-
-  isSelectAlgo_ = (batchSize == batchNum_);
-  batchNum_ = batchSize;
-
-  if (!isSelectAlgo_) {
-    reshapeTensorDesc(batchSize);
-    hl_conv_workspace(inputDesc_,
-                      outputDesc_,
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-    workSpaceInBytes_ = maxWorkSpace;
-
-    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-            << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-  }
-
-  isSelectAlgo_ = true;
+size_t ConvProjection::calInputSize() {  // reads imageH_/imageW_ as last set
+  return static_cast<size_t>(configChannels_ * imageH_ * imageW_);
 }
 
 void ConvProjection::forward() {
@@ -179,7 +62,7 @@ void ConvProjection::forward() {
     real *inputData = in_->value->getData() + g * inputOffset_;
     real *wgtData = weight_->getW()->getData() + g * weightOffset_;
     real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_forward(inputDesc_,
+    hl_convolution_forward(imageDesc_,
                            inputData,
                            outputDesc_,
                            outData,
@@ -205,7 +88,7 @@ void ConvProjection::backward(const UpdateCallback &callback) {
     if (weight_->getWGrad()) {
       real *inputData = in_->value->getData() + g * inputOffset_;
       real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(inputDesc_,
+      hl_convolution_backward_filter(imageDesc_,
                                      inputData,
                                      outputDesc_,
                                      outGrad,
@@ -221,7 +104,7 @@ void ConvProjection::backward(const UpdateCallback &callback) {
     if (NULL != preGrad) {
       real *inputGrad = preGrad->getData() + g * inputOffset_;
       real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_backward_data(inputDesc_,
+      hl_convolution_backward_data(imageDesc_,
                                    inputGrad,
                                    outputDesc_,
                                    outGrad,
@@ -237,26 +120,4 @@ void ConvProjection::backward(const UpdateCallback &callback) {
   weight_->getParameterPtr()->incUpdate(callback);
 }
 
-void *ConvProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
-  if (convMem.empty()) {
-    int numDevices = hl_get_device_count();
-    convMem.resize(numDevices);
-  }
-
-  int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
-  }
-  return (*localMem)->getBuf();
-}
-
-ConvProjection::~ConvProjection() {
-  hl_destroy_tensor_descriptor(inputDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_filter_descriptor(filterDesc_);
-  hl_destroy_convolution_descriptor(convDesc_);
-}
-
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h
index c32e5e1d3ab2f85feb6dd2fb5fbddd7482598e58..b7d7cc9a275529a02a5d8e82d28ed79cb7ce0b43 100644
--- a/paddle/gserver/layers/ConvProjection.h
+++ b/paddle/gserver/layers/ConvProjection.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "Projection.h"
+#include "ConvBaseProjection.h"
 #include "paddle/math/MathUtils.h"
 
 namespace paddle {
@@ -22,109 +22,22 @@ namespace paddle {
 /**
  * @brief Convolution projection do the same calculation with CudnnConvLayer.
  */
-class ConvProjection : public Projection {
+class ConvProjection : public ConvBaseProjection {
 public:
   /**
    * Constructor.
    */
   ConvProjection(const ProjectionConfig& config,
                  ParameterPtr parameter,
-                 bool useGpu);
+                 bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
 
-  ~ConvProjection();
+  ~ConvProjection() {}
 
   virtual void forward();
   virtual void backward(const UpdateCallback& callback);
-
-protected:
-  void getConvParams();
-  void initCudnn();
-
-  void reshapeTensorDesc(int batchSize);
-  void reshape(int batchSize);
-
-  size_t calOutputSize() {
-    imageH_ = in_->getFrameHeight();
-    imageW_ = in_->getFrameWidth();
-    if (imageH_ == 0) imageH_ = configImgH_;
-    if (imageW_ == 0) imageW_ = configImgW_;
-    outputH_ = outputSize(imageH_,
-                          filterH_,
-                          paddingH_,
-                          strideH_,
-                          /* caffeMode */ true);
-    outputW_ = outputSize(imageW_,
-                          filterW_,
-                          paddingW_,
-                          strideW_,
-                          /* caffeMode */ true);
-
-    const_cast<Argument*>(out_)->setFrameHeight(outputH_);
-    const_cast<Argument*>(out_)->setFrameWidth(outputW_);
-
-    inputOffset_ = (channels_ / groups_) * imageH_ * imageW_;
-    outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_;
-    return outputH_ * outputW_ * numFilters_;
-  }
-
-  static void* getSpaceBytes(size_t size);
-
-  /// imageH_ and imageW_ is calculated from the input layer.
-  int imageH_, imageW_;
-  /// configImgH_ and configImgW_ is obtained from config.
-  int configImgH_, configImgW_;
-  int outputH_, outputW_;
-  int channels_, numFilters_;
-  int paddingH_, paddingW_;
-  int strideH_, strideW_;
-  int filterH_, filterW_;
-  /// One group offset of input data.
-  int inputOffset_;
-  /// One group offset of output data.
-  int outputOffset_;
-  /// One group offset of weight.
-  int weightOffset_;
-  int groups_;
-
-  /// Cudnn tensor descriptor for input.
-  hl_tensor_descriptor inputDesc_;
-  /// Cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// Cudnn tensor descriptor for filter.
-  hl_filter_descriptor filterDesc_;
-  /// Cudnn tensor descriptor for a convolution operation.
-  hl_convolution_descriptor convDesc_;
-
-  /// Record the algorithm for forward convolution, which is obtained by cudnn
-  /// api to search the best suited algorithm.
-  int fwdAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// filter coefficients.
-  int bwdFilterAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// the output.
-  int bwdDataAlgo_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// forward convolution with the specified algo.
-  size_t fwdLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardFilter with the specified algo.
-  size_t bwdDataLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardData with the specified algo.
-  size_t bwdFilterLimitBytes_;
-  /// Size of total work space.
-  size_t workSpaceInBytes_;
-
-  /// Whether to call cuDNN api to choose conv algorithm.
-  bool isSelectAlgo_;
-  /// batchNum is used to record batch size. If the batch size is changed,
-  /// the selection algorithm will be called.
-  int batchNum_;
-  bool bias_;
-
-  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index 9bfb1ab7a47b11a6793159aefcb4f9fa12b81a6b..002be415691f0b3df93835915dcbc9d455231422 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -47,10 +47,11 @@ public:
 
   ~ConvShiftLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(conv_shift, ConvShiftLayer);
diff --git a/paddle/gserver/layers/ConvTransOperator.cpp b/paddle/gserver/layers/ConvTransOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..db026337a473f7edf1a7c0db320f60ff3048eb9c
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransOperator.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvTransOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the transposed
+ * convolution. The first input is the input data, and the second input is the
+ * convolution kernel. The two inputs have the same height. Each row of the
+ * first input is convolved with the matching row of the second independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+REGISTER_OPERATOR(convt, ConvTransOperator);
+
+void ConvTransOperator::reshape(int batchSize) {  // batchSize is not read here
+  outputH_ = ins_[0]->getFrameHeight();
+  outputW_ = ins_[0]->getFrameWidth();
+  if (outputH_ == 0) outputH_ = outputY_;  // 0 -> fall back to outputY_
+  if (outputW_ == 0) outputW_ = outputX_;  // 0 -> fall back to outputX_
+  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
+  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
+  /// Check that the imageSizes are consistent with config
+  CHECK_EQ(imageH_, imgSizeY_);
+  CHECK_EQ(imageW_, imgSize_);
+  out_->setFrameHeight(imageH_);
+  out_->setFrameWidth(imageW_);
+
+  reshapeImageDescriptors();
+
+  inputOffset_ = numFilters_ * outputH_ * outputW_;  // step per batchId (input)
+  outputOffset_ = channels_ * imageH_ * imageW_;  // step per batchId (output)
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
+
+  if (!isSelectAlgo_) {  // skipped once isSelectAlgo_ has been set below
+    allocConvWorkSpace();
+  }
+
+  isSelectAlgo_ = true;
+}
+
+void ConvTransOperator::forward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  reshape(batchSize);  // refreshes imageH_/imageW_ and the three offsets
+  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);  // one kernel row per sample
+  checkFilterSize(ins_[1]->value);
+  Matrix::resizeOrCreate(
+      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {  // one call per row
+      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+      real *outData = out_->value->getData() + outputOffset_ * batchId;
+      hl_convolution_backward_data(imageDesc_,  // forward uses backward-data
+                                   outData,  // result is written here
+                                   outputDesc_,
+                                   inputData,  // passed in the out-grad slot
+                                   filterDesc_,
+                                   wgtData,
+                                   convDesc_,
+                                   workSpace_,
+                                   workSpaceInBytes_,
+                                   bwdDataAlgo_);
+    }
+  }
+}
+
+void ConvTransOperator::backward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {  // per batch row
+      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
+      if (ins_[1]->grad) {  // kernel gradient only if the kernel input has grad
+        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
+        hl_convolution_backward_filter(imageDesc_,
+                                       outGrad,  // paired with imageDesc_
+                                       outputDesc_,
+                                       inputData,  // paired with outputDesc_
+                                       filterDesc_,
+                                       weightGrad,
+                                       convDesc_,
+                                       workSpace_,
+                                       workSpaceInBytes_,
+                                       bwdFilterAlgo_);
+      }
+
+      MatrixPtr preGrad = ins_[0]->grad;
+      if (NULL != preGrad) {  // input gradient only if the first input has grad
+        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
+        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+        hl_convolution_forward(imageDesc_,  // input grad via the forward routine
+                               outGrad,
+                               outputDesc_,
+                               inputGrad,  // result is written here
+                               filterDesc_,
+                               wgtData,
+                               convDesc_,
+                               workSpace_,
+                               workSpaceInBytes_,
+                               fwdAlgo_);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h
new file mode 100644
index 0000000000000000000000000000000000000000..ca08dc9aa77d59b45635c16cdd5064c5c3b5f96d
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransOperator.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the transposed
+ * convolution. The first input is the input data, and the second input is the
+ * convolution kernel. The two inputs have the same height. Each row of the
+ * first input is convolved with the matching row of the second independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvTransOperator : public ConvBaseOperator {
+public:
+  ConvTransOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Intentionally empty; resources are presumably released by ConvBaseOperator.
+   */
+  virtual ~ConvTransOperator() {}
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48132a3ce4cc4b50fea6d755d84d7254d2055bec
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvTransProjection.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_PROJECTION(convt, ConvTransProjection);
+size_t ConvTransProjection::calOutputSize() {
+  outputH_ = in_->getFrameHeight();
+  outputW_ = in_->getFrameWidth();
+  if (outputH_ == 0) outputH_ = configOutH_;  // 0 -> fall back to configOutH_
+  if (outputW_ == 0) outputW_ = configOutW_;  // 0 -> fall back to configOutW_
+  imageH_ = imageSize(outputH_,
+                      filterH_,
+                      paddingH_,
+                      strideH_,
+                      /* caffeMode */ true);
+
+  imageW_ = imageSize(outputW_,
+                      filterW_,
+                      paddingW_,
+                      strideW_,
+                      /* caffeMode */ true);
+
+  const_cast<Argument *>(out_)->setFrameHeight(imageH_);  // also updates out_
+  const_cast<Argument *>(out_)->setFrameWidth(imageW_);
+
+  inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_;  // per group
+  outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_;  // ditto
+  return imageH_ * imageW_ * configNumFilters_;  // all filters, not per group
+}
+
+size_t ConvTransProjection::calInputSize() {  // reads outputH_/outputW_ as set
+  return static_cast<size_t>(configChannels_ * outputH_ * outputW_);
+}
+
+void ConvTransProjection::forward() {
+  int batchSize = in_->value->getHeight();
+  reshape(batchSize);
+
+  void *workSpace = NULL;  // stays NULL when workSpaceInBytes_ is 0
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {  // one cuDNN call per group
+    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
+
+    real *inData = in_->value->getData() + g * inputOffset_;
+    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+    real *outData = out_->value->getData() + g * outputOffset_;
+    hl_convolution_backward_data(imageDesc_,  // forward uses backward-data
+                                 outData,  // result is written here
+                                 outputDesc_,
+                                 inData,  // passed in the out-grad slot
+                                 filterDesc_,
+                                 wgtData,
+                                 convDesc_,
+                                 workSpace,
+                                 bwdDataLimitBytes_,
+                                 bwdDataAlgo_);
+  }
+}
+
+void ConvTransProjection::backward(const UpdateCallback &callback) {
+  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
+
+  void *workSpace = NULL;  // stays NULL when workSpaceInBytes_ is 0
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {  // one pass per group
+    real *outGrad = out_->grad->getData() + g * outputOffset_;
+    if (weight_->getWGrad()) {  // weight gradient only if a grad buffer exists
+      real *inData = in_->value->getData() + g * inputOffset_;
+      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
+      hl_convolution_backward_filter(imageDesc_,
+                                     outGrad,  // paired with imageDesc_
+                                     outputDesc_,
+                                     inData,  // paired with outputDesc_
+                                     filterDesc_,
+                                     weightGrad,
+                                     convDesc_,
+                                     workSpace,
+                                     bwdFilterLimitBytes_,
+                                     bwdFilterAlgo_);
+    }
+
+    MatrixPtr preGrad = in_->grad;
+    if (NULL != preGrad) {  // input gradient only if the input has a grad
+      real *inGrad = preGrad->getData() + g * inputOffset_;
+      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+      hl_convolution_forward(imageDesc_,  // input grad via the forward routine
+                             outGrad,
+                             outputDesc_,
+                             inGrad,  // result is written here
+                             filterDesc_,
+                             wgtData,
+                             convDesc_,
+                             workSpace,
+                             fwdLimitBytes_,
+                             fwdAlgo_);
+    }
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);  // runs after all groups
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..6508d17b2409aa0cc11cdafb306604816f010718
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransProjection.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ConvBaseProjection.h"
+#include "paddle/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Transposed convolution projection built on ConvBaseProjection.
+ */
+class ConvTransProjection : public ConvBaseProjection {
+public:
+  /**
+   * Constructor; forwards all arguments to ConvBaseProjection unchanged.
+   */
+  ConvTransProjection(const ProjectionConfig& config,
+                      ParameterPtr parameter,
+                      bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
+
+  ~ConvTransProjection() {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index ed57f2af3c6455fb89fd05b37bb205e8da0bf7e1..32eb3bf604acaa8f2060882b545efeeb40f8218d 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -49,10 +49,11 @@ public:
 
   ~ConvexCombinationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp
index 254120443dc3d41bf2422be2e88cb376d70c93d4..57ba124e40cbd098fa8b0012ff31d6935b16862a 100644
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ b/paddle/gserver/layers/CosSimLayer.cpp
@@ -26,15 +26,23 @@ bool CosSimLayer::init(const LayerMap& layerMap,
   Layer::init(layerMap, parameterMap);
 
   CHECK_EQ(inputLayers_.size(), 2LU);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
   return true;
 }
 
 void CosSimLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   int batchSize = getInputValue(0)->getHeight();
   int size = getSize();
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
 
   {
     REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
@@ -42,26 +50,43 @@ void CosSimLayer::forward(PassType passType) {
   }
 
   MatrixPtr outV = getOutputValue();
-
   /* activation */ {
     REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
     MatrixPtr prevOut1 = getInputValue(0);
     MatrixPtr prevOut2 = getInputValue(1);
-    outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale());
+
+    CHECK(outV && prevOut1 && prevOut2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*prevOut1);
+    inputs.addArg(*prevOut2);
+    outputs.addArg(*outV, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
   }
 }
 
 void CosSimLayer::backward(const UpdateCallback& callback) {
   /* activation */ {
     REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
-    MatrixPtr outG = this->getOutputGrad();
-
-    outG->cosSimDerivative(*this->getOutputValue(),
-                           *getInputValue(0),
-                           *getInputValue(1),
-                           *getInputGrad(0),
-                           *getInputGrad(1),
-                           config_.cos_scale());
+    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
+    // Gather every buffer up front so a missing one fails the CHECK below.
+    const auto outG = this->getOutputGrad();
+    const auto outV = this->getOutputValue();
+    const auto inV1 = this->getInputValue(0);
+    const auto inV2 = this->getInputValue(1);
+    auto inG1 = this->getInputGrad(0);
+    auto inG2 = this->getInputGrad(1);
+    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*outG);
+    inputs.addArg(*outV);
+    inputs.addArg(*inV1);
+    inputs.addArg(*inV2);
+    outputs.addArg(*inG1, ADD_TO);  // presumably accumulates — TODO confirm
+    outputs.addArg(*inG2, ADD_TO);
+    // Run the "CosSimBackward" function created in init().
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index 5dcc5d8a5b4dc76cb6cea023a874049731a26516..8afaee62c2dcacba006846df0111fcbe8f7575e4 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -28,7 +28,7 @@ namespace paddle {
  *
  * - Input1: A vector (batchSize * dataDim) *
  * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
- * - Output: A vector (dataDim * 1)
+ * - Output: A vector (batchSize * 1)
  *
  * The config file api is cos_sim.
  */
@@ -38,10 +38,11 @@ public:
 
   ~CosSimLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index ad490b0b8c4656c1eabf519233f2386b4b6e9417..0f887d8adfa053e8fe88ac4fa4e2a9ba08ac07b5 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
-
 /**
  * @brief A layer for computing cosine similarity between a vector
  * and each row of a matrix
@@ -46,10 +45,11 @@ public:
 
   ~CosSimVecMatLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
@@ -97,11 +97,22 @@ bool CosSimVecMatLayer::init(const LayerMap& layerMap,
                            dataDim,
                            /* trans= */ false,
                            useGpu_);
+
+  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
   return true;
 }
 
 void CosSimVecMatLayer::forward(PassType passType) {
   Layer::forward(passType);
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
 
   MatrixPtr inV0 = getInputValue(0);
   MatrixPtr inV1 = getInputValue(1);
@@ -117,17 +128,25 @@ void CosSimVecMatLayer::forward(PassType passType) {
   }
 
   MatrixPtr outV = getOutputValue();
-
+  CHECK(outV && inV0 && inV1);
   REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
   for (size_t i = 0; i < batchSize; i++) {
     tmpRow0->setData(inV0->rowBuf(i));
     tmpMtx0->setData(inV1->rowBuf(i));
     tmpRow2->setData(outV->rowBuf(i));
-    tmpRow2->cosSim(*(tmpMtx0), *(tmpRow0), config_.cos_scale());
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpRow2, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
   }
 }
 
 void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
+  CHECK_EQ(backward_.size(), 1UL) << "Only one forward function needed";
+
   MatrixPtr inV0 = getInputValue(0);
   MatrixPtr inV1 = getInputValue(1);
   MatrixPtr inG0 = getInputGrad(0);
@@ -136,27 +155,27 @@ void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
   MatrixPtr outG = getOutputGrad();
 
   size_t batchSize = inV0->getHeight();
-
+  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
   REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
 
-  if (inG0 && inG1) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inV0->rowBuf(i));
-      tmpRow1->setData(inG0->rowBuf(i));
-      tmpMtx0->setData(inV1->rowBuf(i));
-      tmpMtx1->setData(inG1->rowBuf(i));
-      tmpRow2->setData(outV->rowBuf(i));
-      tmpRow3->setData(outG->rowBuf(i));
-
-      tmpRow3->cosSimDerivative(*(tmpRow2),
-                                *(tmpMtx0),
-                                *(tmpRow0),
-                                *(tmpMtx1),
-                                *(tmpRow1),
-                                config_.cos_scale());
-    }
-  } else {
-    CHECK(!inG0 || !inG1) << "Not supported";
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpRow1->setData(inG0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpMtx1->setData(inG1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+    tmpRow3->setData(outG->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpRow3);
+    inputs.addArg(*tmpRow2);
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpMtx1, ADD_TO);
+    outputs.addArg(*tmpRow1, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 7e9519f6b3af50bf47b660b285c3593087f80271..69d5830dd2a1afb93948eacec1cb4309cf8c6109 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -192,6 +192,59 @@ void SumOfSquaresCostLayer::backwardImp(Matrix& output,
   outputG.sumOfSquaresBp(output, *label.value);
 }
 
+//
+// class SmoothL1CostLayer
+//
+
+REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
+
+bool SmoothL1CostLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);  // no layer-specific setup
+}
+
+void SmoothL1CostLayer::forwardImp(Matrix& output,
+                                   Argument& label,
+                                   Matrix& target) {
+  if (!useGpu_) {
+    target.smoothL1(output, *label.value);
+    return;
+  }
+  // GPU mode: evaluate smoothL1 on non-GPU copies, then copy the cost back.
+  MatrixPtr targetCpu =
+      Matrix::create(target.getHeight(), target.getWidth(), false, false);
+  MatrixPtr outputCpu =
+      Matrix::create(output.getHeight(), output.getWidth(), false, false);
+  MatrixPtr labelCpu = Matrix::create(
+      label.value->getHeight(), label.value->getWidth(), false, false);
+  targetCpu->copyFrom(target);
+  outputCpu->copyFrom(output);
+  labelCpu->copyFrom(*label.value);
+  targetCpu->smoothL1(*outputCpu, *labelCpu);
+  target.copyFrom(*targetCpu);
+}
+
+void SmoothL1CostLayer::backwardImp(Matrix& output,
+                                    Argument& label,
+                                    Matrix& outputG) {
+  if (!useGpu_) {
+    outputG.smoothL1Bp(output, *label.value);
+    return;
+  }
+  // GPU mode: run smoothL1Bp on non-GPU copies, then copy the gradient back.
+  MatrixPtr outputGCpu =
+      Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
+  MatrixPtr outputCpu =
+      Matrix::create(output.getHeight(), output.getWidth(), false, false);
+  MatrixPtr labelCpu = Matrix::create(
+      label.value->getHeight(), label.value->getWidth(), false, false);
+  outputGCpu->copyFrom(outputG);
+  outputCpu->copyFrom(output);
+  labelCpu->copyFrom(*label.value);
+  outputGCpu->smoothL1Bp(*outputCpu, *labelCpu);
+  outputG.copyFrom(*outputGCpu);
+}
+
 //
 // class RankingCost
 //
@@ -367,8 +420,6 @@ void LambdaCost::backward(const UpdateCallback& callback) {
   getInputGrad(0)->add(*marginGrad_);
 }
 
-void LambdaCost::onPassEnd() {}
-
 void LambdaCost::calcGrad(const real* outputScore,
                           const real* score,
                           real* gradData,
@@ -611,14 +662,15 @@ class SumCostLayer : public Layer {
 public:
   explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     if (!ret) return ret;
     CHECK_EQ(inputLayers_.size(), 1UL);
     return true;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     const MatrixPtr& input = getInputValue(0);
 
@@ -629,7 +681,7 @@ public:
     output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
   }
 
-  virtual void backward(const UpdateCallback& callback = nullptr) {
+  void backward(const UpdateCallback& callback = nullptr) override {
     getInputGrad(0)->add((real)1);
   }
 };
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 7f73bdb3f7d63ef1c8d76deb64f40d19d20f87c7..14c0b33ec1a628521ae2d694dda8da553c29fd38 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -32,15 +32,16 @@ class CostLayer : public Layer {
 public:
   explicit CostLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
   LayerPtr getLabelLayer() { return inputLayers_[1]; }
 
-  virtual void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   virtual void forwardImp(Matrix& outputValue,
                           Argument& label,
@@ -68,11 +69,14 @@ public:
   explicit MultiClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -95,11 +99,14 @@ public:
   explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
 protected:
   MatrixPtr sftMaxSum_;
@@ -117,11 +124,14 @@ public:
   explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
 protected:
   MatrixPtr targetPerDim_;
@@ -139,11 +149,39 @@ public:
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
+/**
+ * This cost layer computes the smooth L1 loss for real-valued regression
+ * tasks.
+ * \f[
+ * L =
+ *   0.5 * x^2    if / |x| < 1 /
+ *   |x| - 0.5    / otherwise /
+ * \f]
+ *
+ * x = output - label
+ */
+class SmoothL1CostLayer : public CostLayer {
+public:
+  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -162,17 +200,18 @@ class RankingCost : public Layer {
 public:
   explicit RankingCost(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
 
   LayerPtr getLabelLayer() { return inputLayers_[2]; }
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
   void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
     (void)output;
@@ -214,17 +253,16 @@ class LambdaCost : public Layer {
 public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
   LayerPtr getScoreLayer() { return inputLayers_[1]; }
 
-  void forward(PassType passType);
-
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
 
-  void onPassEnd();
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   real calcNDCG(const real* outputScore, const real* score, int size);
   void calcGrad(const real* outputScore,
@@ -256,11 +294,14 @@ public:
   explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -282,13 +323,16 @@ class HuberTwoClass : public CostLayer {
 public:
   explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
   void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
   void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fbccc11032caa4878ce8dcfe7c34a261acee68b
--- /dev/null
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "NormLayer.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+// Wraps sample `iter` of `data` as a channels_ x spatialDim matrix.
+MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
+                                                    size_t iter,
+                                                    size_t spatialDim) {
+  // Each sample spans channels_ * spatialDim consecutive elements.
+  real* sampleBegin = data->getData() + iter * channels_ * spatialDim;
+  return Matrix::create(
+      sampleBegin, channels_, spatialDim, /* trans= */ false, useGpu_);
+}
+
+MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
+                                                     size_t iter,
+                                                     size_t spatialDim) {
+  real* rowBegin = data->getData() + iter * spatialDim;  // 1 x spatialDim
+  return Matrix::create(rowBegin, 1, spatialDim, /* trans= */ false, useGpu_);
+}
+
+void CrossChannelNormLayer::forward(PassType passType) {
+  // Normalize each spatial position across channels, then scale per channel.
+  Layer::forward(passType);
+  MatrixPtr inV = getInputValue(0);
+
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = inV->getWidth();
+  CHECK_EQ(getSize(), dataDim);
+
+  reserveOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+  size_t spatialDim = dataDim / channels_;
+
+  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
+  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
+  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
+  inV->square2(*dataBuffer_);
+  for (size_t i = 0; i < batchSize; i++) {
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
+    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    // compute norm: sum of squares over channels, plus eps, then sqrt.
+    spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    // add eps before sqrt so an all-zero position cannot divide by zero
+    spatialBuffer_->add((real)1e-6);
+    spatialBuffer_->sqrt2(*spatialBuffer_);
+    normTmp->copyFrom(*spatialBuffer_);  // kept for backward()
+    outVTmp->copyFrom(*inVTmp);
+    outVTmp->divRowVector(*spatialBuffer_);
+    // scale the layer.
+    outVTmp->mulColVector(*scale_->getW());
+  }
+}
+
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
+  // Computes the gradient of scale_ and the gradient w.r.t. the layer input.
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+
+  size_t batchSize = inG->getHeight();
+  size_t spatialDim = inG->getWidth() / channels_;
+
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // accumulate this sample's sum(outG .* outV) / scale into scaleDiff_
+    scaleDiff_->add(*channelBuffer_, 1.);
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
+    // scaleDest is 0: spatialBuffer_ still holds values from earlier steps
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
+    // scale the grad
+    inGTmp->copyFrom(*inVTmp);
+    inGTmp->mulRowVector(*spatialBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    inGTmp->divRowVector(*spatialBuffer_);
+    // subtract
+    inGTmp->add(*outGTmp, -1, 1);
+    // divide by norm
+    inGTmp->divRowVector(*normTmp);
+    // scale the diff
+    inGTmp->mulColVector(*scale_->getW());
+  }
+  // update scale
+  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index b1e7d2082f1443313bfc858a17adfd737ecff98f..413efd4d3ecd734b343efbcf8328ac0592daddda 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -35,14 +35,15 @@ public:
 
   ~CudnnBatchNormLayer();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
   /**
    * reshape tensor of ioDesc_.
    */
   void reshape(int batchSize);
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
similarity index 66%
rename from paddle/gserver/layers/CudnnConvLayer.cpp
rename to paddle/gserver/layers/CudnnConvBaseLayer.cpp
index 978c2c1479c64ab2cdebaaff7394059b3d033ab6..24363bb8b09cc354c25abe512257be68566c10e1 100644
--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "CudnnConvLayer.h"
+#include "CudnnConvBaseLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
+REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer);
+REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer);
 
-REGISTER_LAYER(cudnn_conv, CudnnConvLayer);
-
-bool CudnnConvLayer::init(const LayerMap &layerMap,
-                          const ParameterMap &parameterMap) {
+bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
+                              const ParameterMap &parameterMap) {
   if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
   CHECK(useGpu_) << "CudnnConvLayer only support gpu";
 
@@ -33,7 +33,11 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,
   CHECK(config_.shared_biases());
   for (size_t i = 0; i < inputLayers_.size(); i++) {
     ProjectionConfig *conf = new ProjectionConfig();
-    conf->set_type("conv");
+    if (isDeconv_) {
+      conf->set_type("convt");
+    } else {
+      conf->set_type("conv");
+    }
     conf->set_num_filters(numFilters_);
     ConvConfig *convConf = conf->mutable_conv_conf();
     *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
@@ -47,14 +51,13 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,
   if (biases_.get() && sharedBiases_) {
     hl_create_tensor_descriptor(&biasDesc_);
     hl_create_tensor_descriptor(&outputDesc_);
-    hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1);
-    biasOffset_ = numFilters_ / groups_[0];
+    hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1);
   }
 
   return true;
 }
 
-void CudnnConvLayer::forward(PassType passType) {
+void CudnnConvBaseLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   int batchSize = getInput(0).getBatchSize();
@@ -67,37 +70,41 @@ void CudnnConvLayer::forward(PassType passType) {
   if (biases_) {
     REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
     int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+    int outH, outW;
+    if (isDeconv_) {
+      outH = imgSizeH_[0];
+      outW = imgSizeW_[0];
+    } else {
+      outH = outputH_[0];
+      outW = outputW_[0];
+    }
+
     hl_tensor_reshape(outputDesc_,
                       batchSize,
-                      numFilters_ / groups_[0],
-                      outputH_[0],
-                      outputW_[0],
-                      numFilters_ * outputH_[0] * outputW_[0],
-                      outputH_[0] * outputW_[0],
-                      outputW_[0],
+                      numFilters_,
+                      outH,
+                      outW,
+                      numFilters_ * outH * outW,
+                      outH * outW,
+                      outW,
                       1);
-    outputOffset_ = getOutputValue()->getWidth() / groups_[0];
-    for (int g = 0; g < groups_[0]; ++g) {
-      real *biasData = biases_->getW()->getData() + biasOffset_ * g;
-      real *outData = getOutputValue()->getData() + outputOffset_ * g;
-      hl_convolution_forward_add_bias(
-          biasDesc_, biasData, outputDesc_, outData);
-    }
+    real *outData = getOutputValue()->getData();
+    real *biasData = biases_->getW()->getData();
+    hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData);
   }
 
   forwardActivation();
 }
 
-void CudnnConvLayer::backward(const UpdateCallback &callback) {
+void CudnnConvBaseLayer::backward(const UpdateCallback &callback) {
   backwardActivation();
 
   if (biases_ && biases_->getWGrad()) {
     REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
-    for (int g = 0; g < groups_[0]; ++g) {
-      real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
-      real *outGrad = getOutputGrad()->getData() + outputOffset_ * g;
-      hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
-    }
+    real *biasGrad = biases_->getWGrad()->getData();
+    real *outGrad = getOutputGrad()->getData();
+    hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
+
     biases_->getParameterPtr()->incUpdate(callback);
   }
 
@@ -106,7 +113,7 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) {
   }
 }
 
-CudnnConvLayer::~CudnnConvLayer() {
+CudnnConvBaseLayer::~CudnnConvBaseLayer() {
   if (biases_) {
     hl_destroy_tensor_descriptor(biasDesc_);
     hl_destroy_tensor_descriptor(outputDesc_);
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h
similarity index 77%
rename from paddle/gserver/layers/CudnnConvLayer.h
rename to paddle/gserver/layers/CudnnConvBaseLayer.h
index b869c695bd753076c6501a1253fcad22139ccadf..93a05f94c7717f9170818b9d5ce3d27a6d18cef5 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.h
@@ -30,26 +30,24 @@ namespace paddle {
  *
  * The config file api is img_conv_layer.
  */
-class CudnnConvLayer : public ConvBaseLayer {
+class CudnnConvBaseLayer : public ConvBaseLayer {
 protected:
   std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
   std::vector<std::unique_ptr<Projection>> projections_;
 
   hl_tensor_descriptor biasDesc_;
   hl_tensor_descriptor outputDesc_;
-  int biasOffset_;
-  int outputOffset_;
 
 public:
-  explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  explicit CudnnConvBaseLayer(const LayerConfig& config)
+      : ConvBaseLayer(config) {}
 
-  ~CudnnConvLayer();
+  ~CudnnConvBaseLayer();
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
-  void addBiases();
-  void bpropBiases();
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
index 072b2f9513f4ef8aed03ecfa7a9014667bb2ce9e..f0aa22fe3af90c9233330c15fc56c3696a624446 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ b/paddle/gserver/layers/CudnnPoolLayer.h
@@ -45,7 +45,8 @@ public:
                         hl_pooling_mode_t* mode = nullptr);
   explicit CudnnPoolLayer(const LayerConfig& config);
   ~CudnnPoolLayer();
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * Reshape input and output tensor descriptor.
@@ -53,8 +54,8 @@ public:
    * So reshaping is needed.
    */
   void reshape(int batchSize);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
index d3bc97bb6cd0b8faf8ae108a0147d77854596e25..a9cf1f943c260a934564a19aecda28c24ccff43c 100644
--- a/paddle/gserver/layers/DataLayer.h
+++ b/paddle/gserver/layers/DataLayer.h
@@ -33,13 +33,13 @@ public:
   /**
    * Prefetch sparse matrix/ids only.
    */
-  void prefetch() { output_ = data_; }
+  void prefetch() override { output_ = data_; }
 
   /**
    * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
    * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
    */
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     copyDataToOutput(output_);
     if (FLAGS_show_layer_stat) {
@@ -50,9 +50,9 @@ public:
   /**
    * Data layer's backward propagation do nothing.
    */
-  virtual void backward(const UpdateCallback& callback) { (void)callback; }
+  void backward(const UpdateCallback& callback) override { (void)callback; }
 
-  virtual void copyOutputToOtherDevice() {
+  void copyOutputToOtherDevice() override {
     for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
       copyDataToOutput(outputOtherDevice_[i]);
     }
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
index b3043cffd210feaf9ddaed096de762aa7e2a6139..f0fd044e5b83430a4028a227c7d5a31b6fa86f20 100644
--- a/paddle/gserver/layers/DataNormLayer.h
+++ b/paddle/gserver/layers/DataNormLayer.h
@@ -44,10 +44,11 @@ public:
 
   ~DataNormLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   int mode_;
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index fa53e2e4cfc8a220eeb2a637d7fe759f1744f9d5..686f1fa0543cb3629ac223316e595e642a9e7d76 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -27,14 +27,14 @@ class EosIdCheckLayer : public Layer {
 public:
   explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
     return ret;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
 
     const Argument& input = getInput(0);
@@ -42,7 +42,7 @@ public:
     output_.ids->isEqualTo(*input.ids, config_.eos_id());
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(eos_id, EosIdCheckLayer);
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.cpp b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
index 9ddccc202705c024076db795a9aeda0c823e9399..fdcf994cdb47f2409b045a1337332e2f4c304fbc 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.cpp
@@ -107,6 +107,10 @@ void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
   int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
 
   resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
+
+  CHECK_EQ(image->getWidth(),
+           static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
+
   real *imgData = image->getData() + startIdx * image->getWidth();
   MatrixPtr imageTmp =
       Matrix::create(imgData,
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
index 8445642217cf3e83441ddd9beec80f99faf946bc..aabcdfc392d3e242df84c820c336d8b32c7cb04f 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -48,7 +48,8 @@ public:
 
   ~ExpandConvBaseLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   size_t getOutputSize();
   /**
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index de81a017e1bac38a5717e8c83a028f5408c0e084..60681690e5dd55b2e9aa4e1f25758db6033665a6 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -35,10 +35,11 @@ public:
 
   ~ExpandConvLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h
index 4a527d67995e255c65fea1f310551f8de5630030..00b8f241889fdd3f423d75dedd9068aa3674f190 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -34,10 +34,11 @@ public:
 
   ~ExpandConvTransLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 5c636144235cdb3800aa921464985616f8ee9203..c21b3350e2bc4b136eaf50f96799f479a13df6bd 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -53,10 +53,11 @@ public:
 
   ~ExpandLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index d023074c52167554358d0d4df7ec40cfba9da2a6..b3850f543af74abbddaac5bb0a32851f2d3297d0 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -46,10 +46,11 @@ public:
 
   ~FeatureMapExpandLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
@@ -95,6 +96,9 @@ void FeatureMapExpandLayer::forward(PassType passType) {
 
 void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
   MatrixPtr inGrad = getInputGrad(0);
+  if (NULL == inGrad) {
+    return;
+  }
   MatrixPtr outGrad = getOutputGrad();
   size_t batchSize = getInput(0).getBatchSize();
   int imgSize = inGrad->getWidth();
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
index ccd584585c97cb679332cbd10d6f3a1306ca5a54..64e7a050125aa92b414e58c7678bf87efd01103f 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ b/paddle/gserver/layers/FullyConnectedLayer.h
@@ -36,13 +36,14 @@ public:
   explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
   ~FullyConnectedLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
-  void prefetch();
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
index 930d9a056164e7c677adb53b7b67901364da1309..d3aeea921801da301b2829736059130aec14cef6 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -314,13 +314,13 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
 
   batchValue_->resizeOrCreate(*output_.value);
   batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_ && bias_->getWGrad()) {
+  if (bias_) {
     gate_.value->addBias(*(bias_->getW()), 1);
   }
 
   {
     int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
+    int curBatchSize = 0;
     AsyncGpuBlock asyncGpuBlock;
     for (int n = 0; n < numBatch; n++) {
       MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
@@ -330,16 +330,17 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
       gruValue.resetOutputValue =
           (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
 
-      batchSize = outputValueTmp->getHeight();
+      curBatchSize = outputValueTmp->getHeight();
       gruValue.prevOutValue =
-          (n == 0 ? nullptr
-                  : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
+          (n == 0
+               ? nullptr
+               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
 
       {
         if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), batchSize);
+          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
         } else {
-          GruCompute::forward<0>(gruValue, getSize(), batchSize);
+          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
         }
       }
     }
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index 25770ce57fbaa4d16c9454d824800f2f0c7f957d..58dd760eb870e9570f8a406f098f69c5fdf6477a 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -50,17 +50,18 @@ class GatedRecurrentLayer : public Layer, public GruCompute {
 public:
   explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   void forwardSequence(int batchSize,
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
index b77fdbb30e11b72b0c7de765df173204aa0b6851..4e29efd4612b18e655ba7674a3fd7890ce3f0e79 100644
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ b/paddle/gserver/layers/GetOutputLayer.cpp
@@ -22,17 +22,18 @@ public:
 
   ~GetOutputLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     if (!Layer::init(layerMap, parameterMap)) return false;
     CHECK_EQ(1U, inputLayers_.size());
     CHECK_NE(inputArgument_[0], "");
     return true;
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     output_ = getPrev(0)->getOutput(inputArgument_[0]);
   }
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 REGISTER_LAYER(get_output, GetOutputLayer);
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index 4a1006aa941f396c233a0cecfc38228f1f9fafe1..5b5cb25f9269a30f79d602b342411d0e6bfa429b 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -55,10 +55,11 @@ public:
 
   ~GruStepLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(gru_step, GruStepLayer);
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 70da3ac126e147387b20c5a97d0116a5a679e044..9afd40b1674680da962d6e51caa56b46279b70de 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -36,7 +36,7 @@ namespace paddle {
  * |   |- 5
  * |
  * |-*- 0
- * |- 1
+ *   |- 1
  * @endcode
  *
  * where * indicates an internal node, and each leaf node represents a class.
@@ -61,9 +61,10 @@ class HierarchicalSigmoidLayer : public Layer {
 public:
   explicit HierarchicalSigmoidLayer(const LayerConfig& config)
       : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index 44fe1fb1fea4203a4a1cac67c581b13adda65966..eac7428571980baf6b2ddb8b2cc85b9c98afa5d6 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -43,10 +43,11 @@ public:
 
   ~InterpolationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(interpolation, InterpolationLayer);
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index c47943f81c01589eada4b825d54be5c69314b6fa..125aaf947f3c9d976b117667d1d1b7700a029cc6 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Error.h"
 #include "paddle/utils/Logging.h"
 
 #include "AddtoLayer.h"
@@ -334,7 +335,8 @@ void Layer::showOutputStats() {
 
 void Layer::forwardActivation() {
   /* activation */
-  activation_->forward(output_);
+  auto status = activation_->forward(output_);
+  status.check();
 
   /* dropout */
   if (config_.drop_rate() > 0) {
@@ -372,14 +374,14 @@ void Layer::backwardActivation() {
     oGrad->dotMul(*oGrad, *dropOutMask_);
   }
 
-  activation_->backward(output_);
+  auto status = activation_->backward(output_);
+  status.check();
 }
 
 void Layer::forwardDropOut() {
   auto& outV = getOutputValue();
 
-  if (passType_ == PASS_TRAIN || passType_ == PASS_METRIC_TRAIN ||
-      passType_ == PASS_METRIC_TRAIN_WITH_NOERROR) {
+  if (passType_ == PASS_TRAIN) {
     // new dropOutMask_ if dropOutMask_ is null ptr
     Matrix::resizeOrCreate(dropOutMask_,
                            outV->getHeight(),
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 6dfd48fb96618102b71e9f6de79a348dc7f62647..0ed482889d0cea884db3759620088575c5b10201 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -14,20 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <paddle/parameter/Argument.h>
 #include <functional>
 #include <memory>
 #include "ModelConfig.pb.h"
 #include "paddle/function/Function.h"
+#include "paddle/gserver/activations/ActivationFunction.h"
 #include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/parameter/Argument.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/Weight.h"
 #include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Util.h"
 
-#include <paddle/parameter/ParallelParameter.h>
-#include <paddle/parameter/Weight.h>
-#include "paddle/gserver/activations/ActivationFunction.h"
-
 /// Macro for registering a layer type.
 /// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
 #define REGISTER_LAYER(__type_name, __class_name) \
@@ -108,9 +106,9 @@ protected:
 
 public:
   /**
-    * Wait until all input value ready.
-    * Called before Layer::forward() function.
-    */
+   * Wait until all input value ready.
+   * Called before Layer::forward() function.
+   */
   virtual void waitInputValue();
 
   /**
@@ -120,9 +118,9 @@ public:
   virtual void copyOutputToOtherDevice();
 
   /**
-    * Wait until all output grad ready and merge them to output_.grad.
-    * Called before Layer::backward() function.
-    */
+   * Wait until all output grad ready and merge them to output_.grad.
+   * Called before Layer::backward() function.
+   */
   virtual void waitAndMergeOutputGrad();
 
   /**
@@ -311,6 +309,7 @@ public:
         return *output->second;
       } else {
         LOG(FATAL) << "No specific output " << str;
+        return *((Argument*)nullptr);
       }
     }
   }
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index b7f748f3bb8a419429956724131e81dfdbd274c6..dc3dc156792bdf32c3b948a292597d0e9eca5d8b 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -17,18 +17,12 @@ limitations under the License. */
 
 namespace paddle {
 
-LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad)
+LinearChainCRF::LinearChainCRF(int numClasses, real* para)
     : numClasses_(numClasses) {
   a_ = Matrix::create(para, 1, numClasses_);
   b_ = Matrix::create(para + numClasses_, 1, numClasses_);
   w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_);
 
-  if (grad) {
-    da_ = Matrix::create(grad, 1, numClasses_);
-    db_ = Matrix::create(grad + numClasses_, 1, numClasses_);
-    dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_);
-  }
-
   ones_ = Matrix::create(1, numClasses_);
   ones_->one();
 
@@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
   return -ll;
 }
 
-void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
+void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) {
   MatrixPtr matX = Matrix::create(x, length, numClasses_);
-  MatrixPtr matDX = Matrix::create(dx, length, numClasses_);
-  MatrixPtr matGrad = Matrix::create(length, numClasses_);
+  Matrix::resizeOrCreate(matGrad_, length, numClasses_);
   Matrix::resizeOrCreate(beta_, length, numClasses_);
   real* b = b_->getData();
-  real* dw = dw_ ? dw_->getData() : nullptr;
+  if (needWGrad) {
+    Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_);
+    matWGrad_->zeroMem();
+    da_ = matWGrad_->subRowMatrix(0, 1);
+    db_ = matWGrad_->subRowMatrix(1, 2);
+    dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2);
+  }
 
   real* alpha = alpha_->getData();
   real* beta = beta_->getData();
   real* expW = expW_->getData();
   real* expX = expX_->getData();
-  real* grad = matGrad->getData();
+  real* grad = matGrad_->getData();
 
   for (int i = 0; i < numClasses_; ++i) {
     beta[(length - 1) * numClasses_ + i] = exp(b[i]);
@@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) {
     normalizeL1(beta + k * numClasses_, numClasses_);
   }
 
-  matGrad->dotMul(*alpha_, *beta_);
-  matGrad->rowNormalizeL1(*matGrad);
+  matGrad_->dotMul(*alpha_, *beta_);
+  matGrad_->rowNormalizeL1(*matGrad_);
   for (int k = 0; k < length; ++k) {
     grad[k * numClasses_ + s[k]] -= (real)1;
   }
-  matDX->add(*matGrad);
-  if (da_) {
-    da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
-  }
-  if (db_) {
-    db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1));
-  }
 
-  beta_->dotMul(*beta_, *expX_);
-  beta_->rowNormalizeL1(*beta_);
+  if (needWGrad) {
+    da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1));
+    db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1));
 
-  for (int k = 1; dw && k < length; ++k) {
-    real sum = 0;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
-               beta[k * numClasses_ + j];
+    beta_->dotMul(*beta_, *expX_);
+    beta_->rowNormalizeL1(*beta_);
+
+    real* dw = dw_->getData();
+    for (int k = 1; k < length; ++k) {
+      real sum = 0;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] *
+                 beta[k * numClasses_ + j];
+        }
       }
-    }
-    sum = 1 / sum;
-    for (int i = 0; i < numClasses_; ++i) {
-      for (int j = 0; j < numClasses_; ++j) {
-        dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
-                                   alpha[(k - 1) * numClasses_ + i] *
-                                   beta[k * numClasses_ + j];
+      sum = 1 / sum;
+      for (int i = 0; i < numClasses_; ++i) {
+        for (int j = 0; j < numClasses_; ++j) {
+          dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] *
+                                     alpha[(k - 1) * numClasses_ + i] *
+                                     beta[k * numClasses_ + j];
+        }
       }
+      dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
     }
-    dw[s[k - 1] * numClasses_ + s[k]] -= (real)1;
   }
 }
 
diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h
index a905bf803dd5443ef8d4ad7702720a50a5220a9a..8daf1e14a6fa98bef41f4f32bff439df8302adfd 100644
--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -21,7 +21,7 @@ namespace paddle {
 class LinearChainCRF {
 public:
   /**
-   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The size of para must be \f$(numClasses + 2) * numClasses\f$.
    * The first numClasses values of para are for starting weights (\f$a\f$).
    * The next numClasses values of para are for ending weights (\f$b\f$),
    * The remaning values are for transition weights (\f$w\f$).
@@ -34,7 +34,7 @@ public:
    * all possible
    * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
    */
-  LinearChainCRF(int numClasses, real* para, real* grad);
+  LinearChainCRF(int numClasses, real* para);
 
   /**
    * Calculate the negative log likelihood of s given x.
@@ -45,29 +45,45 @@ public:
 
   /**
    * Calculate the gradient with respect to x, a, b, and w.
-   * The gradient of x will be stored in dx.
    * backward() can only be called after a corresponding call to forward() with
    * the same x, s and length.
-   * @note The gradient is added to dx and grad (provided at constructor).
+   * The gradient with respect to a, b, and w will not be calculated if
+   * needWGrad is false.
+   * @note Please call getWGrad() and getXGrad() to get the gradient with
+   * respect to (a, b, w) and x respectively.
    */
-  void backward(real* x, real* dx, int* s, int length);
+  void backward(real* x, int* s, int length, bool needWGrad);
 
   /**
    * Find the most probable sequence given x. The result will be stored in s.
    */
   void decode(real* x, int* s, int length);
 
+  /**
+   * Return the gradient with respect to (a, b, w). It is only computed by a
+   * call to backward() with needWGrad set to true.
+   */
+  MatrixPtr getWGrad() { return matWGrad_; }
+
+  /**
+   * Return the gradient with respect to x. It can only be called after a
+   * corresponding call to backward().
+   */
+  MatrixPtr getXGrad() { return matGrad_; }
+
 protected:
   int numClasses_;
   MatrixPtr a_;
   MatrixPtr b_;
   MatrixPtr w_;
+  MatrixPtr matWGrad_;
   MatrixPtr da_;
   MatrixPtr db_;
   MatrixPtr dw_;
   MatrixPtr ones_;
 
   MatrixPtr expX_;
+  MatrixPtr matGrad_;
   MatrixPtr alpha_;
   MatrixPtr beta_;
   MatrixPtr maxX_;
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index f49df2c412f05f74da455d41cdf7c9bd4b9ec2e2..c45a52d2e9aaf41a8e02495cc2deae60ab13650a 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -74,17 +74,18 @@ class LstmLayer : public Layer, public LstmCompute {
 public:
   explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
 
-  bool init(const LayerMap &layerMap, const ParameterMap &parameterMap);
+  bool init(const LayerMap &layerMap,
+            const ParameterMap &parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback &callback);
+  void backward(const UpdateCallback &callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
index 5fc6474b8653f4c7dac284e11d88f803405169a3..568277a90c62c73a811dcbf66782a4bdc4021b81 100644
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ b/paddle/gserver/layers/LstmStepLayer.cpp
@@ -35,10 +35,11 @@ public:
 
   ~LstmStepLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(lstm_step, LstmStepLayer);
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index fb41af563195496a57eafcc52b49eadac697fa0a..be0f2a07d4aae253b7b18dbe406c4b94bf96bc8e 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -181,11 +181,12 @@ class MDLstmLayer : public LstmLayer {
 public:
   explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   void forwardOneSequence(int start, CoordIterator& coordIter);
@@ -506,9 +507,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
           *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
     }
   }
-  activationGate_->forward(frameInputGate_[idxCurr]);
-  activationGate_->forward(frameForgetGate_[idxCurr]);
-  activation_->forward(frameInputNode_[idxCurr]);
+  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
+  status.check();
+  status = activationGate_->forward(frameForgetGate_[idxCurr]);
+  status.check();
+  status = activation_->forward(frameInputNode_[idxCurr]);
+  status.check();
 
   frameState_[idxCurr].value->zeroMem();
   for (int i = 0; i < numDims_; i++) {
@@ -530,10 +534,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
 
   frameOutputGate_[idxCurr].value->addDotMul(
       *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  activationGate_->forward(frameOutputGate_[idxCurr]);
+  status = activationGate_->forward(frameOutputGate_[idxCurr]);
+  status.check();
 
   framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  activationState_->forward(framePreOutput_[idxCurr]);
+  status = activationState_->forward(framePreOutput_[idxCurr]);
+  status.check();
 
   frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
                                       *frameOutputGate_[idxCurr].value);
@@ -640,12 +646,12 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
 
   framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                         *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]);
+  activationState_->backward(framePreOutput_[idxCurr]).check();
   frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
 
   frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                          *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]);
+  activationGate_->backward(frameOutputGate_[idxCurr]).check();
 
   frameState_[idxCurr].grad->addDotMul(
       *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
@@ -702,9 +708,9 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
     }
   }
 
-  activationGate_->backward(frameInputGate_[idxCurr]);
-  activationGate_->backward(frameForgetGate_[idxCurr]);
-  activation_->backward(frameInputNode_[idxCurr]);
+  activationGate_->backward(frameInputGate_[idxCurr]).check();
+  activationGate_->backward(frameForgetGate_[idxCurr]).check();
+  activation_->backward(frameInputNode_[idxCurr]).check();
 
   if (bias_->getWGrad()) {
     for (int i = 0; i < numDims_; i++) {
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
index 80555f3f7b324100c059c3356a4a2e462bc6face..9e72b167cd963ae4928bf85503214dd7cee31148 100644
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ b/paddle/gserver/layers/MaxIdLayer.cpp
@@ -30,8 +30,8 @@ private:
 public:
   explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
 
@@ -40,7 +40,7 @@ public:
     return ret;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     const Argument& input = getInput(0);
     size_t batchSize = input.getBatchSize();
@@ -54,7 +54,7 @@ public:
     input.value->rowMax(*output_.ids, *output_.in);
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(maxid, MaxIdLayer);
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index 472ee0ccca196250f4b81fc1e921aaee5f352b7e..baa58ca2d7a6970f0d2f3ef6f8609404c82efa30 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -42,14 +42,13 @@ protected:
 public:
   explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
 
-  ~MaxLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     return SequencePoolLayer::init(layerMap, parameterMap);
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
index 59c2245e0d6490d4f8e1b77b1c88267747aaa63a..73fd8536be56b2c620fbfdea1937f3acd593bf05 100644
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -45,10 +45,11 @@ public:
   explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
   virtual ~MaxOutLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index 9655a152c7bc96fb3941fcbd9db4ff71a59e4ebe..755c9deb8b1be34b6f44a7b30b107f99102a3853 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -35,21 +35,22 @@ public:
 
   ~MixedLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void prefetch();
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
-  virtual void resetState();
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void resetState() override;
   /**
    * setState() should be called after getState().
    * Argument state consists of all projections states.
    */
-  virtual void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
   /**
    * Return state which consists of all projections states.
    */
-  virtual LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   std::vector<std::unique_ptr<Projection>> projections_;
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index d09720c5255747df11d4d7367f67a245e63e6846..297972b3cd9e4dfba94e2597053ab7c7c560c9dd 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -69,10 +69,11 @@ public:
 
   ~MultiplexLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 private:
   /**
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index 5ab765247f63dfe6e6651ca4d27dc7183a9f33e1..0bc2ef11829337d9b765ef00066289494eb984b3 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -61,7 +61,8 @@ public:
         rand_(0, config.num_classes() - 1),
         prepared_(false) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     /* Initialize the basic parent class */
     Layer::init(layerMap, parameterMap);
 
@@ -146,7 +147,7 @@ public:
     prepared_ = true;
   }
 
-  void prefetch() {
+  void prefetch() override {
     prepareSamples();
     IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
     int* ids = labelIds_->getData();
@@ -163,7 +164,7 @@ public:
     }
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
 
     CHECK(!useGpu_) << "GPU is not supported";
@@ -193,12 +194,13 @@ public:
       forwardOneInput(l);
     }
 
-    activation_->forward(sampleOut_);
+    auto status = activation_->forward(sampleOut_);
+    status.check();
 
     forwardCost();
   }
 
-  void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     Matrix::resizeOrCreate(sampleOut_.grad,
                            1,
                            samples_.size(),
@@ -207,7 +209,8 @@ public:
 
     backwardCost();
 
-    activation_->backward(sampleOut_);
+    auto status = activation_->backward(sampleOut_);
+    status.check();
 
     if (biases_->getWGrad()) {
       backwardBias(callback);
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index 3db0af2515ee9f64aa6c0b0a441e88562d9e398e..e094078bfe86e30c06e1b80ebc04c8213fe9abcf 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
     return new ResponseNormLayer(config);
   } else if (norm == "cmrnorm-projection") {
     return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
   } else {
     LOG(FATAL) << "Unknown norm type: " << norm;
     return nullptr;
@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   return true;
 }
 
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);  // parameter 0 backs scale_ below
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));  // one per channel
+  return true;
+}
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index 011bab8fdedab00b336290a245b82de07496b554..7c238ac944e52c3a83c2aa5deac18de3aff6db61 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -30,7 +30,8 @@ class NormLayer : public Layer {
 public:
   explicit NormLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     Layer::init(layerMap, parameterMap);
     return true;
   }
@@ -56,11 +57,46 @@ protected:
 public:
   explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType) { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
+  void backward(const UpdateCallback& callback = nullptr) override {
     LOG(FATAL) << "Not implemented";
   }
 };
 
+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimension equals the number of channels.
+ * - Input: One and only one input layer is accepted.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+class CrossChannelNormLayer : public NormLayer {
+public:
+  explicit CrossChannelNormLayer(const LayerConfig& config)
+      : NormLayer(config) {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
+  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+
+protected:
+  // Channel count, read from the input's norm_conf in init().
+  size_t channels_;
+  // Trainable per-channel scale factors, created in init().
+  std::unique_ptr<Weight> scale_;
+  MatrixPtr scaleDiff_;
+  MatrixPtr normBuffer_;
+  MatrixPtr dataBuffer_;
+  MatrixPtr channelBuffer_;
+  MatrixPtr spatialBuffer_;
+  MatrixPtr sampleBuffer_;
+};
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 262d757c67e105a8d65619eed91de65d34cfe35e..4331009de7e98d2326049e563e46a55a20366507 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -59,7 +59,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
 
 void CMRProjectionNormLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   /* note: one sample correspond to one row */
   MatrixPtr input = inputLayers_[0]->getOutputValue();
@@ -67,34 +66,36 @@ void CMRProjectionNormLayer::forward(PassType passType) {
   int size = getSize();
   resetOutput(batchSize, size);
 
-  MatrixPtr outV = getOutputValue();
-
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
 
-  dims_ = {batchSize, channels_, imgSizeH_, imgSizeW_};
-  forward_[0]->calc(
-      {Tensor(input->getData(), dims_)},
-      {Tensor(outV->getData(), dims_), Tensor(denoms_->getData(), dims_)},
-      {});
+  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+
+  // prepare forward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
+
+  forward_[0]->calc(inputs, outputs);
 }
 
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
-  if (NULL == inputLayers_[0]->getOutputGrad()) {
+  if (NULL == getInputGrad(0)) {
     return;
   }
-  /* Do derivation */
-  MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad();
-  MatrixPtr localGrad = getOutputGrad();
-  MatrixPtr localOutV = getOutputValue();
-  MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
-
-  backward_[0]->calc({Tensor(preOutV->getData(), dims_),
-                      Tensor(localOutV->getData(), dims_),
-                      Tensor(localGrad->getData(), dims_),
-                      Tensor(denoms_->getData(), dims_)},
-                     {Tensor(preOutGrad->getData(), dims_)},
-                     {});
+
+  // prepare backward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
+  inputs.addArg(*getOutputGrad(), shape_);
+  inputs.addArg(*denoms_, shape_);
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(inputs, outputs);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 6b2c5dde0d74db4b292d5006d19ce54d3194017e..2997ae8848c438fa13037ccf03c1faca9ad73224 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -36,11 +36,12 @@ public:
 
   size_t getSize();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  Dims dims_;
+  TensorShape shape_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index b606e4436567eb2a8df9fd501a2af8c8aa1d2fdf..283fdb003a2bb9474eac7a379ceb2c02027cfc5f 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -38,10 +38,11 @@ public:
 
   ~OuterProdLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(out_prod, OuterProdLayer);
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a5ed7e057aea8f065ee752f8c0f0d2d9bdddfc8b
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pad, PadLayer);
+
+bool PadLayer::init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  CHECK_EQ(config_.inputs_size(), 1);  // must hold before inputs(0) is read
+  auto& pad_conf = config_.inputs(0).pad_conf();
+  auto& img_conf = pad_conf.image_conf();
+  inDims_ = TensorShape(
+      {0,
+       img_conf.channels(),
+       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
+       img_conf.img_size()});
+
+  CHECK_EQ(2, pad_conf.pad_c_size());
+  CHECK_EQ(2, pad_conf.pad_h_size());
+  CHECK_EQ(2, pad_conf.pad_w_size());
+  padc_ = {pad_conf.pad_c(0), pad_conf.pad_c(1)};
+  padh_ = {pad_conf.pad_h(0), pad_conf.pad_h(1)};
+  padw_ = {pad_conf.pad_w(0), pad_conf.pad_w(1)};
+  // Batch dim is 0 for now; setTensorDim() sets it on each forward().
+  outDims_ = TensorShape(4);
+  setOutDims(0);
+
+  createFunction(forward_,
+                 "Pad",
+                 FuncConfig()
+                     .set("channel", padc_)
+                     .set("height", padh_)
+                     .set("width", padw_));
+  createFunction(backward_,
+                 "PadGrad",
+                 FuncConfig()
+                     .set("channel", padc_)
+                     .set("height", padh_)
+                     .set("width", padw_));
+
+  return true;
+}
+
+void PadLayer::setOutDims(const size_t batchSize) {
+  outDims_.reshape({batchSize,
+                    inDims_[1] + padc_[0] + padc_[1],  // channels + padding
+                    inDims_[2] + padh_[0] + padh_[1],  // height + padding
+                    inDims_[3] + padw_[0] + padw_[1]});  // width + padding
+}
+
+void PadLayer::setTensorDim(const size_t batchSize) {
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);  // 0 keeps the height configured in init()
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);  // 0 keeps the width configured in init()
+  setOutDims(batchSize);
+}
+
+void PadLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  setTensorDim(batchSize);
+  // Each output row holds one padded sample: channels * height * width.
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(batchSize, size);
+  REGISTER_TIMER_INFO("PadForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+void PadLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+  if (!getInputGrad(0)) return;  // input has no gradient buffer to fill
+  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe9388d8cc260ed599af0113361f4687f3f4a18b
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer pads zeros to the input along the specified dimensions.
+ *         The input and output are 4D tensors. Zeros are padded on the 2nd to
+ *         the 4th dimensions according to padc_, padh_ and padw_.
+ */
+class PadLayer : public Layer {
+public:
+  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~PadLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  void setOutDims(const size_t batchSize);
+  void setTensorDim(const size_t batchSize);
+
+  std::vector<uint32_t> padc_;  // two pad amounts for dim 1 (channels)
+  std::vector<uint32_t> padh_;  // two pad amounts for dim 2 (height)
+  std::vector<uint32_t> padw_;  // two pad amounts for dim 3 (width)
+  TensorShape inDims_;          // input shape {batch, channel, height, width}
+  TensorShape outDims_;         // inDims_ plus the padding on dims 1..3
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
index a82497fc01ca1f63719a905c7545911a7e05289b..9a11b81ebf1f5c06355fc107b00aa69b65148ed5 100644
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ b/paddle/gserver/layers/ParameterReluLayer.h
@@ -56,9 +56,10 @@ public:
 
   ~ParameterReluLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index 318b89d7c2bce896d183eba8c48c230d962918a5..d43292ad2d4bbe1229ca59ca21bee92c9ec006a3 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -46,7 +46,8 @@ public:
    */
   static Layer* create(const LayerConfig& config);
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
index 3dc6af2f0e9fb1a12eca7bc0c531a2e7b151fb8a..e31116de8ccb1f6b847c9fff47961bedfad1a79c 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ b/paddle/gserver/layers/PoolProjectionLayer.h
@@ -40,7 +40,7 @@ public:
 
   size_t getSize();
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index 64fecab5b08354ceea8b290b78eede72d24a98a2..31c34b43e2995a2bf7f4d16629a8172a7e76c8e1 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -40,10 +40,11 @@ public:
 
   ~PowerLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(power, PowerLayer);
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index ac7f658864fee6812ea89d1dbd84ad4db94e3035..de198af111be4200dd1b240f6de9464e3f43b06d 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -19,38 +19,17 @@ namespace paddle {
 class PrintLayer : public Layer {
 public:
   explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {}
-};
 
-void PrintLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const auto& argu = getInput(i);
-    const std::string& name = inputLayers_[i]->getName();
-    if (argu.value) {
-      std::ostringstream os;
-      argu.value->print(os);
-      LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-    }
-    if (argu.ids) {
-      std::ostringstream os;
-      argu.ids->print(os, argu.ids->getSize());
-      LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-    }
-    if (auto startPos = argu.sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-    }
-    if (auto subStartPos = argu.subSequenceStartPositions) {
-      std::ostringstream os;
-      subStartPos->getVector(false)->print(os, subStartPos->getSize());
-      LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                << os.str();
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    for (size_t i = 0; i != inputLayers_.size(); ++i) {
+      getInput(i).printValueString(LOG(INFO),
+                                   "layer=" + inputLayers_[i]->getName() + " ");
     }
   }
-}
+
+  void backward(const UpdateCallback& callback) override {}
+};
 
 REGISTER_LAYER(print, PrintLayer);
 
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index 36ace7597cd66cc2d83353ec999a75c79dd1e33e..331bc7672ec0d39a7317c39f1d14e8dcadea471a 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
  * @brief A layer for generating priorbox locations and variances.
  * - Input: Two and only two input layer are accepted. The input layer must be
- *        be a data output layer and a convolution output layer.
+ *          a data output layer and a convolution output layer.
  * - Output: The priorbox locations and variances of the input data.
  * Reference:
  *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
@@ -30,10 +30,11 @@ namespace paddle {
 class PriorBoxLayer : public Layer {
 public:
   explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
 
 protected:
   int numPriors_;
@@ -44,27 +45,32 @@ protected:
   MatrixPtr buffer_;
 };
 
+REGISTER_LAYER(priorbox, PriorBoxLayer);
+
 bool PriorBoxLayer::init(const LayerMap& layerMap,
                          const ParameterMap& parameterMap) {
   Layer::init(layerMap, parameterMap);
   auto pbConf = config_.inputs(0).priorbox_conf();
+  std::vector<real> tmp;
+  aspectRatio_.push_back(1.);
   std::copy(pbConf.min_size().begin(),
             pbConf.min_size().end(),
             std::back_inserter(minSize_));
   std::copy(pbConf.max_size().begin(),
             pbConf.max_size().end(),
             std::back_inserter(maxSize_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(aspectRatio_));
   std::copy(pbConf.variance().begin(),
             pbConf.variance().end(),
             std::back_inserter(variance_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(tmp));
   // flip
-  int inputRatioLength = aspectRatio_.size();
-  for (int index = 0; index < inputRatioLength; index++)
-    aspectRatio_.push_back(1 / aspectRatio_[index]);
-  aspectRatio_.push_back(1.);
+  int inputRatioLength = tmp.size();
+  for (int index = 0; index < inputRatioLength; index++) {
+    aspectRatio_.push_back(tmp[index]);
+    aspectRatio_.push_back(1 / tmp[index]);
+  }
   numPriors_ = aspectRatio_.size();
   if (maxSize_.size() > 0) numPriors_++;
   return true;
@@ -93,12 +99,12 @@ void PriorBoxLayer::forward(PassType passType) {
     for (int w = 0; w < layerWidth; ++w) {
       real centerX = (w + 0.5) * stepW;
       real centerY = (h + 0.5) * stepH;
-      int minSize = 0;
+      real minSize = 0;
       for (size_t s = 0; s < minSize_.size(); s++) {
         // first prior.
         minSize = minSize_[s];
-        int boxWidth = minSize;
-        int boxHeight = minSize;
+        real boxWidth = minSize;
+        real boxHeight = minSize;
         // xmin, ymin, xmax, ymax.
         tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
         tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
@@ -111,7 +117,7 @@ void PriorBoxLayer::forward(PassType passType) {
           CHECK_EQ(minSize_.size(), maxSize_.size());
           // second prior.
           for (size_t s = 0; s < maxSize_.size(); s++) {
-            int maxSize = maxSize_[s];
+            real maxSize = maxSize_[s];
             boxWidth = boxHeight = sqrt(minSize * maxSize);
             tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
             tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
@@ -144,6 +150,5 @@ void PriorBoxLayer::forward(PassType passType) {
   MatrixPtr outV = getOutputValue();
   outV->copyFrom(buffer_->data_, dim * 2);
 }
-REGISTER_LAYER(priorbox, PriorBoxLayer);
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 55e0fdfb9048c02b2dcd474c6887eee180328260..e4c2b483d2fa4032735858dab17647592791a9c7 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -45,17 +45,18 @@ class RecurrentLayer : public Layer {
 public:
   explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   /**
@@ -217,21 +218,22 @@ void RecurrentLayer::forwardOneSequence(int start, int length) {
     if (prevOutput_) {
       frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
     }
-    activation_->forward(frameOutput_[start]);
+    activation_->forward(frameOutput_[start]).check();
+
     for (int i = 1; i < length; ++i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
     if (prevOutput_) {
       prevOutput_->assign(*frameOutput_[start + length - 1].value);
     }
   } else {
-    activation_->forward(frameOutput_[start + length - 1]);
+    activation_->forward(frameOutput_[start + length - 1]).check();
     for (int i = length - 2; i >= 0; --i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
   }
 }
@@ -280,11 +282,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
   MatrixPtr weightT = weight_->getW()->getTranspose();
   if (!reversed_) {
     for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i - 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start]);
+    activation_->backward(frameOutput_[start]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start, length - 1)->getTranspose(),
@@ -294,11 +296,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
     }
   } else {
     for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i + 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start + length - 1]);
+    activation_->backward(frameOutput_[start + length - 1]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
@@ -333,7 +335,7 @@ void RecurrentLayer::forwardBatch(int batchSize,
       }
       Argument arg;
       arg.value = batch2;
-      activation_->forward(arg);
+      activation_->forward(arg).check();
     }
   }
   batchValue_->copyBackSeq(*output_.value);
@@ -363,7 +365,7 @@ void RecurrentLayer::backwardBatch(int batchSize,
       Argument arg;
       arg.value = batch1;
       arg.grad = batch2;
-      activation_->backward(arg);
+      activation_->backward(arg).check();
 
       if (n != 0) {
         batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index af8dd61d84e2e53ca26dc054d0516e62ab7aa216..78a74ff19a38cd205f3a46900bf716e2e1b1e4d5 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -33,15 +33,15 @@ public:
   void initSubNetwork(NeuralNetwork* rootNetwork,
                       const ModelConfig& config,
                       const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu);
+                      bool useGpu) override;
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
     const std::vector<Argument> inArgs;
     std::vector<Argument> outArgs;
     network_->forward(inArgs, &outArgs, passType);
   }
-  void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
     network_->backward(nullptr);
 
@@ -53,7 +53,8 @@ public:
   /**
    * @see Layer.accessSubNetwork
    */
-  void accessSubNetwork(const std::function<void(NeuralNetwork&)>& callback) {
+  void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) override {
     callback(*network_);
   }
 
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index 7fcb3adea01b9d16394ee90b751b10902dc3a190..eb3b63c106901f89dd75cc2a495477b240d40e3c 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -20,18 +20,19 @@ namespace paddle {
 /**
  * @brief A layer for resizing a minibatch matrix h*w to h'*w'
  * @note
- * origin matrix height * witdth)
+ * origin matrix: height * width
  * resize matrix: (height * width / size) * size
  */
 class ResizeLayer : public Layer {
 public:
   explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 };
 
 REGISTER_LAYER(resize, ResizeLayer);
diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/gserver/layers/RotateLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c71088d786ab218bf0f71b577985c023dd1436f
--- /dev/null
+++ b/paddle/gserver/layers/RotateLayer.cpp
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RotateLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(rotate, RotateLayer);
+
+bool RotateLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  // Fail fast if the base layer cannot initialize, then read the map shape.
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  height_ = config_.height();
+  width_ = config_.width();
+  CHECK_GT(height_, 0);
+  CHECK_GT(width_, 0);
+  return true;
+}
+
+void RotateLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  // Each input row stacks channels_ maps of height_ * width_ values; derive
+  // the channel count from the row width and insist that it is integral.
+  MatrixPtr inV = getInputValue(0);
+  batchSize_ = inV->getHeight();
+  size_ = inV->getWidth();
+  CHECK_GE(size_, height_ * width_);
+  CHECK_EQ(size_ % (height_ * width_), 0)
+      << "total size_ is not dividable by (height_ * width_), i.e., "
+      << "channel number should be an integer";
+  channels_ = size_ / (height_ * width_);
+
+  // The output is resized to batchSize_ x size_, matching the input.
+  resizeOutput(batchSize_, size_);
+  MatrixPtr outV = getOutputValue();
+
+  // Rotate every (sample, channel) map clockwise into the same offset of the
+  // output; the destination is created with height and width swapped.
+  for (int n = 0; n < batchSize_; ++n) {
+    for (int ch = 0; ch < channels_; ++ch) {
+      // Source map, height_ x width_.
+      auto* inData = inV->getData() + n * size_ + ch * height_ * width_;
+      MatrixPtr src = Matrix::create(inData, height_, width_, false, useGpu_);
+      // Destination map, width_ x height_.
+      auto* outData = outV->getData() + n * size_ + ch * height_ * width_;
+      MatrixPtr dst = Matrix::create(outData, width_, height_, false, useGpu_);
+      src->rotate(dst, false, true /* clock-wise */);
+    }
+  }
+
+  // Call zeroGrad() only when the input has a gradient buffer.
+  if (getInputGrad(0)) {
+    zeroGrad();
+  }
+}
+
+void RotateLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr outputGrad = getOutputGrad();
+  MatrixPtr preGrad = getInputGrad(0);
+  // nothing to back-propagate unless both gradient buffers exist
+  if (outputGrad == nullptr || preGrad == nullptr) {
+    return;
+  }
+  // the grad should be rotated in the reverse direction
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      MatrixPtr inputSampleGrad =
+          Matrix::create(preGrad->getData() + b * size_ + c * height_ * width_,
+                         height_,
+                         width_,
+                         false,
+                         useGpu_);
+      MatrixPtr outputSampleGrad = Matrix::create(
+          outputGrad->getData() + b * size_ + c * height_ * width_,
+          width_,
+          height_,
+          false,
+          useGpu_);
+      MatrixPtr tmpGrad = nullptr;
+      outputSampleGrad->rotate(tmpGrad, true, false /* anti clock-wise */);
+      inputSampleGrad->add(*tmpGrad);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..d05c2065cb1cb81452c54ee1858c34cb46e6c7f6
--- /dev/null
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
+ * domain
+ * Each channel is rotated 90 degrees clockwise
+ * \f[
+ *   y(j,i,:) = x(M-i-1,j,:)
+ * \f]
+ * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
+ *
+ * The config file api is rotate_layer
+ *
+ */
+
+class RotateLayer : public Layer {
+public:
+  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  int batchSize_;  // input matrix height, set in forward()
+  int size_;       // input matrix width, set in forward()
+  int height_;     // per-channel map height, read from config in init()
+  int width_;      // per-channel map width, read from config in init()
+  int channels_;   // size_ / (height_ * width_), set in forward()
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 59ff5d41b529099277375cd5e1b498f3331c3b0a..2538d99bb71fa1ce6546730b817a49347fe3c5d8 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -35,8 +35,8 @@ public:
   explicit SamplingIdLayer(const LayerConfig& config)
       : Layer(config), rand1_(0, 1) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
     if (useGpu_) {
@@ -48,7 +48,7 @@ public:
     return ret;
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
@@ -83,7 +83,7 @@ public:
     output_.ids->copyFrom(ids.data(), batchSize);
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(sampling_id, SamplingIdLayer);
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 7f0084be6b57f5ce8245609e64c744c1a049a925..a38ee0857a767981eb24e79e96bf6115e9c63720 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -37,10 +37,11 @@ public:
 
   ~ScalingLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(scaling, ScalingLayer);
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 5eacff6b7143996130bea64766ef42c66f4c7310..d9a91de8a6f4daf514f089a3d63cb519223bfdd0 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -192,7 +192,8 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) {
                                nnz,
                                /*trans=*/false,
                                /*useGpu=*/useGpu_);
-    activation_->forward(arg);
+    //! TODO(yuyang18): Why can't we invoke forwardActivation here?
+    activation_->forward(arg).check();
   } else /* train and test in train, not generating */ {
     // during training, this layer output value is *Matrix*, which is input of
     // eg. multi-class-cross-entropy
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
index bdf9a4652cc71710d1d33e8b085c5aec28f6f806..99126fdba542bd142341039af27c3af72b391ca7 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -65,9 +65,10 @@ public:
       : Layer(config), selCols_(nullptr) {}
 
   ~SelectiveFullyConnectedLayer() {}
-  void prefetch();
+  void prefetch() override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
@@ -90,8 +91,8 @@ public:
   void fillSelectiveData(
       const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 private:
   /**
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index 069bc26e602ff7d925b4115d12388b6716676b29..4b24d8f0c852e1bdc887d4ee1465b9ad05d210bb 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -21,9 +21,11 @@ namespace paddle {
 
 /**
  * A layer for concatenating the first sequence with the second sequence
- * following the first
- * Input: two sequences each containing some instances
+ * Input: two sequences each containing the same number of instances
+ *        seq1 = [a1, a2, ..., an]
+ *        seq2 = [b1, b2, ..., bn]
  * Output: a concatenated sequence of the two input sequences
+ *        out = [a1, b1, a2, b2, ..., an, bn]
  */
 
 class SequenceConcatLayer : public Layer {
@@ -35,10 +37,11 @@ public:
 
   ~SequenceConcatLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqconcat, SequenceConcatLayer);
@@ -167,13 +170,17 @@ void SequenceConcatLayer::backward(const UpdateCallback& callback) {
     size_t rightNumIns = 0;
     for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
       leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      inputGrad1->subMatrix(starts1[seqId], leftNumIns)
-          ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      if (inputGrad1) {
+        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
+            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      }
       offset += leftNumIns;
 
       rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      inputGrad2->subMatrix(starts2[seqId], rightNumIns)
-          ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      if (inputGrad2) {
+        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
+            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      }
       offset += rightNumIns;
     }
   }
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 4bfce766c769f4be2e5cc7bf691d539b1d307a47..944c7051668dccf39dd2ace14986d43c8a14e452 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -25,6 +25,11 @@ namespace paddle {
  * Input: a sequence
  * If SequenceLevel = kNonseq:
  *   Output: a sequence containing only the last instance of the input sequence
+ *   If stride_ > 0:
+ *      Output: a shortened sequence. The last instance is taken
+ *              independently from every slice of the input sequence; the
+ *              slices are obtained by sliding a window whose size is set to
+ *              stride_.
  * If SequenceLevel = kSeq:
  *   Check input sequence must has sub-sequence
  *   Output: a sequence containing only the last instance of each sub-sequence
@@ -37,17 +42,17 @@ class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
   MatrixPtr tmpSrc_;
   MatrixPtr tmpDest_;
+  std::vector<int> instanceIds_;
 
 public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
-  ~SequenceLastInstanceLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
@@ -55,6 +60,7 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                      const ParameterMap& parameterMap) {
   SequencePoolLayer::init(layerMap, parameterMap);
+  reversed_ = config_.select_first();
 
   tmpSrc_ =
       Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
@@ -67,7 +73,8 @@ bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
 void SequenceLastInstanceLayer::forward(PassType passType) {
   SequencePoolLayer::forward(passType);
 
-  const int* starts = startPositions_->getData(false);
+  auto starts = (stride_ > 0) ? stridePositions_->getData()
+                              : startPositions_->getData(false);
   MatrixPtr inputValue = getInputValue(0);
   MatrixPtr outputValue = getOutputValue();
 
@@ -75,9 +82,10 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
 
+    instanceIds_.clear();
     for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
-      int insId =
-          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
+      int insId = reversed_ ? starts[seqId] : starts[seqId + 1] - 1;
+      instanceIds_.push_back(insId);
 
       outputValue->subMatrix(seqId, 1, tmpDest_)
           ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
@@ -97,18 +105,13 @@ void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
 
   MatrixPtr inputGrad = getInputGrad(0);
   MatrixPtr outputGrad = getOutputGrad();
-  const int* starts = startPositions_->getData(false);
-  size_t numSequences = startPositions_->getSize() - 1;
 
   if (inputGrad) {
     AsyncGpuBlock asyncGpuBlock;
     REGISTER_TIMER_INFO("SequenceLastInstanceLayerBackward", getName().c_str());
 
-    for (size_t seqId = 0; seqId < numSequences; ++seqId) {
-      int insId =
-          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
-
-      inputGrad->subMatrix(insId, 1, tmpDest_)
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
+      inputGrad->subMatrix(instanceIds_[seqId], 1, tmpDest_)
           ->add(*(outputGrad->subMatrix(seqId, 1, tmpSrc_)));
     }
   }
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 35260ca912d5d0e00213ffb7074bd8963da265da..235d9a9b0f0653df5c0b671092df9e195f08fc48 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -37,6 +37,7 @@ bool SequencePoolLayer::init(const LayerMap& layerMap,
   } else {
     LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
   }
+  stride_ = config_.seq_pool_stride();
   setNeedSequenceInfo(false);
   return true;
 }
@@ -55,19 +56,25 @@ void SequencePoolLayer::forward(PassType passType) {
   CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
   CHECK_EQ(newBatchSize_, starts->getSize() - 1);
 
-  resetOutput(newBatchSize_, dim);
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-        << "when trans_type = seq, input must hasSubseq";
-  }
   /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
    * thus, in this case, output_ has no sequenceStartPositions.
    * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
    * case, we should compute the new sequenceStartPositions.
-  */
+   */
   if (type_) {
-    output_.degradeSequence(input, useGpu_);
+    CHECK(input.subSequenceStartPositions)
+        << "when trans_type = seq, input must hasSubseq";
+    output_.degradeSequence(input);
+  }
+  if (stride_ > 0) {
+    CHECK_EQ(input.hasSubseq(), 0UL)
+        << "sequence stride pooling is invalid for hasSubseq now";
+    output_.poolSequenceWithStride(
+        input, stride_, &stridePositions_, reversed_);
+    newBatchSize_ = stridePositions_->getSize() - 1;
   }
+
+  resetOutput(newBatchSize_, dim);
 }
 
 void SequencePoolLayer::backward(const UpdateCallback& callback) {
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index aa9c132586e55d0f6bccec1689db60145ca2d43f..293d1bf27823ffb0ebddba95461883d646f159ae 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -26,6 +26,10 @@ namespace paddle {
  *    Output: output size is the number of input sequences (NOT input instances)
  *    output[i] = seqlastin/average/max_{for each instance in this
  * sequence}{input[i]}
+ *    If stride_ > 0:
+ *        Check input sequence must not have sub-sequence
+ *        Output: a shortened sequence; pooling is performed over a small
+ *                local area
  * If SequenceLevel = kSeq:
  *    Check input sequence must has sub-sequence
  *    Output: output size is the number of input sub-sequences
@@ -42,16 +46,20 @@ protected:
   enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
   size_t newBatchSize_;
   ICpuGpuVectorPtr startPositions_;
+  int stride_;
+  // Store the start position of each window.
+  IVectorPtr stridePositions_;
+  // Whether the input sequence is reversed or not.
+  bool reversed_ = false;
 
 public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual ~SequencePoolLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index 23924b0490851ad3c3c74d77e7abd8b0af8fc234..433592953b220eda4db4634124a57a2074cef4c0 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -20,9 +20,12 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * A layer for reshaping the sequence
- * Input: a sequence
- * Output: a sequence
+ *  A layer for reshaping the sequence. Assume the input sequence has
+ *  T instances, the dimension of each instance is M, and the input
+ *  reshape_dim is N, then the output sequence has T*M/N instances,
+ *  the dimension of each instance is N.
+ *
+ *  Note that T*M/N must be an integer.
  */
 
 class SequenceReshapeLayer : public Layer {
@@ -34,12 +37,11 @@ protected:
 public:
   explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SequenceReshapeLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index b678f414b6d76fa26818cb379fb0f0fb8fc7ec09..faf98744a7fdcf9c2c1712d783f153739ccc8eca 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -39,12 +39,11 @@ class SlopeInterceptLayer : public Layer {
 public:
   explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SlopeInterceptLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
index 32e88cf141a667d9dffbe7dcba46e9fde721f9e7..7d3cb80443801a947e3d529beb002561c4ac1964 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
@@ -43,9 +43,8 @@ protected:
 public:
   explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SpatialPyramidPoolLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   ProjectionConfig getConfig(size_t sizeX_,
                              size_t sizeY_,
@@ -54,7 +53,7 @@ public:
                              std::string& poolType_);
   size_t getSize();
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index c52fbee26232ad6eb09f84315a57c73e6aa02eb0..19b7ad1869af98e6313fe85a40203fd1e84f31d6 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -35,12 +35,11 @@ protected:
 public:
   explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SubSequenceLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(subseq, SubSequenceLayer);
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index aa99b49380d3682ccf3d89220c0c68f22e458271..00f8519550bcff9bb706b1a28dc0dfcdc06cc54a 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -41,12 +41,11 @@ protected:
 public:
   explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SumToOneNormLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
index ac38ffb620570320497446a6825ca2273b73facc..43992f692d3ce40fa095c8e0190bae01dc2ac3c1 100644
--- a/paddle/gserver/layers/TensorLayer.h
+++ b/paddle/gserver/layers/TensorLayer.h
@@ -44,13 +44,12 @@ protected:
 public:
   explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~TensorLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp
index d1fa90f38415c53bd1c56df4a6c4be0508004bc6..4150f1727d8a1a3c1ed21b01944040977d2db315 100644
--- a/paddle/gserver/layers/TransLayer.cpp
+++ b/paddle/gserver/layers/TransLayer.cpp
@@ -56,7 +56,14 @@ void TransLayer::backward(const UpdateCallback& callback) {
     return;
   }
   MatrixPtr preGrad = getInputGrad(0);
-  outputGrad->transpose(preGrad, false);
+  if (preGrad) {
+    MatrixPtr transGrad = Matrix::create(preGrad->getHeight(),
+                                         preGrad->getWidth(),
+                                         /* trans= */ false,
+                                         preGrad->useGpu());
+    outputGrad->transpose(transGrad, false);
+    preGrad->add(*transGrad);
+  }
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index b43fa1ebfb003226daed724b4ede3006545e8b07..be10bb74f6b218f0b12dc9f20db9a6ee8af7a478 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -20,7 +20,7 @@ limitations under the License. */
 
 namespace paddle {
 /**
- * A layer for transposition.
+ * A layer for transposing a minibatch matrix.
  * \f[
      y = x^\mathrm{T}
  * \f]
@@ -32,9 +32,10 @@ class TransLayer : public Layer {
 public:
   explicit TransLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index 4c1de7b3b7d6975c2693eb065f7d3e19cc51a95c..c8b2634a1366ed03846f2331726d04232b5d32ee 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -26,7 +26,8 @@ class ValidationLayer : public Layer {
 public:
   explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
@@ -37,13 +38,13 @@ public:
     return inputLayers_[2];
   }
 
-  virtual void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
 
-  virtual void onPassEnd() = 0;
+  void onPassEnd() override = 0;
 };
 
 /*
@@ -57,11 +58,12 @@ public:
         cpuLabel_(nullptr),
         cpuWeight_(nullptr) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void validationImp(MatrixPtr outputValue, IVectorPtr label);
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
   struct PredictionResult {
     PredictionResult(real __out, int __label) : out(__out), label(__label) {}
@@ -86,11 +88,12 @@ public:
   explicit PnpairValidation(const LayerConfig& config)
       : ValidationLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void validationImp(MatrixPtr outputValue, IVectorPtr label);
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
 private:
   bool passBegin_;
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
index 3d9ae9249af66dd085f5b6bb7a3c09d8b2276a24..7e8d7379d267886805db2eb7983a4dabbf949914 100644
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ b/paddle/gserver/layers/WarpCTCLayer.h
@@ -30,9 +30,10 @@ public:
   explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
   ~WarpCTCLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   /**
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 0caa5e1e11e6d42fadfa87149814c4b77b3b6271..3c4128b5b8a0ea420bd3027b9a36e5f75087c3cb 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -18,6 +18,14 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
+################ test_CRFLayerGrad ####################
+add_unittest_without_exec(test_CRFLayerGrad
+    test_CRFLayerGrad.cpp
+    LayerGradUtil.cpp)
+add_test(NAME test_CRFLayerGrad
+    COMMAND test_CRFLayerGrad)
+
+
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
     LayerGradUtil.cpp)
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index ae016e74eaa84f7c43a30c09c8c4577e25360c4e..a0b1cd471dd02fd20bb2247395bdb74651610bbf 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -24,7 +24,7 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
   if (weights) {
     outArgs[0].value->dotMul(*outArgs[0].value, *weights);
   }
-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
 }
 
 real getDiffAndPrint(real newCost1,
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
 
     std::vector<Argument> args;
     args.push_back(out);
-    EXPECT_EQ(0, Argument::sumCosts(args)) << "testBatchState failed";
+    EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
     for (size_t seqId = 0; seqId < numSequences; ++seqId) {
       start[seqId] += seqLens[seqId];
     }
@@ -672,7 +672,7 @@ void testLayerGradKernel(TestConfig testConf,
     outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights);
   }
 
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
   LOG(INFO) << " cost " << cost;
   EXPECT_FALSE(std::isnan(cost));
 
@@ -778,8 +778,10 @@ void testProjectionGrad(ProjectionConfig conf,
   config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
   config.layerConfig.set_bias_size(config.biasSize);
   config.layerConfig.set_shared_biases(sharedBias);
-  config.inputDefs.push_back(
-      {inputType, "layer_0", conf.input_size(), parameterSize});
+  config.inputDefs.push_back({inputType,
+                              "layer_0",
+                              static_cast<size_t>(conf.input_size()),
+                              parameterSize});
   *config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
   config.testState = testState;
   testLayerGrad(config, "mixed", batchSize, false, useGpu);
diff --git a/paddle/gserver/tests/img_conv_cudnn.py b/paddle/gserver/tests/img_conv_cudnn.py
new file mode 100644
index 0000000000000000000000000000000000000000..3934607fa41f9b6d401f1c9ff4aec6715786799b
--- /dev/null
+++ b/paddle/gserver/tests/img_conv_cudnn.py
@@ -0,0 +1,32 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+data = data_layer(name="input", size=8 * 16 * 16)  # presumably 8 ch x 16 x 16
+conv = img_conv_layer(
+    input=data,
+    filter_size=1,
+    filter_size_y=1,
+    num_channels=8,
+    num_filters=16,
+    stride=1,
+    bias_attr=True,
+    act=LinearActivation(),
+    groups=2,
+    layer_type="cudnn_conv")  # only setting that differs from img_conv_exconv.py
+
+outputs(conv)
diff --git a/paddle/gserver/tests/img_conv_exconv.py b/paddle/gserver/tests/img_conv_exconv.py
new file mode 100644
index 0000000000000000000000000000000000000000..ad5a8ba2bde17000ca3d7057c6f399ae28d938b0
--- /dev/null
+++ b/paddle/gserver/tests/img_conv_exconv.py
@@ -0,0 +1,32 @@
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=10)
+data = data_layer(name="input", size=8 * 16 * 16)  # presumably 8 ch x 16 x 16
+conv = img_conv_layer(
+    input=data,
+    filter_size=1,
+    filter_size_y=1,
+    num_channels=8,
+    num_filters=16,
+    stride=1,
+    bias_attr=True,
+    act=LinearActivation(),
+    groups=2,
+    layer_type="exconv")  # only setting that differs from img_conv_cudnn.py
+
+outputs(conv)
diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf
index 68d150d553588c864de56ce1e6f283cc42fbbf2f..50f2d89d0271b2eaa460e57636eb09b6d6aeda18 100644
--- a/paddle/gserver/tests/sequence_layer_group.conf
+++ b/paddle/gserver/tests/sequence_layer_group.conf
@@ -48,8 +48,7 @@ lstm = lstmemory_group(
     size=hidden_dim,
     act=TanhActivation(),
     gate_act=SigmoidActivation(),
-    state_act=TanhActivation(),
-    lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+    state_act=TanhActivation())
 
 lstm_last = last_seq(input=lstm)
 
diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf
index 88cb42798baff79fa6a86ef11dabf1781575c0b4..c01b95f7a29ae73c2b3ccd5b56ad1d316cbc72ec 100644
--- a/paddle/gserver/tests/sequence_nest_layer_group.conf
+++ b/paddle/gserver/tests/sequence_nest_layer_group.conf
@@ -51,8 +51,7 @@ def lstm_group(lstm_group_input):
         size=hidden_dim,
         act=TanhActivation(),
         gate_act=SigmoidActivation(),
-        state_act=TanhActivation(),
-        lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+        state_act=TanhActivation())
     return lstm_output
 
 
diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..df14449291e9ec08f45718de07bbb101f6dbea58
--- /dev/null
+++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp
@@ -0,0 +1,174 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/LinearChainCRF.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (size_t pos = 0; pos < seq.size(); ++pos) {
+    if (++seq[pos] < numClasses) {
+      return true;  // no carry needed: seq now holds the next sequence
+    }
+    seq[pos] = 0;  // this position wrapped; carry into the next one
+  }
+  return false;  // every position wrapped, so the enumeration is complete
+}
+
+// Returns log(exp(x) + exp(y)), factoring out max(x, y) to avoid overflow.
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return maxValue;  // -inf if both are -inf, +inf if either one is +inf
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);  // one label per sequence position
+  for (int& label : labels) {
+    label = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();  // a[i]: first label is i
+  real* b = para.getData() + numClasses;  // b[i]: last label is i
+  real* w = para.getData() + 2 * numClasses;  // w[i * numClasses + j]: i -> j
+  LinearChainCRF crf(numClasses, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+      // Reference cost: score every possible label sequence by brute force.
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;  // -log(exp(golden score) / Z)
+      // Relative error, normalised by the smaller of the two magnitudes.
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
+              << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
+// epsilon argument for testLayerGrad: 1e-10 if real is double, else 0.06.
+inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
+
+TestConfig initTestConfig(size_t numClasses, bool withWeight) {
+  TestConfig config;  // "crf" layer test config with two or three inputs
+  config.layerConfig.set_type("crf");
+  config.layerConfig.set_size(numClasses);
+  config.biasSize = 0;
+  // Input 0: sequence data; last field is numClasses * (numClasses + 2).
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              numClasses,
+                              numClasses * (numClasses + 2)});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
+  config.layerConfig.add_inputs();
+  // Optional third input of size 1 (presumably a per-sample weight - confirm).
+  if (withWeight) {
+    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
+    config.layerConfig.add_inputs();
+  }
+
+  return config;
+}
+
+TEST(Layer, CRFLayer) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {  // repeat the check five times
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+    for (int length : {1, 3, 100}) {
+      // GPU is not supported for this layer yet; check the CPU path only.
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+TEST(Layer, CRFLayerUseWeight) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {  // repeat the check five times
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+    for (int length : {1, 3, 100}) {
+      // GPU is not supported for this layer yet; check the CPU path only.
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);  // device id is taken from the gpu_id flag
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);  // fixed seed: genRandLabels() draws its labels from rand()
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index 207fc0566fcf4a0d2e971f3c169a14a64146155b..54b72375b743fe025e0ded5fdbce5699a0b4be1a 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -34,8 +34,7 @@ DECLARE_double(checkgrad_eps);
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_bool(prev_batch_state);
 
-// Do one forward pass of convTrans layer and check to see if its output
-// matches the given result
+// Do one forward pass of ConvLayer using either exconv or cudnn_conv
 MatrixPtr doOneConvTest(size_t imgSize,
                         size_t output_x,
                         size_t stride,
@@ -46,22 +45,35 @@ MatrixPtr doOneConvTest(size_t imgSize,
                         size_t groups,
                         MatrixPtr& inputData,
                         real* param,
-                        bool useGpu) {
+                        bool useGpu,
+                        bool isDeconv = false) {
   TestConfig config;
   config.biasSize = numfilters;
+  string layerType;
   if (useGpu) {
-    config.layerConfig.set_type("cudnn_conv");
+    layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv";
   } else {
-    config.layerConfig.set_type("exconv");
+    layerType = (isDeconv) ? "exconvt" : "exconv";
   }
+  config.layerConfig.set_type(layerType);
   config.layerConfig.set_num_filters(numfilters);
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
   size_t weightSize = channel * filter_size * filter_size *
                       config.layerConfig.num_filters() / groups;
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
+  if (isDeconv) {
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize});
+    config.layerConfig.set_size(imgSize * imgSize *
+                                config.layerConfig.num_filters());
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
+    config.layerConfig.set_size(output_x * output_x *
+                                config.layerConfig.num_filters());
+  }
+
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(filter_size);
@@ -72,12 +84,15 @@ MatrixPtr doOneConvTest(size_t imgSize,
   conv->set_stride(stride);
   conv->set_stride_y(stride);
   conv->set_groups(groups);
-  conv->set_filter_channels(channel / groups);
   conv->set_img_size(imgSize);
   conv->set_output_x(output_x);
 
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                              config.layerConfig.num_filters());
+  if (isDeconv) {
+    conv->set_filter_channels(numfilters / groups);
+  } else {
+    conv->set_filter_channels(channel / groups);
+  }
+
   config.layerConfig.set_name("conv");
 
   std::vector<DataLayerPtr> dataLayers;
@@ -105,6 +120,8 @@ MatrixPtr doOneConvTest(size_t imgSize,
 TEST(Layer, convParaUnified) {
 #ifndef PADDLE_ONLY_CPU
   MatrixPtr input, resultCpu, resultGpu;
+
+  /// TEST1 for conv ///
   input = Matrix::create(1, 4 * 4, false, false);
   real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
   real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
@@ -121,7 +138,7 @@ TEST(Layer, convParaUnified) {
                             /*groups*/ 1,
                             input,
                             param,
-                            false);
+                            /*useGpu*/ false);
 
   resultGpu = doOneConvTest(/* imgSize */ 4,
                             /* output_x */ 2,
@@ -133,9 +150,42 @@ TEST(Layer, convParaUnified) {
                             /*groups*/ 1,
                             input,
                             param,
-                            true);
+                            /*useGpu*/ true);
   checkMatrixEqual(resultCpu, resultGpu);
 
+  /// TEST1 for deconv ///
+  input = Matrix::create(1, 2 * 2, false, false);
+  real inputDataT[] = {1, 2, 3, 4};
+  input->setData(inputDataT);
+
+  resultCpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+
+  resultGpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST2 for conv ///
   input = Matrix::create(1, 3 * 3 * 2, false, false);
   real inputData2[] = {
       1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
@@ -153,7 +203,7 @@ TEST(Layer, convParaUnified) {
                             /*groups*/ 1,
                             input,
                             param2,
-                            false);
+                            /*useGpu*/ false);
 
   resultGpu = doOneConvTest(/* imgSize */ 3,
                             /* output_x */ 2,
@@ -165,9 +215,10 @@ TEST(Layer, convParaUnified) {
                             /*groups*/ 1,
                             input,
                             param2,
-                            true);
+                            /*useGpu*/ true);
   checkMatrixEqual(resultCpu, resultGpu);
 
+  /// TEST3 for conv ///
   real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
 
   resultCpu = doOneConvTest(/* imgSize */ 3,
@@ -180,7 +231,66 @@ TEST(Layer, convParaUnified) {
                             /*groups*/ 2,
                             input,
                             param3,
-                            false);
+                            /*useGpu*/ false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST2 for deconv ///
+  input = Matrix::create(1, 2 * 2 * 2, false, false);
+  real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  input->setData(inputData2T);
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  /// TEST3 for deconv ///
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
 
   resultGpu = doOneConvTest(/* imgSize */ 3,
                             /* output_x */ 2,
@@ -192,7 +302,8 @@ TEST(Layer, convParaUnified) {
                             /*groups*/ 2,
                             input,
                             param3,
-                            true);
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
   checkMatrixEqual(resultCpu, resultGpu);
 #endif
 }
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 8165eb8269336193858962edac4f9637c2fc1c2f..4f5fdbb37ce024e18b8d39c5dda74c69bf82166a 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -110,6 +110,18 @@ void testEvaluator(TestConfig testConf,
   testEvaluator->finish();
   LOG(INFO) << *testEvaluator;
 
+  std::vector<std::string> names;
+  testEvaluator->getNames(&names);
+  paddle::Error err;
+  for (auto& name : names) {
+    auto value = testEvaluator->getValue(name, &err);
+    ASSERT_TRUE(err.isOK());
+    LOG(INFO) << name << " " << value;
+    auto tp = testEvaluator->getType(name, &err);
+    ASSERT_TRUE(err.isOK());
+    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
+  }
+
   double totalScore2 = 0.0;
   if (testConf.testAccumulate) {
     testEvaluator->start();
@@ -129,6 +141,7 @@ void testEvaluatorAll(TestConfig testConf,
 TEST(Evaluator, classification_error) {
   TestConfig config;
   config.evaluatorConfig.set_type("classification_error");
+  config.evaluatorConfig.set_top_k(5);
 
   config.inputDefs.push_back({INPUT_DATA, "output", 50});
   config.inputDefs.push_back({INPUT_LABEL, "label", 50});
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 66a70ecd41091b9590038dab3194dd2a0c59dd03..e1e8e7fae7ca4c96206d60703db1f35aa1196875 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -166,15 +166,19 @@ TEST(Projection, scaling) {
   }
 }
 
-void testProjectionConv(size_t groups) {
+void testProjectionConv(size_t groups, bool isDeconv) {
   const int NUM_FILTERS = 18;
   const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Y = 4;
   const int CHANNELS = 3;
   const int IMAGE_SIZE = 16;
 
   ProjectionConfig conf;
-  conf.set_type("conv");
+  if (isDeconv) {
+    conf.set_type("convt");
+  } else {
+    conf.set_type("conv");
+  }
   conf.set_num_filters(NUM_FILTERS);
 
   ConvConfig* conv = conf.mutable_conv_conf();
@@ -186,7 +190,11 @@ void testProjectionConv(size_t groups) {
   conv->set_stride(2);
   conv->set_stride_y(2);
   conv->set_groups(groups);
-  conv->set_filter_channels(conv->channels() / conv->groups());
+  if (isDeconv) {
+    conv->set_filter_channels(NUM_FILTERS / conv->groups());
+  } else {
+    conv->set_filter_channels(conv->channels() / conv->groups());
+  }
   conv->set_img_size(IMAGE_SIZE);
   int output_x = outputSize(conv->img_size(),
                             conv->filter_size(),
@@ -199,8 +207,14 @@ void testProjectionConv(size_t groups) {
                             conv->stride_y(),
                             /* caffeMode */ true);
   conv->set_output_x(output_x);
-  conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
-  conf.set_output_size(output_x * output_y * NUM_FILTERS);
+  conv->set_output_y(output_y);
+  if (isDeconv) {
+    conf.set_input_size(output_x * output_y * CHANNELS);
+    conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+  } else {
+    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
+    conf.set_output_size(output_x * output_y * NUM_FILTERS);
+  }
 
   testProjectionGrad(conf,
                      INPUT_DATA,
@@ -215,8 +229,12 @@ void testProjectionConv(size_t groups) {
 
 #ifndef PADDLE_ONLY_CPU
 TEST(Projection, conv) {
-  testProjectionConv(1);
-  testProjectionConv(3);
+  /// test ConvProjection
+  testProjectionConv(1, false);
+  testProjectionConv(3, false);
+  /// test ConvTransProjection
+  testProjectionConv(1, true);
+  testProjectionConv(3, true);
 }
 #endif
 
@@ -276,27 +294,6 @@ TEST(Layer, AddtoLayer) {
   }
 }
 
-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans */ false,
-                /* useGpu */ false,
-                false /*useWeight*/,
-                0.03 /*epsilon*/);
-}
-
 TEST(Layer, CTCLayer) {
   TestConfig config;
   config.layerConfig.set_type("ctc");
@@ -310,7 +307,11 @@ TEST(Layer, CTCLayer) {
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "ctc", 100, /* trans */ false, /* useGpu */ useGpu);
+    testLayerGrad(config,
+                  "ctc",
+                  100,
+                  /* trans */ false, /* useGpu */
+                  useGpu);
   }
 }
 
@@ -402,11 +403,11 @@ void testConvTransLayer(const string& type, bool trans, bool useGpu) {
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
+  conv->set_filter_size_y(4);
   conv->set_channels(16);
   conv->set_padding(0);
   conv->set_padding_y(1);
@@ -433,6 +434,9 @@ TEST(Layer, convTransLayer) {
   for (auto useGpu : {false, true}) {
     testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
   }
+#ifndef PADDLE_ONLY_CPU
+  testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
+#endif
 }
 
 TEST(Layer, blockExpandLayer) {
@@ -587,7 +591,11 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
 
   // Not support GPU now
-  testLayerGrad(config, "hsigmoid", 100, /* trans */ false, /* useGpu */ false);
+  testLayerGrad(config,
+                "hsigmoid",
+                100,
+                /* trans */ false, /* useGpu */
+                false);
 }
 
 TEST(Layer, multi_cross) {
@@ -796,10 +804,14 @@ TEST(Layer, ExpandLayer) {
   testExpandLayer("seq", true);       // seq expand to hasSubseq
 }
 
-void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
+void testDegradeLayer(bool hasSubseq,
+                      string layer_type,
+                      string trans_type,
+                      int stride) {
   TestConfig config;
   config.layerConfig.set_type(layer_type);
   config.layerConfig.set_size(10);
+  config.layerConfig.set_seq_pool_stride(stride);
   config.biasSize = 0;
 
   config.inputDefs.push_back(
@@ -819,36 +831,46 @@ void testDegradeLayer(bool hasSubseq, string layer_type, string trans_type) {
   if (layer_type == "average") {
     for (auto strategy : {"average", "sum", "squarerootn"}) {
       LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
-                << " average_strategy=" << strategy;
+                << " average_strategy=" << strategy
+                << " seq_pool_stride=" << stride;
       config.layerConfig.set_average_strategy(strategy);
       testDegradeLayerGrad(config, layer_type);
     }
   } else {
-    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type;
+    LOG(INFO) << " hasSubseq=" << hasSubseq << " trans_type=" << trans_type
+              << " seq_pool_stride=" << stride;
     testDegradeLayerGrad(config, layer_type);
   }
 }
 
 TEST(Layer, MaxLayer) {
-  testDegradeLayer(false, "max", "non-seq");  // seq max to non-seq
-  testDegradeLayer(true, "max", "non-seq");   // hasSubseq max to non-seq
-  testDegradeLayer(true, "max", "seq");       // hasSubseq max to seq
+  testDegradeLayer(false, "max", "non-seq", -1);  // seq max to non-seq
+  testDegradeLayer(true, "max", "non-seq", -1);   // hasSubseq max to non-seq
+  testDegradeLayer(true, "max", "seq", -1);       // hasSubseq max to seq
 }
 
 TEST(Layer, SequenceLastInstanceLayer) {
   testDegradeLayer(false,
                    "seqlastins",
-                   "non-seq");  // seq seqlastins to non-seq
+                   "non-seq",
+                   -1);  // seq seqlastins to non-seq
+  testDegradeLayer(false,
+                   "seqlastins",
+                   "non-seq",
+                   5);  // seq seqlastins to a shorten seq, stride window = 5
   testDegradeLayer(true,
                    "seqlastins",
-                   "non-seq");  // hasSubseq seqlastins to non-seq
-  testDegradeLayer(true, "seqlastins", "seq");  // hasSubseq seqlastins to seq
+                   "non-seq",
+                   -1);  // hasSubseq seqlastins to non-seq
+  testDegradeLayer(
+      true, "seqlastins", "seq", -1);  // hasSubseq seqlastins to seq
 }
 
 TEST(Layer, AverageLayer) {
-  testDegradeLayer(false, "average", "non-seq");  // seq average to non-seq
-  testDegradeLayer(true, "average", "non-seq");  // hasSubseq average to non-seq
-  testDegradeLayer(true, "average", "seq");      // hasSubseq average to seq
+  testDegradeLayer(false, "average", "non-seq", -1);  // seq average to non-seq
+  testDegradeLayer(
+      true, "average", "non-seq", -1);           // hasSubseq average to non-seq
+  testDegradeLayer(true, "average", "seq", -1);  // hasSubseq average to seq
 }
 
 TEST(Layer, SequenceConcatLayer) {
@@ -1022,8 +1044,12 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
 }
 
 TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                true);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                false);
 }
 
 void setPoolConfig(TestConfig* config,
@@ -1304,6 +1330,25 @@ TEST(Layer, ResizeLayer) {
   }
 }
 
+TEST(Layer, RotateLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("rotate");
+  const int CHANNEL = 2;
+  const int HEIGHT = 8;
+  const int WIDTH = 4;
+  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;
+  config.layerConfig.set_size(INPUT_SIZE);
+  config.layerConfig.set_height(HEIGHT);
+  config.layerConfig.set_width(WIDTH);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "rotate", 100, false, useGpu);
+  }
+}
+
 TEST(Layer, NCELayer) {
   TestConfig config;
   size_t numClasses = 4;
@@ -1472,16 +1517,20 @@ TEST(Layer, BatchNormalizationLayer) {
 #endif
 }
 
-TEST(Operator, conv) {
+void testConvOperator(bool isDeconv) {
   TestConfig config;
   const int NUM_FILTERS = 16;
   const int FILTER_SIZE = 2;
   const int FILTER_SIZE_Y = 3;
   const int CHANNELS = 3;
   const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 8;
+  const int IMAGE_SIZE_Y = 9;
   OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("conv");
+  if (isDeconv) {
+    operatorConf.set_type("convt");
+  } else {
+    operatorConf.set_type("conv");
+  }
   ConvConfig* conv = operatorConf.mutable_conv_conf();
   operatorConf.set_num_filters(NUM_FILTERS);
   conv->set_filter_size(FILTER_SIZE);
@@ -1492,7 +1541,6 @@ TEST(Operator, conv) {
   conv->set_stride(2);
   conv->set_stride_y(2);
   conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
   conv->set_img_size(IMAGE_SIZE);
   conv->set_img_size_y(IMAGE_SIZE_Y);
   conv->set_output_x(outputSize(conv->img_size(),
@@ -1505,11 +1553,22 @@ TEST(Operator, conv) {
                                 conv->padding_y(),
                                 conv->stride_y(),
                                 /*  caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              NUM_FILTERS);
 
-  config.inputDefs.push_back(
-      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+  if (isDeconv) {
+    conv->set_filter_channels(NUM_FILTERS / conv->groups());
+    config.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
+                                conv->output_x() * conv->output_y() * CHANNELS,
+                                0});
+    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
+  } else {
+    conv->set_filter_channels(conv->channels() / conv->groups());
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                                NUM_FILTERS);
+  }
+
   config.inputDefs.push_back(
       {INPUT_DATA,
        "layer_1",
@@ -1521,6 +1580,11 @@ TEST(Operator, conv) {
   testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
 }
 
+TEST(Operator, conv) {
+  testConvOperator(/*isDeconv*/ true);
+  testConvOperator(/*isDeconv*/ false);
+}
+
 TEST(Layer, FeatureMapExpandLayer) {
   TestConfig config;
   config.layerConfig.set_type("featmap_expand");
@@ -1563,6 +1627,84 @@ TEST(Layer, MultiplexLayer) {
   }
 }
 
+TEST(Layer, PadLayer) {
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("pad");
+
+  int c = 4;
+  int h = 31;
+  int w = 36;
+  size_t size = c * h * w;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PadConfig* pad = input->mutable_pad_conf();
+  ImageConfig* image = pad->mutable_image_conf();
+
+  image->set_channels(c);
+  image->set_img_size(h);
+  image->set_img_size_y(w);
+  pad->add_pad_c(1);
+  pad->add_pad_c(2);
+  pad->add_pad_h(2);
+  pad->add_pad_h(3);
+  pad->add_pad_w(3);
+  pad->add_pad_w(5);
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "pad", 10, false, useGpu);
+  }
+}
+
+TEST(Layer, CrossChannelNormLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("norm");
+  config.layerConfig.set_size(100);
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cross-channel-norm");
+  norm->set_channels(10);
+  norm->set_size(100);
+  norm->set_scale(0);
+  norm->set_pow(0);
+  norm->set_blocked(0);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
+  }
+}
+
+TEST(Layer, smooth_l1) {
+  TestConfig config;
+  config.layerConfig.set_type("smooth_l1");
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 200, 0});
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 200, 0});
+  config.layerConfig.add_inputs();
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false);
+  }
+}
+
+TEST(Layer, TransLayer) {
+  TestConfig config;
+  const int height = 128;
+  const int width = 1028;
+  config.layerConfig.set_type("trans");
+  config.layerConfig.set_size(width);
+
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", /* dim= */ height * width, /* paraSize= */ 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "trans", height, /* trans= */ false, useGpu);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index f046cb0b289c9ce22b98f3200bf0a3f7d48d77f5..b37277054c58a5f71cc4649fc6c062ca8dc1d4c9 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) {
   real* a = para.getData();
   real* b = para.getData() + numClasses;
   real* w = para.getData() + 2 * numClasses;
-  LinearChainCRF crf(4, para.getData(), nullptr);
+  LinearChainCRF crf(4, para.getData());
   for (int length : {1, 2, 3, 10}) {
     for (int tries = 0; tries < 10; ++tries) {
       CpuMatrix x(length, numClasses);
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 4db30f37a5bc92d4348caed0aebdd8a589b55712..40e662b22bac0a2d22aea31fe99b11695bac3f57 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -258,12 +258,15 @@ TEST(Compare, img_conv) {
 
 // Test cudnn_conv and exconv give the same result
 TEST(Compare, img_conv2) {
-  std::string config_file_a = "./gserver/tests/img_conv_a.conf";
-  std::string config_file_b = "./gserver/tests/img_conv_c.conf";
+  std::string config_file_a = "./gserver/tests/img_conv_cudnn.py";
+  std::string config_file_b = "./gserver/tests/img_conv_exconv.py";
   bool useGpu = FLAGS_use_gpu;
+  double eps = FLAGS_checkgrad_eps;
   FLAGS_use_gpu = true;
+  FLAGS_checkgrad_eps = 1e-2;
   compareNetwork(config_file_a, config_file_b);
   FLAGS_use_gpu = useGpu;
+  FLAGS_checkgrad_eps = eps;
 }
 #endif
 
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index 150850da4d49a2320acc70ed370cf8728d5c9def..4a846397e6cf3100f948af46874b0739e32bf4a5 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <paddle/gserver/gradientmachines/GradientMachine.h>
+#include <paddle/parameter/ParameterUpdateFunctions.h>
 #include <paddle/trainer/Trainer.h>
 #include <paddle/trainer/TrainerInternal.h>
 #include <paddle/utils/PythonUtil.h>
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 23ae95852e84216c9065f1b123d35ce868fbb90f..55427e2f12fd7b77c6eea1f65b3229e6fd29d71d 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -148,11 +148,11 @@ LayerPtr createCTCLayer(string name,
 
   ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
 
-  softmaxActivation->forward(dataLayer->getOutput());
+  softmaxActivation->forward(dataLayer->getOutput()).check();
   layer->forward(PASS_GC);
 
   layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput());
+  softmaxActivation->backward(dataLayer->getOutput()).check();
 
   return layer;
 }
diff --git a/paddle/majel/.gitignore b/paddle/majel/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1f5acdebb56971202b63d2485e2ac5042786f13c
--- /dev/null
+++ b/paddle/majel/.gitignore
@@ -0,0 +1,2 @@
+build
+third-party
\ No newline at end of file
diff --git a/paddle/majel/CMakeLists.txt b/paddle/majel/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..baa3bb9e914b3053a18dc638146325ffe3d28a12
--- /dev/null
+++ b/paddle/majel/CMakeLists.txt
@@ -0,0 +1,34 @@
+cmake_minimum_required(VERSION 3.0)
+
+if(GTEST_INCLUDE_DIR AND GTEST_LIBRARIES)
+    message("-- Found gtest (include: ${GTEST_INCLUDE_DIR}, library: ${GTEST_LIBRARIES})")
+else()
+    # find #include <majel/xx.h>
+    get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+    include_directories(${PARENT_DIR})
+
+    # find cmake directory modules
+    get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
+    set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
+
+    # enable c++11
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
+
+    # enable gtest
+    set(THIRD_PARTY_PATH ${CMAKE_CURRENT_SOURCE_DIR}/third_party)
+    set(WITH_TESTING ON)
+    include(external/gtest)
+endif()
+
+########################### Build Majel #############################
+set(MAJEL_CXX_FILES place.cc)
+set(MAJEL_CUDA_FILES "")
+
+if(CUDA_FOUND)
+    cuda_add_library(majel ${MAJEL_CUDA_FILES} ${MAJEL_CXX_FILES})
+else()
+    add_library(majel ${MAJEL_CXX_FILES})
+endif()
+#####################################################################
+
+add_subdirectory(test)
diff --git a/paddle/majel/README.md b/paddle/majel/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..5539853056797284ca1fa5ef5ab16fa0059907f0
--- /dev/null
+++ b/paddle/majel/README.md
@@ -0,0 +1,126 @@
+# Tensor: A Unified Data Type in PaddlePaddle
+
+## Pain Point
+
+This week, we discussed several potential weaknesses of PaddlePaddle caused by the rapid iteration and development needed to launch new business products over the past four years. For instance, the current Matrix/Vector implementation in PaddlePaddle is long and tedious to read, which seriously hinders contributions from both new and experienced engineers. More seriously, it will also become too challenging to maintain over time.
+
+
+## Learn from Majel
+
+Consequently, we decided to refactor PaddlePaddle step by step. First, we will refactor Matrix/Vector and replace them with Tensor, the modern term used in deep learning systems. Fortunately, we can learn from Majel how to define a Tensor.
+
+To simplify heterogeneous resource allocation for any number of dimensions (1-9) and any type (double, float, float16), Majel consists of several primitives such as `Dim`, `Place` and `Array`, all of which are standard C++ class templates.
+
+1. `Place`: memory location [e.g. CPU/GPU].
+2. `Allocation`: heterogeneous resource allocator [e.g. 20MB in GPU].
+3. `Dim`: size of each dimension [e.g. Dim<4>({10, 2, 5, 1})].
+4. `Array`: a dynamic array consisting of `Place`, `Dim`, and a pointer to memory.
+
+If you dig deeper into the Majel source code, you will find that Majel makes heavy use of `boost.variant`. The variant class template is a safe, generic, stack-based discriminated union container, **offering a simple solution for manipulating an object from a heterogeneous set of types in a uniform manner**. Whereas standard containers such as std::vector may be thought of as "multi-value, single type," variant is "multi-type, single value."
+
+As a simple example, consider the following:
+
+```c++
+#include "boost/variant.hpp"
+#include <iostream>
+
+class my_visitor : public boost::static_visitor<int>
+{
+public:
+    int operator()(int i) const
+    {
+        return i;
+    }
+    
+    int operator()(const std::string & str) const
+    {
+        return str.length();
+    }
+};
+
+int main()
+{
+    boost::variant< int, std::string > u("hello world");
+    std::cout << u; // output: hello world
+
+    int result = boost::apply_visitor( my_visitor(), u );
+    std::cout << result; // output: 11 (i.e., length of "hello world")
+}
+```
+
+In Majel, `DDimVar` is a variant over the `Dim` types, and `DArrayVar` is a variant over the `Array` types.
+
+```c++
+template<int i>
+struct Dim {
+...    
+int head;
+Dim<i-1> tail;
+}
+```
+
+```c++
+template<typename T, int D>
+class Array : public Buffer {
+    ...
+private:
+    Dim<D> size_;
+    Dim<D> stride_;
+    T* ptr_;
+};
+```
+
+```c++
+typedef boost::variant<GpuPlace, CpuPlace> Place;
+typedef boost::variant<Dim<1>, Dim<2>, Dim<3>, Dim<4>, Dim<5>,
+                       Dim<6>, Dim<7>, Dim<8>, Dim<9>> DDimVar;
+typedef boost::variant<
+    Array<float, 1>,
+    Array<float, 2>,
+    Array<float, 3>,
+    Array<float, 4>,
+
+    Array<double, 1>,
+    Array<double, 2>,
+    Array<double, 3>,
+    Array<double, 4>,
+
+    Array<float16, 1>,
+    Array<float16, 2>,
+    Array<float16, 3>,
+    Array<float16, 4> > DArrayVar;
+```
+
+Because `variant` may be thought of as "multi-type, single value", we can utilize it to implement unified interfaces for PaddlePaddle.
+
+## Implement Tensor in Paddle
+
+Before writing code, please make sure you have looked through the Majel source code and grasped the design philosophy of `DArray` in Majel.
+
+To assign subtasks to our colleagues, we have to discuss how to divide the work into independent subtasks.
+
+- [ ] 1. First, we need to consider the third-party dependencies in Majel.
+
+    Majel makes heavy use of `boost.variant`, but we don't want to integrate `boost` into PaddlePaddle. It's better to replace boost with a lightweight implementation such as Mapbox variant (https://github.com/mapbox/variant), which has the same speedy performance as `boost::variant` but is faster to compile, results in smaller binaries, and has no dependencies.
+
+> @gangliao
+
+- [ ] 2. Re-implement `Place` and `Allocation/Memory`
+
+    I found that @wangkuiyi submitted a pull request that includes `Place`. @gangliao and @qijun could re-implement `Allocation`, because we had GPU development experience before joining the Paddle team.
+
+> @wangkuiyi @gangliao @qijun
+
+- [ ] 3. Re-implement `Dim`.
+
+    `Dim` is an excellent implementation in Majel. 
+
+> ???
+
+- [ ] 4. Re-implement `Array/Tensor`.
+
+> Prerequisites: 1 - 3
+
+- [ ] 5. Re-implement fundamental operators for `Array/Tensor`.
+
+> Prerequisites: 1 - 4
diff --git a/paddle/majel/place.cc b/paddle/majel/place.cc
new file mode 100644
index 0000000000000000000000000000000000000000..eecd8e5b730704258d2bd7d98a75a0a80e13a797
--- /dev/null
+++ b/paddle/majel/place.cc
@@ -0,0 +1,49 @@
+#include <majel/place.h>
+
+namespace majel {
+
+namespace detail {
+
+class PlacePrinter : public boost::static_visitor<> {
+private:
+  std::ostream& os_;
+
+public:
+  PlacePrinter(std::ostream& os) : os_(os) {}
+
+  void operator()(const CpuPlace&) { os_ << "CpuPlace"; }
+
+  void operator()(const GpuPlace& p) { os_ << "GpuPlace(" << p.device << ")"; }
+};
+
+}  // namespace detail
+
+static Place the_default_place;
+
+void set_place(const Place& place) { the_default_place = place; }
+
+const Place& get_place() { return the_default_place; }
+
+const GpuPlace default_gpu() { return GpuPlace(0); }
+
+const CpuPlace default_cpu() { return CpuPlace(); }
+
+bool is_gpu_place(const Place& p) {
+  return boost::apply_visitor(IsGpuPlace(), p);
+}
+
+bool is_cpu_place(const Place& p) {
+  return !boost::apply_visitor(IsGpuPlace(), p);
+}
+
+bool places_are_same_class(const Place& p1, const Place& p2) {
+  return is_gpu_place(p1) == is_gpu_place(p2);
+}
+
+std::ostream& operator<<(std::ostream& os, const majel::Place& p) {
+  majel::detail::PlacePrinter printer(os);
+  boost::apply_visitor(printer, p);
+  return os;
+}
+
+}  // namespace majel
diff --git a/paddle/majel/place.h b/paddle/majel/place.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad3dc3fe0b80ac5dc10a59910c580d7912469cd4
--- /dev/null
+++ b/paddle/majel/place.h
@@ -0,0 +1,50 @@
+#pragma once
+#include <boost/variant.hpp>
+#include <iostream>
+
+namespace majel {
+
+struct CpuPlace {
+  CpuPlace() {}  // WORKAROUND: for some reason, omitting this constructor
+                 // causes errors with boost 1.59 and OSX
+  // needed for variant equality comparison
+  inline bool operator==(const CpuPlace&) const { return true; }
+
+  inline bool operator!=(const CpuPlace&) const { return false; }
+};
+
+struct GpuPlace {
+  GpuPlace(int d) : device(d) {}
+
+  // needed for variant equality comparison
+  inline bool operator==(const GpuPlace& o) const { return device == o.device; }
+
+  inline bool operator!=(const GpuPlace& o) const { return !(*this == o); }
+
+  GpuPlace() : GpuPlace(0) {}
+  int device;
+};
+
+class IsGpuPlace : public boost::static_visitor<bool> {
+public:
+  bool operator()(const CpuPlace&) const { return false; }
+
+  bool operator()(const GpuPlace& gpu) const { return true; }
+};
+
+typedef boost::variant<GpuPlace, CpuPlace> Place;
+
+void set_place(const Place&);
+
+const Place& get_place();
+
+const GpuPlace default_gpu();
+const CpuPlace default_cpu();
+
+bool is_gpu_place(const Place&);
+bool is_cpu_place(const Place&);
+bool places_are_same_class(const Place&, const Place&);
+
+std::ostream& operator<<(std::ostream&, const majel::Place&);
+
+}  // namespace majel
diff --git a/paddle/majel/test/CMakeLists.txt b/paddle/majel/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..46da6ff89b4a1d68fe4229b4f0f051000ab390c7
--- /dev/null
+++ b/paddle/majel/test/CMakeLists.txt
@@ -0,0 +1,11 @@
+file(GLOB_RECURSE ALL_TEST_FILES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+
+add_executable(majel_tests ${ALL_TEST_FILES})
+add_dependencies(majel_tests majel)
+target_link_libraries(majel_tests     
+                      ${Boost_LIBRARIES}
+                      ${GTEST_LIBRARIES}
+                      ${GTEST_MAIN_LIBRARIES}
+                      majel
+                     )
+add_test(majel_tests majel_tests)
diff --git a/paddle/majel/test/place_test.cc b/paddle/majel/test/place_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..c9a53802b23ef8b225b9e8ef0acfe1b0c5562289
--- /dev/null
+++ b/paddle/majel/test/place_test.cc
@@ -0,0 +1,40 @@
+#include "majel/place.h"
+#include <sstream>
+#include "gtest/gtest.h"
+
+TEST(Place, Equality) {
+  majel::CpuPlace cpu;
+  majel::GpuPlace g0(0), g1(1), gg0(0);
+
+  EXPECT_EQ(cpu, cpu);
+  EXPECT_EQ(g0, g0);
+  EXPECT_EQ(g1, g1);
+  EXPECT_EQ(g0, gg0);
+
+  EXPECT_NE(g0, g1);
+
+  EXPECT_TRUE(majel::places_are_same_class(g0, gg0));
+  EXPECT_FALSE(majel::places_are_same_class(g0, cpu));
+}
+
+TEST(Place, Default) {
+  EXPECT_TRUE(majel::is_gpu_place(majel::get_place()));
+  EXPECT_TRUE(majel::is_gpu_place(majel::default_gpu()));
+  EXPECT_TRUE(majel::is_cpu_place(majel::default_cpu()));
+
+  majel::set_place(majel::CpuPlace());
+  EXPECT_TRUE(majel::is_cpu_place(majel::get_place()));
+}
+
+TEST(Place, Print) {
+  {
+    std::stringstream ss;
+    ss << majel::GpuPlace(1);
+    EXPECT_EQ("GpuPlace(1)", ss.str());
+  }
+  {
+    std::stringstream ss;
+    ss << majel::CpuPlace();
+    EXPECT_EQ("CpuPlace", ss.str());
+  }
+}
diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu
index 0a0d92d1ae65f5b6020eb71fe2a6db5a3c625d9c..de48b6fac9c7d8125a552022c52353ef6bcef995 100644
--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1453,6 +1453,24 @@ void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
               true_type() /* bAsRowVector */, false_type());
 }
 
+template<class T>
+void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotMul<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
+template<class T>
+void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
+  MatrixOffset offset(0, 0, 0, 0);
+  int numRows = height_;
+  int numCols = width_;
+  applyBinary(binary::DotDiv<T>(), b, numRows, numCols, offset,
+              false_type(), true_type() /* bAsColVector */);
+}
+
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 8691c87ac3b88499a9676d59af533e0f4713dfc3..6ed48c8d88ee698689de6f7a7f470b97a094ea5b 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -545,6 +545,9 @@ public:
   void mulRowVector(BaseMatrixT& b);
   void divRowVector(BaseMatrixT& b);
 
+  void mulColVector(BaseMatrixT& b);
+  void divColVector(BaseMatrixT& b);
+
   void addP2P(BaseMatrixT& b);
 
   /**
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index 82a482f701481267e564c7ad8179689deb65a75b..bf62229c03bb1d6e2bdf86d8c56a8157938fb832 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -372,7 +372,7 @@ MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
 }
 
 /* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   CHECK(!memAlloc);
   CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
   if (format_ == SPARSE_CSR) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index d3e8871cb5b320ce420d601bde7f18d85398dde7..860cad1047fc343b13efa901186ea218d0855151 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -201,7 +201,7 @@ public:
   void zeroMem();
 
   /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
 
   void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
 
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index d7aa1184872d5a6129becca1f6e282776c9dbe15..1a3bb432bfb743fe814fa94c0c104bb6bc598cb8 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -15,6 +15,72 @@ limitations under the License. */
 #include "MathFunctions.h"
 #include "hl_matrix_apply.cuh"
 #include "hl_matrix_ops.cuh"
+#include "paddle/utils/DynamicLoader.h"
+
+namespace dynload {
+
+std::once_flag lapack_dso_flag;
+void* lapack_dso_handle = nullptr;
+
+/**
+ * The following macro definition can generate structs
+ * (for each function) to dynamic load lapack routine
+ * via operator overloading.
+ *
+ * note: default dynamic linked libs
+ */
+
+// The argument for stringizing operator is not macro-expanded first.
+// We have to use two levels of macro to do the expansion.
+// See https://gcc.gnu.org/onlinedocs/cpp/Stringizing.html
+#define STR(x) #x
+
+// clang-format off
+#ifndef LAPACK_FOUND
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      using lapack_func = decltype(__name(args...)) (*)(Args...);              \
+      std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \
+      void* p_##__name = dlsym(lapack_dso_handle, STR(__name));                \
+      CHECK(p_##__name) << "Cannot find symbol " << STR(__name)                \
+                        << " in liblapack.so";                                 \
+      return reinterpret_cast<lapack_func>(p_##__name)(args...);               \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+#else
+#define DYNAMIC_LOAD_LAPACK_WRAP(__name)                                       \
+  struct DynLoad__##__name {                                                   \
+    template <typename... Args>                                                \
+    auto operator()(Args... args) -> decltype(__name(args...)) {               \
+      return __name(args...);                                                  \
+    }                                                                          \
+  } __name;  // struct DynLoad__##__name
+#endif
+
+#ifdef PADDLE_USE_ATLAS
+  #define  PADDLE_SGETRF  clapack_sgetrf
+  #define  PADDLE_DGETRF  clapack_dgetrf
+  #define  PADDLE_SGETRI  clapack_sgetri
+  #define  PADDLE_DGETRI  clapack_dgetri
+#else
+  #define  PADDLE_SGETRF  LAPACKE_sgetrf
+  #define  PADDLE_DGETRF  LAPACKE_dgetrf
+  #define  PADDLE_SGETRI  LAPACKE_sgetri
+  #define  PADDLE_DGETRI  LAPACKE_dgetri
+#endif
+
+#define LAPACK_ROUTINE_EACH(__macro)       \
+  __macro(PADDLE_SGETRF)                   \
+  __macro(PADDLE_DGETRF)                   \
+  __macro(PADDLE_SGETRI)                   \
+  __macro(PADDLE_DGETRI)
+// clang-format on
+
+LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP)
+
+}  // namespace dynload
 
 namespace paddle {
 
@@ -85,11 +151,7 @@ int getrf<float>(const CBLAS_ORDER order,
                  float* A,
                  const int lda,
                  int* ipiv) {
-#ifdef PADDLE_USE_ATLAS
-  return clapack_sgetrf(order, M, N, A, lda, ipiv);
-#else
-  return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
-#endif
+  return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv);
 }
 
 template <>
@@ -99,11 +161,7 @@ int getrf<double>(const CBLAS_ORDER order,
                   double* A,
                   const int lda,
                   int* ipiv) {
-#ifdef PADDLE_USE_ATLAS
-  return clapack_dgetrf(order, M, N, A, lda, ipiv);
-#else
-  return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
-#endif
+  return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv);
 }
 
 template <>
@@ -112,11 +170,7 @@ int getri<float>(const CBLAS_ORDER order,
                  float* A,
                  const int lda,
                  const int* ipiv) {
-#ifdef PADDLE_USE_ATLAS
-  return clapack_sgetri(order, N, A, lda, ipiv);
-#else
-  return LAPACKE_sgetri(order, N, A, lda, ipiv);
-#endif
+  return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv);
 }
 
 template <>
@@ -125,11 +179,8 @@ int getri<double>(const CBLAS_ORDER order,
                   double* A,
                   const int lda,
                   const int* ipiv) {
-#ifdef PADDLE_USE_ATLAS
-  return clapack_dgetri(order, N, A, lda, ipiv);
-#else
-  return LAPACKE_dgetri(order, N, A, lda, ipiv);
-#endif
+  return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv);
+  return 0;
 }
 
 template <>
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index c8559eefd8378450fc18c2ba821c65b39c8cc046..8ada0d34c6733d13a45505492909124010c85a91 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -18,17 +18,32 @@ limitations under the License. */
 #ifdef PADDLE_USE_MKL
 #include <mkl.h>
 #include <mkl_lapacke.h>
-#else
-extern "C" {
-#include <cblas.h>
-}
+#endif
+
 #ifdef PADDLE_USE_ATLAS
 extern "C" {
+#include <cblas.h>
 #include <clapack.h>
 }
-#else
+#endif
+
+#ifdef PADDLE_USE_OPENBLAS
+#include <cblas.h>
 #include <lapacke.h>
 #endif
+
+#ifndef LAPACK_FOUND
+extern "C" {
+#include <cblas.h>
+int LAPACKE_sgetrf(
+    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
+int LAPACKE_dgetrf(
+    int matrix_layout, int m, int n, double* a, int lda, int* ipiv);
+int LAPACKE_sgetri(
+    int matrix_layout, int n, float* a, int lda, const int* ipiv);
+int LAPACKE_dgetri(
+    int matrix_layout, int n, double* a, int lda, const int* ipiv);
+}
 #endif
 
 #include <cmath>
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 90813a89969c2525f7029f1c2609bed116c910c4..6ac61be0bf1b7a4e308705617faf5af2886a4082 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -274,6 +274,18 @@ real GpuMatrix::getSum() {
   return sum;
 }
 
+real GpuMatrix::getMin() {  // returns GpuVector::getMin() over all elements
+  CHECK(isContiguous());  // the flat span below assumes no row padding
+  auto vec = GpuVector(height_ * width_, data_);  // all elements as 1-D vector
+  return vec.getMin();  // reduction itself is delegated to GpuVector
+}
+
+real GpuMatrix::getMax() {  // returns GpuVector::getMax() over all elements
+  CHECK(isContiguous());  // the flat span below assumes no row padding
+  auto vec = GpuVector(height_ * width_, data_);  // all elements as 1-D vector
+  return vec.getMax();  // reduction itself is delegated to GpuVector
+}
+
 void GpuMatrix::accumulateColSum(Matrix& src) {
   CHECK_EQ(getWidth(), src.getWidth());
   CHECK_EQ(getHeight(), (size_t)1);
@@ -371,11 +383,13 @@ MatrixPtr GpuMatrix::getTranspose() {
   }
 }
 
-void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   if (memAlloc) {
     matTrans = std::make_shared<GpuMatrix>(width_, height_);
   } else {
     CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
   }
   real* dataTrans = matTrans->getData();
   real* data = getData();
@@ -385,13 +399,27 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
 }
 
+void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {  // allocate the width_ x height_ result into matRot
+    matRot = std::make_shared<GpuMatrix>(width_, height_);
+  } else {  // caller-supplied result must already have the rotated shape
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+
+  real* dataRot = matRot->getData();  // destination buffer
+  real* data = getData();  // NOTE(review): no stride passed; contiguous?
+  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);  // hl_ kernel
+}
+
 MatrixPtr GpuMatrix::getInverse() {
   MatrixPtr matInv;
   inverse(matInv, true);
   return matInv;
 }
 
-void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(height_, width_);
 
   if (memAlloc) {
@@ -455,6 +483,20 @@ void GpuMatrix::sequenceAvgForward(Matrix& a,
   hl_sequence_avg_forward(dst, src, starts, height, width, mode);
 }
 
+void GpuMatrix::sequenceAvgBackward(Matrix& a,
+                                    const IVector& startsPos,
+                                    int mode) {  // mode is passed through
+  size_t height = a.getHeight();  // one row of `a` per sequence
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);  // N sequences, N + 1 offsets
+  CHECK_EQ(width, a.getWidth());  // both matrices share the column count
+  real* dst = getData();  // this matrix is the destination
+  real* src = a.getData();  // `a` is the source
+  const int* starts = startsPos.getData();  // sequence start offsets
+
+  hl_sequence_avg_backward(dst, src, starts, height, width, mode);
+}
+
 /* this = scaleAB*(a*b) +  scaleT*this */
 void GpuMatrix::mul(const GpuMatrix& a,
                     const GpuMatrix& b,
@@ -704,6 +746,7 @@ void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   size_t beam = maxVal.getWidth();
   CHECK_EQ(maxIds.getSize(), numSamples * beam);
   CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
 
   hl_matrix_top_k(maxVal.getData(),
                   maxVal.getStride(),
@@ -764,19 +807,32 @@ void GpuMatrix::maxoutBackward(Matrix& a,
 }
 
 /*calulate the error of classification */
-void GpuMatrix::classificationError(Matrix& output, IVector& label) {
-  auto output_ptr = dynamic_cast<const GpuMatrix*>(&output);
-  auto label_ptr = dynamic_cast<const GpuIVector*>(&label);
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == output_ptr->height_ && width_ == 1)
+void GpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {  // k of the top-k test
+  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);  // null if not GPU type
+  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);  // null if not GPU type
+  size_t numSamples = this->getHeight();  // one result row per sample
+  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
+  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
+
+  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
+  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
+  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
       << "Matrix dimensions are not equal";
 
-  hl_matrix_classification_error((real*)output_ptr->data_,
-                                 (int*)label_ptr->getData(),
-                                 data_,
-                                 height_,
-                                 output_ptr->width_);
+  size_t dim = gpuOutput->getWidth();  // number of output columns
+  hl_matrix_classification_error(gpuTopVal->getData(),  // scratch values
+                                 gpuTopVal->getStride(),
+                                 gpuTopIds->getData(),  // scratch ids
+                                 gpuOutput->getData(),
+                                 gpuOutput->getStride(),
+                                 dim,
+                                 topkSize,
+                                 numSamples,
+                                 gpuLabel->getData(),
+                                 this->getData());  // presumably the 0/1 result
 }
 
 /* copy -log(output[i * width + label]) to this->data[i] */
@@ -913,59 +969,6 @@ void GpuMatrix::softreluDerivative(Matrix& output) {
 void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
   BaseMatrix::scaledTanh(output, p1, p2);
 }
-void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
-  CHECK(output1.useGpu_ == true && output2.useGpu_ == true)
-      << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t dim = output1.getWidth();
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output1.getHeight(), numSamples);
-  CHECK_EQ(output1.getWidth(), output2.getWidth());
-  real* out = getData();
-  real* x = output1.getData();
-  real* y = output2.getData();
-  hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale);
-}
-void GpuMatrix::cosSimDerivative(Matrix& output,
-                                 Matrix& prevOut1,
-                                 Matrix& prevOut2,
-                                 Matrix& prevGrad1,
-                                 Matrix& prevGrad2,
-                                 real scale) {
-  CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true &&
-        prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true &&
-        prevGrad2.useGpu_ == true)
-      << "Matrix type are not equal";
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(prevOut1.getHeight(), numSamples);
-  CHECK_EQ(prevGrad1.getHeight(), numSamples);
-
-  size_t dim = prevOut1.getWidth();
-  CHECK_EQ(prevOut2.getWidth(), dim);
-  CHECK_EQ(prevGrad1.getWidth(), dim);
-  CHECK_EQ(prevGrad2.getWidth(), dim);
-
-  real* grad = getData();
-  real* out = output.getData();
-  real* prevOutX = prevOut1.getData();
-  real* prevOutY = prevOut2.getData();
-  real* prevGradX = prevGrad1.getData();
-  real* prevGradY = prevGrad2.getData();
-  hl_cossim_derivative(grad,
-                       out,
-                       prevOutX,
-                       prevOutY,
-                       prevGradX,
-                       prevGradY,
-                       dim,
-                       prevOut1.getHeight(),
-                       prevOut2.getHeight(),
-                       scale);
-}
 
 void GpuMatrix::randomizeUniform() {
   CHECK(isContiguous());
@@ -1311,7 +1314,9 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   real* output = getData();
   hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
 }
@@ -1324,7 +1329,9 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
   real* wgrad = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   hl_param_relu_backward_w(
       wgrad, ograd, input, numElements, numSamples, partial_sum);
 }
@@ -1336,7 +1343,9 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   hl_param_relu_backward_diff(
       ograd, input, w, diff, numElements, numSamples, partial_sum);
 }
@@ -1684,11 +1693,13 @@ MatrixPtr CpuMatrix::getTranspose() {
   }
 }
 
-void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   if (memAlloc) {
     matTrans = std::make_shared<CpuMatrix>(width_, height_);
   } else {
     CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
   }
   real* dataTrans = matTrans->getData();
   real* data = getData();
@@ -1702,13 +1713,35 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   }
 }
 
+void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {  // allocate the width_ x height_ result into matRot
+    matRot = std::make_shared<CpuMatrix>(width_, height_);
+  } else {  // caller-supplied result must already have the rotated shape
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+  real* dataRot = matRot->getData();  // destination, rows of length height_
+  real* data = getData();  // NOTE(review): indexed with width_, not stride
+
+  for (size_t i = 0; i < height_; i++) {  // i: destination column
+    for (size_t j = 0; j < width_; j++) {  // j: destination row
+      if (clockWise) {  // dst(j, i) = src(height_ - 1 - i, j)
+        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
+      } else {  // dst(j, i) = src(i, width_ - 1 - j)
+        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
+      }
+    }
+  }
+}
+
 MatrixPtr CpuMatrix::getInverse() {
   MatrixPtr matInv;
   inverse(matInv, true);
   return matInv;
 }
 
-void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(height_, width_);
 
   if (memAlloc) {
@@ -2285,6 +2318,41 @@ void CpuMatrix::sequenceAvgForward(Matrix& a,
   }
 }
 
+void CpuMatrix::sequenceAvgBackward(Matrix& a,
+                                    const IVector& startsPos,
+                                    int mode) {  // 0 mean, 1 sum, 2 sqrt
+  size_t height = a.getHeight();  // one row of `a` per sequence
+  size_t width = getWidth();
+  CHECK_EQ(height, startsPos.getSize() - 1);  // N sequences, N + 1 offsets
+  CHECK_EQ(width, a.getWidth());  // both matrices share the column count
+  real* dst = getData();  // this matrix is the destination
+  real* src = a.getData();  // `a` is the source
+  const int* starts = startsPos.getData();  // sequence start offsets
+  MatrixPtr outMtx = Matrix::create(nullptr, 1, width, false, false);
+  MatrixPtr dataMtx = Matrix::create(nullptr, 1, width, false, false);
+  for (size_t i = 0; i < height; ++i) {  // i indexes sequences
+    int sequenceLength = starts[i + 1] - starts[i];
+    if (0 == sequenceLength) {
+      // empty sequence: no destination rows to update
+      continue;
+    }
+    outMtx->setData(dst + starts[i] * width, sequenceLength, width);
+    dataMtx->setData(src + i * width);  // points at row i of `a`
+    if (mode == 0) {
+      // plain average: add row i of `a` scaled by 1 / sequenceLength
+      outMtx->addBias(*dataMtx, 1.0f / sequenceLength);
+    } else if (mode == 1) {
+      // sum instead of average: add row i of `a` unscaled
+      outMtx->addBias(*dataMtx, 1.0f);
+    } else if (mode == 2) {
+      // add row i of `a` divided by the square root of sequenceLength
+      outMtx->addBias(*dataMtx, 1.0f / std::sqrt(sequenceLength));
+    } else {
+      LOG(FATAL) << "should not reach here";  // any other mode is rejected
+    }
+  }
+}
+
 /* this = scaleAB*(a*b) + scaleT*this*/
 void CpuMatrix::mul(const Matrix& a,
                     const Matrix& b,
@@ -2358,41 +2426,8 @@ void CpuMatrix::mul(CpuMatrix* a, CpuMatrix* b, real scaleAB, real scaleT) {
   int lda = a->getStride();
   int ldb = b->getStride();
   int ldc = getStride();
-#ifndef PADDLE_TYPE_DOUBLE
-  cblas_sgemm(CblasRowMajor,
-              a_trans,
-              b_trans,
-              M,
-              N,
-              K,
-              scaleAB,
-              A,
-              lda,
-              B,
-              ldb,
-              scaleT,
-              C,
-              ldc);
-#else
-  cblas_dgemm(CblasRowMajor,
-              a_trans,
-              b_trans,
-              M,
-              N,
-              K,
-              scaleAB,
-              A,
-              lda,
-              B,
-              ldb,
-              scaleT,
-              C,
-              ldc);
-// TODO(yuyang18): Is gemm defined other place?
-#endif
-
-  VLOG(2) << " A[0]=" << A[0] << " A[1]=" << A[1] << " B[0]=" << B[0]
-          << " B[1]=" << B[1] << " C[0]=" << C[0] << " C[1]=" << C[1];
+  gemm<real>(
+      a_trans, b_trans, M, N, K, scaleAB, A, lda, B, ldb, scaleT, C, ldc);
 }
 
 void CpuMatrix::mul(
@@ -3034,7 +3069,7 @@ void CpuMatrix::rowMax(Matrix& max) {
   max.maxRows(*this);
 }
 
-/* get beam size of max ids and values */
+/* Get the top k elements of each row of this matrix */
 void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   CHECK(isContiguous());
   CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
@@ -3042,6 +3077,7 @@ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   size_t beam = maxVal.getWidth();
   CHECK_EQ(maxIds.getSize(), numSamples * beam);
   CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
 
   real* a = getData();
   int* s = maxIds.getData();
@@ -3193,32 +3229,39 @@ void CpuMatrix::rowNormalizeL1(Matrix& out) {
 }
 
 /* calulate classification error */
-void CpuMatrix::classificationError(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&output));
-  CHECK(dynamic_cast<const CpuIVector*>(&label));
+void CpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {  // k of the top-k test
+  size_t numSamples = this->getHeight();  // one result row per sample
+  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);  // null if not CPU type
+  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);  // null if not CPU type
+  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
+  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
+
+  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
+  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
+  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
+      << "Matrix dimensions are not equal";
 
-  CHECK_EQ(getWidth(), (size_t)1);
-  size_t numSamples = getHeight();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
+  // fill cpuTopIds / cpuTopVal with the top topkSize entries of each row
+  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
 
-  size_t dim = output.getWidth();
-  real* out = output.getData();
-  int* lbl = label.getData();
-  real maxData = 0.0;
-  int maxIndex = -1;
+  size_t dim = cpuOutput->getWidth();  // labels must be < dim
+  real* result = this->getData();  // result[i]: 1 = error, 0 = correct
+  int* ids = cpuTopIds->getData();  // topkSize ids per sample, row-major
+  int* lbl = cpuLabel->getData();
   for (size_t i = 0; i < numSamples; ++i) {
     CHECK_GE(lbl[i], 0);
     CHECK_LT((size_t)lbl[i], dim);
-    maxData = out[i * dim];
-    maxIndex = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      if (maxData < out[i * dim + j]) {
-        maxIndex = j;
-        maxData = out[i * dim + j];
+
+    result[i] = 1.0f;  // error unless found below; also set if topkSize == 0
+    for (size_t j = 0; j < topkSize; ++j) {
+      if (ids[j + i * topkSize] == lbl[i]) {
+        result[i] = 0;  // label is among the top topkSize ids
+        break;
       }
     }
-    getData()[i] = (maxIndex != lbl[i]);
   }
 }
 
@@ -3412,105 +3455,6 @@ void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
   }
 }
 
-void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
-  size_t numSamples = getHeight();
-  size_t dim = output1.getWidth();
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output1.getHeight(), numSamples);
-  CHECK_EQ(output1.getWidth(), output2.getWidth());
-
-  real* out = getData();
-  const real* x = output1.getData();
-  const real* y = output2.getData();
-  size_t yInc = dim;
-  if (output2.getHeight() == 1LU) {
-    yInc = 0;
-  } else {
-    CHECK_EQ(output2.getHeight(), numSamples);
-  }
-  for (size_t i = 0; i < numSamples; ++i, x += dim, y += yInc) {
-    real squareSumX = 0;
-    real squareSumY = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      squareSumX += _square(x[j]);
-      squareSumY += _square(y[j]);
-      xy += x[j] * y[j];
-    }
-    CHECK(squareSumX > 0 && squareSumY > 0);
-    out[i] = scale * xy / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
-  }
-}
-
-void CpuMatrix::cosSimDerivative(Matrix& output,
-                                 Matrix& prevOut1,
-                                 Matrix& prevOut2,
-                                 Matrix& prevGrad1,
-                                 Matrix& prevGrad2,
-                                 real scale) {
-  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(prevOut1.getHeight(), numSamples);
-  CHECK_EQ(prevGrad1.getHeight(), numSamples);
-
-  size_t dim = prevOut1.getWidth();
-  CHECK_EQ(prevOut2.getWidth(), dim);
-  CHECK_EQ(prevGrad1.getWidth(), dim);
-  CHECK_EQ(prevGrad2.getWidth(), dim);
-
-  const real* grad = getData();
-  const real* out = output.getData();
-  const real* prevOutX = prevOut1.getData();
-  const real* prevOutY = prevOut2.getData();
-  real* prevGradX = prevGrad1.getData();
-  real* prevGradY = prevGrad2.getData();
-  size_t yInc = dim;
-  if (prevOut2.getHeight() == 1LU) {
-    yInc = 0;
-    CHECK_EQ(prevGrad2.getHeight(), 1LU);
-  } else {
-    CHECK_EQ(prevOut2.getHeight(), numSamples);
-    CHECK_EQ(prevGrad2.getHeight(), numSamples);
-  }
-  for (size_t i = 0; i < numSamples; ++i,
-              prevOutX += dim,
-              prevOutY += yInc,
-              prevGradX += dim,
-              prevGradY += yInc) {
-    real squareSumX = 0;
-    real squareSumY = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      squareSumX += _square(prevOutX[j]);
-      squareSumY += _square(prevOutY[j]);
-      xy += prevOutX[j] * prevOutY[j];
-    }
-    CHECK(squareSumX > 0 && squareSumY > 0);
-    if (xy == 0) {
-      real reciprocal = 1.0f / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
-      for (size_t j = 0; j < dim; ++j) {
-        prevGradX[j] += scale * grad[i] * prevOutY[j] * reciprocal;
-        prevGradY[j] += scale * grad[i] * prevOutX[j] * reciprocal;
-      }
-    } else {
-      real reciprocalXY = 1.0f / xy;
-      real reciprocalSquareSumX = 1.0f / squareSumX;
-      real reciprocalSquareSumY = 1.0f / squareSumY;
-      for (size_t j = 0; j < dim; ++j) {
-        prevGradX[j] += out[i] * grad[i] * (prevOutY[j] * reciprocalXY -
-                                            prevOutX[j] * reciprocalSquareSumX);
-        prevGradY[j] += out[i] * grad[i] * (prevOutX[j] * reciprocalXY -
-                                            prevOutY[j] * reciprocalSquareSumY);
-      }
-    }
-  }
-}
-
 void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
@@ -3662,6 +3606,59 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
   }
 }
 
+void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {  // per-row cost
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  CHECK_EQ(getWidth(), (size_t)1);  // one cost value per sample
+
+  real* cost = getData();  // NOTE(review): accumulated with +=, not reset
+  real* out = output.getData();
+  real* lbl = label.getData();
+
+  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      real absVal = std::fabs(out[j] - lbl[j]);  // |d| for element j
+      if (absVal < 1.0)  // quadratic zone: 0.5 * d^2
+        cost[i] += 0.5 * absVal * absVal;
+      else  // linear zone: |d| - 0.5
+        cost[i] += absVal - 0.5;
+    }
+  }
+}
+
+void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {  // adds gradient
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  CHECK_EQ(getWidth(), dim);  // this matrix has the same shape as output
+
+  real* out = output.getData();
+  real* lbl = label.getData();
+  real* grad = getData();  // accumulated with +=, existing values are kept
+
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      real val = out[j] - lbl[j];  // signed difference d
+      if (std::fabs(val) < 1) {  // quadratic zone: derivative is d
+        grad[j] += val;
+      } else {  // linear zone: derivative is sign(d)
+        grad[j] += (real(0) < val) - (val < real(0));
+      }
+    }
+  }
+}
+
 void CpuMatrix::tanh(Matrix& output) {
   CHECK(isContiguous());
   CHECK(output.isContiguous());
@@ -3764,7 +3761,9 @@ void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
@@ -3778,7 +3777,9 @@ void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
   real* wgrad = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
@@ -3793,7 +3794,9 @@ void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index ceac0212d25a53ca77403b57aa66d2607ed41c5a..3252adb19e4c2e48f86c3c811bfc7d75fd06a8f7 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -31,6 +31,7 @@ limitations under the License. */
 
 namespace paddle {
 
+/// TODO(tianbing), move to paddle/function/TensorType.h
 enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
 
 /**
@@ -56,6 +57,7 @@ enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
  *            value [1, 1, 2, 2, 5]
  * @endcode
  */
+/// TODO(tianbing), move to paddle/function/TensorType.h
 enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
 
 class Matrix;
@@ -370,7 +372,27 @@ public:
    * allocate matTrans' memory outside, then set memAlloc as false;
    * else set as true.
    */
-  virtual void transpose(MatrixPtr matTrans, bool memAlloc) {
+  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief  Rotate this matrix by 90 degrees: clockwise if clockWise is
+   *         true, otherwise counter-clockwise.
+   * clockwise:
+   * \f[
+   *   y(j,i) = x(M-i-1,j)
+   * \f]
+   * counter-clockwise:
+   * \f[
+   *   y(j,i) = x(i, N-1-j)
+   * \f]
+   * where \f$x\f$ is the (M x N) input and \f$y\f$ is the (N x M) output.
+   *
+   * If matRot's memory is allocated by the caller, set memAlloc to false;
+   * set it to true to have rotate() allocate matRot.
+   */
+  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -385,7 +407,7 @@ public:
    * if allocate matInv's memory outside, then set memAlloc as false;
    * else set as true.
    */
-  virtual void inverse(MatrixPtr matInv, bool memAlloc) {
+  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -439,6 +461,12 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
+  virtual void sequenceAvgBackward(Matrix& a,
+                                   const IVector& startsPos,
+                                   int mode) {  // see Cpu/GpuMatrix overrides
+    LOG(FATAL) << "Not implemented";  // base stub: logs a fatal error
+  }
+
   /**
    * @code
    * this = scaleAB*(a*b) + scaleT*this
@@ -761,6 +789,14 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
+  virtual void smoothL1(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";  // base stub; CpuMatrix overrides it
+  }
+
+  virtual void smoothL1Bp(Matrix& outputV, Matrix& label) {
+    LOG(FATAL) << "Not implemented";  // base stub; CpuMatrix overrides it
+  }
+
   virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
 
   virtual void tanhDerivative(Matrix& output) {
@@ -777,26 +813,6 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * cosine similarity, for each row i,
-   *   this[i] = cos(output1[i], output2[i])
-   *
-   * output2 can only have one row, then for each row i,
-   *   this[i] = cos(output1[i], output2[0])
-   */
-  virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void cosSimDerivative(Matrix& output,
-                                Matrix& prevOut1,
-                                Matrix& prevOut2,
-                                Matrix& prevGrad1,
-                                Matrix& prevGrad2,
-                                real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
   /// print out the values of elements to os
   virtual void print(std::ostream& os) const {
     LOG(FATAL) << "Not implemented";
@@ -834,8 +850,11 @@ public:
    * output[i] = 1 if row i is an error.
    *
    * output[i] = 0 if row i is correct.
+   *
    */
-  virtual void classificationError(Matrix& output, IVector& label) {
+  virtual void classificationError(Matrix& output,
+                                   IVector& label,
+                                   size_t topkSize = 1) {  // k of top-k test
     LOG(FATAL) << "Not implemented";
   }
 
@@ -1091,6 +1110,10 @@ public:
       TensorCpuApply<real>(*this, expr);
     }
   }
+
+  bool isEmpty() const { return data_ == nullptr; }  // true when data_ is null
+
+  explicit operator bool() const { return !isEmpty(); }  // true if data_ set
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
@@ -1163,11 +1186,15 @@ public:
   void accumulateColSum(Matrix& src);
   real getAbsSum();
 
+  real getMin();
+  real getMax();
+
   MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
 
   MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
 
   /// add b to each sample of this.
   void addBias(Matrix& b, real scale);
@@ -1182,6 +1209,7 @@ public:
   void collectSharedBias(Matrix& a, real scale);
 
   void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
 
   /**
    * @code
@@ -1294,14 +1322,6 @@ public:
   void softreluDerivative(Matrix& output);
   void scaledTanh(Matrix& output, real p1, real p2);
 
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
   virtual void print(std::ostream& os) const;
   virtual void print(std::ostream& os, size_t height, size_t width) const;
 
@@ -1312,7 +1332,7 @@ public:
   void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
   void randomizeUniform();
 
-  void classificationError(Matrix& output, IVector& label);
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
   void convExpand(Matrix& feature,
                   int feaImgHeight,
@@ -1479,10 +1499,11 @@ public:
   real getAbsSum();
 
   MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
 
   MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
 
   void copyFrom(const Matrix& src);
 
@@ -1605,6 +1626,7 @@ public:
   void collectSharedBias(Matrix& a, real scale);
 
   void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
+  void sequenceAvgBackward(Matrix& a, const IVector& startsPos, int mode);
 
   /**
    * @code
@@ -1714,6 +1736,9 @@ public:
   /// gradient of sumOfSquares.
   void sumOfSquaresBp(Matrix& outputV, Matrix& label);
 
+  void smoothL1(Matrix& output, Matrix& label);
+  void smoothL1Bp(Matrix& output, Matrix& label);
+
   void tanh(Matrix& output);
   void tanhDerivative(Matrix& output);
 
@@ -1721,14 +1746,6 @@ public:
   void softreluDerivative(Matrix& output);
   void scaledTanh(Matrix& output, real p1, real p2);
 
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
   void print(std::ostream& os) const;
   void print(std::ostream& os, size_t height, size_t width) const;
   void printOneRow(std::ostream& os, size_t idx) const;
@@ -1744,7 +1761,7 @@ public:
 
   void randomizeUniform();
 
-  void classificationError(Matrix& output, IVector& label);
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
   void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
 
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbb829c4e24a659e4a97c0a3ba4c5c78b68815d3
--- /dev/null
+++ b/paddle/math/RowBuffer.h
@@ -0,0 +1,152 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "MemoryHandle.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * @brief The RowBuffer class
+ * Represents the data of a sparse-row matrix as a flat array of rows.
+ *
+ * If no pre-allocated memory handle is given, the buffer owns its storage
+ * and can grow automatically. Otherwise it is a fixed-size view of the
+ * handle's memory and never reallocates.
+ */
+class RowBuffer {
+public:
+  /**
+   * @brief RowBuffer create an auto-growth row buffer. The row length is
+   *        width.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  explicit RowBuffer(size_t width) : width_(width) {}
+
+  /**
+   * @brief RowBuffer create a row buffer, which cannot be auto-growth.
+   * @param mem the pre-allocated memory. If it is null, the buffer behaves
+   *        like an auto-growth one.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+
+  /**
+   * @brief resize the buffer so that it can hold rowCnt rows.
+   *
+   * An auto-growth buffer is resized to exactly rowCnt rows. A pre-allocated
+   * buffer is never reallocated; it is only checked to be large enough.
+   * @param rowCnt number of rows, a.k.a matrix height. Must not be negative.
+   */
+  inline void resize(int rowCnt) {
+    CHECK_GE(rowCnt, 0);
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.resize(rowCnt * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index.
+   * @param row the index of row. It must be non-negative and inside the
+   *        current buffer, otherwise the check fails.
+   * @return row buffer.
+   */
+  inline real* get(int row) const {
+    // Without this guard, row == -1 makes (row + 1) * width_ equal to 0, so
+    // the upper-bound checks below pass and a pointer in front of the buffer
+    // is returned.
+    CHECK_GE(row, 0);
+    if (preallocatedBuf_) {
+      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return const_cast<real*>(rowStore_.data() + row * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index. If row index is larger than local
+   *        buffer, the size of local buffer will grow.
+   * @param row the index of row. It must be non-negative.
+   * @return row buffer.
+   */
+  inline real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      // A negative row would be converted to a huge size_t below; row == -1
+      // would even shrink the store to zero elements.
+      CHECK_GE(row, 0);
+      if ((rowStore_.size() <= row * width_)) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  /**
+   * @return raw data buffer.
+   */
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+
+  /**
+   * @brief clear local buffer. It only affects an auto-growth buffer.
+   */
+  inline void clear() { rowStore_.clear(); }
+
+  /**
+   * @brief get current number of rows.
+   * @return number of rows. The width must be non-zero, because the row
+   *         count is computed by dividing by it.
+   */
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+
+  /**
+   * @brief check whether this buffer can grow automatically.
+   * @return true if it can grow automatically.
+   */
+  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
+
+  /**
+   * @brief return the width of matrix. a.k.a length of row.
+   * @return width of matrix
+   */
+  inline size_t getWidth() const { return width_; }
+
+private:
+  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
+  //! of std::vector here.
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+}  // namespace paddle
diff --git a/paddle/math/SIMDFunctions.cpp b/paddle/math/SIMDFunctions.cpp
index 95219debf50e57407b668d315b91141d259fc779..d66d543a61450b47b7758b50eaecc107c6fe3576 100644
--- a/paddle/math/SIMDFunctions.cpp
+++ b/paddle/math/SIMDFunctions.cpp
@@ -13,119 +13,12 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "SIMDFunctions.h"
+#ifdef __SSE3__
 #include <immintrin.h>
+#endif
 #include <algorithm>
 
-#ifndef __AVX__
-static void addto_sse(float* a, const float* b, size_t len) {
-  int offset = len % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-
-    mb0 = _mm_load_ps(b);
-    mb1 = _mm_load_ps(b + 4);
-    mb2 = _mm_load_ps(b + 8);
-    mb3 = _mm_load_ps(b + 12);
-
-    ma0 = _mm_add_ps(ma0, mb0);
-    ma1 = _mm_add_ps(ma1, mb1);
-    ma2 = _mm_add_ps(ma2, mb2);
-    ma3 = _mm_add_ps(ma3, mb3);
-
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) a[i] += b[i];
-}
-
-static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
-  int offset = len % 16;
-
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-
-  for (unsigned int k = 0; k < len / 16; k++, a += 16) {
-    ma0 = _mm_load_ps(a);
-    ma1 = _mm_load_ps(a + 4);
-    ma2 = _mm_load_ps(a + 8);
-    ma3 = _mm_load_ps(a + 12);
-
-    for (int i = 0; i < batch; i++) {
-      mb0 = _mm_load_ps(b[i]);
-      mb1 = _mm_load_ps(b[i] + 4);
-      mb2 = _mm_load_ps(b[i] + 8);
-      mb3 = _mm_load_ps(b[i] + 12);
-      ma0 = _mm_add_ps(ma0, mb0);
-      ma1 = _mm_add_ps(ma1, mb1);
-      ma2 = _mm_add_ps(ma2, mb2);
-      ma3 = _mm_add_ps(ma3, mb3);
-      b[i] += 16;
-    }
-
-    _mm_store_ps(a, ma0);
-    _mm_store_ps(a + 4, ma1);
-    _mm_store_ps(a + 8, ma2);
-    _mm_store_ps(a + 12, ma3);
-  }
-
-  for (int i = 0; i < offset; i++) {
-    for (int k = 0; k < batch; k++) a[i] += b[k][i];
-  }
-  return;
-}
-
-static void col_max_sse(float* result,
-                        const float* data,
-                        int dim,
-                        int numSamples) {
-  // first sample, direct copy
-  for (int d = 0; d < dim; ++d) {
-    result[d] = data[d];
-  }
-  int offset = dim % 16;
-  __m128 ma0, ma1, ma2, ma3;
-  __m128 mb0, mb1, mb2, mb3;
-  // first 16n dims
-  for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
-    ma0 = _mm_load_ps(result);
-    ma1 = _mm_load_ps(result + 4);
-    ma2 = _mm_load_ps(result + 8);
-    ma3 = _mm_load_ps(result + 12);
-    for (int i = 1; i < numSamples; i++) {
-      mb0 = _mm_load_ps(data + i * dim);
-      mb1 = _mm_load_ps(data + i * dim + 4);
-      mb2 = _mm_load_ps(data + i * dim + 8);
-      mb3 = _mm_load_ps(data + i * dim + 12);
-      ma0 = _mm_max_ps(ma0, mb0);
-      ma1 = _mm_max_ps(ma1, mb1);
-      ma2 = _mm_max_ps(ma2, mb2);
-      ma3 = _mm_max_ps(ma3, mb3);
-    }
-    _mm_store_ps(result, ma0);
-    _mm_store_ps(result + 4, ma1);
-    _mm_store_ps(result + 8, ma2);
-    _mm_store_ps(result + 12, ma3);
-  }
-  // last dims
-  for (int d = 0; d < offset; ++d) {
-    float sm = data[d];
-    for (int i = 1; i < numSamples; ++i) {
-      sm = std::max(sm, data[i * dim + d]);
-    }
-    result[d] = sm;
-  }
-}
-
-#else
+#ifdef __AVX__
 static void addto_avx(float* a, const float* b, size_t len) {
   int offset = len % 32;
 
@@ -355,17 +248,128 @@ static void decayL1_avx(
   }
 }
 
+#elif defined(__SSE3__)
+
+static void addto_sse(float* a, const float* b, size_t len) {  // a[i] += b[i]
+  int offset = len % 16;  // elements left over after the 16-wide main loop
+  __m128 ma0, ma1, ma2, ma3;
+  __m128 mb0, mb1, mb2, mb3;
+  // Main loop: four 128-bit vectors (16 floats) of a and b per iteration.
+  for (unsigned int k = 0; k < len / 16; k++, a += 16, b += 16) {
+    ma0 = _mm_load_ps(a);
+    ma1 = _mm_load_ps(a + 4);
+    ma2 = _mm_load_ps(a + 8);
+    ma3 = _mm_load_ps(a + 12);
+    // Aligned loads/stores: a and b have to be 16-byte aligned.
+    mb0 = _mm_load_ps(b);
+    mb1 = _mm_load_ps(b + 4);
+    mb2 = _mm_load_ps(b + 8);
+    mb3 = _mm_load_ps(b + 12);
+
+    ma0 = _mm_add_ps(ma0, mb0);
+    ma1 = _mm_add_ps(ma1, mb1);
+    ma2 = _mm_add_ps(ma2, mb2);
+    ma3 = _mm_add_ps(ma3, mb3);
+
+    _mm_store_ps(a, ma0);
+    _mm_store_ps(a + 4, ma1);
+    _mm_store_ps(a + 8, ma2);
+    _mm_store_ps(a + 12, ma3);
+  }
+  // Scalar tail: a and b already point past the vectorized part.
+  for (int i = 0; i < offset; i++) a[i] += b[i];
+}
+
+static void batch_addto_sse(float* a, const float* b[], int batch, size_t len) {
+  int offset = len % 16;  // elements left over after the 16-wide main loop
+  // Computes a[j] += b[0][j] + ... + b[batch - 1][j] for j in [0, len).
+  __m128 ma0, ma1, ma2, ma3;
+  __m128 mb0, mb1, mb2, mb3;
+  // Main loop: 16 floats of a stay in registers while every b[i] is added.
+  for (unsigned int k = 0; k < len / 16; k++, a += 16) {
+    ma0 = _mm_load_ps(a);
+    ma1 = _mm_load_ps(a + 4);
+    ma2 = _mm_load_ps(a + 8);
+    ma3 = _mm_load_ps(a + 12);
+
+    for (int i = 0; i < batch; i++) {
+      mb0 = _mm_load_ps(b[i]);
+      mb1 = _mm_load_ps(b[i] + 4);
+      mb2 = _mm_load_ps(b[i] + 8);
+      mb3 = _mm_load_ps(b[i] + 12);
+      ma0 = _mm_add_ps(ma0, mb0);
+      ma1 = _mm_add_ps(ma1, mb1);
+      ma2 = _mm_add_ps(ma2, mb2);
+      ma3 = _mm_add_ps(ma3, mb3);
+      b[i] += 16;  // NOTE: advances the caller's pointer array in place
+    }
+
+    _mm_store_ps(a, ma0);
+    _mm_store_ps(a + 4, ma1);
+    _mm_store_ps(a + 8, ma2);
+    _mm_store_ps(a + 12, ma3);
+  }
+  // Scalar tail: a and each b[k] already point at the remaining floats.
+  for (int i = 0; i < offset; i++) {
+    for (int k = 0; k < batch; k++) a[i] += b[k][i];
+  }
+  return;
+}
+
+static void col_max_sse(float* result,
+                        const float* data,
+                        int dim,
+                        int numSamples) {
+  // Seed result with sample 0; data holds numSamples rows of dim floats.
+  for (int d = 0; d < dim; ++d) {
+    result[d] = data[d];
+  }
+  int offset = dim % 16;  // columns left over after the 16-wide main loop
+  __m128 ma0, ma1, ma2, ma3;
+  __m128 mb0, mb1, mb2, mb3;
+  // First 16 * (dim / 16) columns: running max over samples 1..numSamples-1.
+  for (int k = 0; k < dim / 16; k++, result += 16, data += 16) {
+    ma0 = _mm_load_ps(result);
+    ma1 = _mm_load_ps(result + 4);
+    ma2 = _mm_load_ps(result + 8);
+    ma3 = _mm_load_ps(result + 12);
+    for (int i = 1; i < numSamples; i++) {
+      mb0 = _mm_load_ps(data + i * dim);       // sample i, same 16 columns
+      mb1 = _mm_load_ps(data + i * dim + 4);   // NOTE(review): aligned
+      mb2 = _mm_load_ps(data + i * dim + 8);   // loads; each row must be
+      mb3 = _mm_load_ps(data + i * dim + 12);  // 16-byte aligned (verify)
+      ma0 = _mm_max_ps(ma0, mb0);
+      ma1 = _mm_max_ps(ma1, mb1);
+      ma2 = _mm_max_ps(ma2, mb2);
+      ma3 = _mm_max_ps(ma3, mb3);
+    }
+    _mm_store_ps(result, ma0);
+    _mm_store_ps(result + 4, ma1);
+    _mm_store_ps(result + 8, ma2);
+    _mm_store_ps(result + 12, ma3);
+  }
+  // Remaining dim % 16 columns; result and data were advanced by the loop.
+  for (int d = 0; d < offset; ++d) {
+    float sm = data[d];
+    for (int i = 1; i < numSamples; ++i) {
+      sm = std::max(sm, data[i * dim + d]);
+    }
+    result[d] = sm;
+  }
+}
+
 #endif
 
-#ifndef __AVX__
-#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
-#else
+#if defined(__AVX__)
 #define SIMD_INVOKE(func, ...) func##_avx(__VA_ARGS__)
+#elif defined(__SSE3__)
+#define SIMD_INVOKE(func, ...) func##_sse(__VA_ARGS__)
 #endif
 
 namespace paddle {
 namespace simd {
 namespace internal {
+#ifdef __SSE3__
 void addToImpl(float* a, const float* b, size_t len) {
   SIMD_INVOKE(addto, a, b, len);
 }
@@ -376,6 +380,7 @@ void batchAddToImpl(float* a, const float* b[], int batch, size_t len) {
 void colMaxImpl(float* result, const float* data, int dim, int numSamples) {
   SIMD_INVOKE(col_max, result, data, dim, numSamples);
 }
+#endif
 
 #ifdef __AVX__
 void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len) {
@@ -385,8 +390,8 @@ void decayL1AvxImpl(
     float* dst, float* src, float* lr, float lambda, size_t len) {
   decayL1_avx(dst, src, lr, lambda, len);
 }
-
 #endif
+
 }  // namespace internal
 }  // namespace simd
 }  // namespace paddle
diff --git a/paddle/math/SIMDFunctions.h b/paddle/math/SIMDFunctions.h
index 9b0a8719b287a2b88e966484090974586d64521f..439f11b79d134d7054f45f2d0a70fc5a6fde6c13 100644
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@@ -128,17 +128,29 @@ void decayL1AvxImpl(
 
 template <>
 inline void addTo(float* a, const float* b, size_t len) {
+#ifdef __SSE3__
   internal::addToImpl(a, b, len);
+#else
+  naive::addTo(a, b, len);
+#endif
 }
 
 template <>
 inline void batchAddTo(float* a, const float* b[], int batch, size_t len) {
+#ifdef __SSE3__
   internal::batchAddToImpl(a, b, batch, len);
+#else
+  naive::batchAddTo(a, b, batch, len);
+#endif
 }
 
 template <>
 inline void colMax(float* result, const float* data, int dim, int numSamples) {
+#ifdef __SSE3__
   internal::colMaxImpl(result, data, dim, numSamples);
+#else
+  naive::colMax(result, data, dim, numSamples);
+#endif
 }
 
 template <>
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 720a035ecbd26df01fe24c991982bbf7965ccbdc..6370c77386688a334fa0de8b4e2b272882e9e2b0 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -177,7 +177,6 @@ GpuSparseMatrix::GpuSparseMatrix(real* value,
       hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
       sMatrix_ = tmp2;
     }
-    LOG(INFO) << "weight to matrix ";
   }
 }
 
@@ -498,7 +497,7 @@ void GpuSparseMatrix::setRow(size_t row,
 
 SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
 
-void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   CHECK_EQ(format_, SPARSE_CSC);
   int nnz = sMatrix_->nnz;
   if (memAlloc) {
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 1d3801548e03a6ae679afb15bf7f620172d61c57..f6cd5df338965b55ca17636de097d2401dc057f9 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -109,7 +109,7 @@ public:
   MatrixPtr getTranspose();
 
   /// B = A'
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
 
   void copyFrom(const Matrix& src);
   void copyFrom(const Matrix& src, hl_stream_t stream);
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index b61c6b2d49ccead5e9cfdf595a8bebae0e5b87b5..b8c781ca1fd46c9840817abe26a20eec005c37e9 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -24,10 +24,6 @@ limitations under the License. */
 #include "paddle/utils/Thread.h"
 #include "paddle/utils/Util.h"
 
-DEFINE_bool(allow_inefficient_sparse_update,
-            false,
-            "Whether to allow inefficient sparse update");
-
 namespace paddle {
 
 const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 778a9bd845661849261b52dcbeb519809d0c6306..1ccbf97b25922ae52377d7048da3a07012d21003 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -18,10 +18,9 @@ limitations under the License. */
 #include <string.h>
 #include <algorithm>
 #include "Matrix.h"
+#include "RowBuffer.h"
 #include "paddle/utils/Util.h"
 
-DECLARE_bool(allow_inefficient_sparse_update);
-
 namespace paddle {
 
 /**
@@ -45,12 +44,9 @@ public:
                      IndexDictPtr indexDictHandle = nullptr,
                      bool trans = false)
       : CpuMatrix(nullptr, height, width, trans),
-        storeMat_(dataHandle,
-                  dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0,
-                  width,
-                  trans),
         indexDictHandle_(indexDictHandle) {
     init(height, width);
+    buf_.reset(new RowBuffer(dataHandle, width));
   }
 
   virtual ~SparseRowCpuMatrix() {}
@@ -71,25 +67,16 @@ public:
    *
    *  @param row row id in local storage
    */
-  real* getLocalRow(size_t row) {
-    if (storeMat_.getData()) return storeMat_.rowBuf(row);
-    if (rowStore_.size() <= row * width_) {
-      rowStore_.resize((row + 1) * width_);
-    }
-    return rowStore_.data() + row * width_;
-  }
+  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
 
   /**
-   *  reserve the storage for rows according to current size of indexDictHandle.
+   *  reserve the storage for rows according to current size of
+   * indexDictHandle.
    *
    *  This is only used when SparseRowCpuMatrix is constructed with
    *  indexDictHandle.
    */
-  void reserveStore() {
-    if (!storeMat_.getData() && !localIndices_->empty()) {
-      rowStore_.resize(localIndices_->size() * width_);
-    }
-  }
+  void reserveStore() { buf_->resize(localIndices_->size()); }
 
   // row is the row id in the original matrix
   virtual real* getRowBuf(size_t row) { return getRow(row); }
@@ -117,7 +104,8 @@ public:
    *
    * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
    *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter matrix,
+   * t0 is a int vector used by L1/L2 decay, size = height of parameter
+   * matrix,
    * store the time that each weight row last updated.
    *
    * Time is batchId, currentTime is current batchId.
@@ -176,8 +164,7 @@ public:
 protected:
   template <typename Func>
   void apply(Func f) {
-    real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data();
-    f(data, localIndices_->size() * width_);
+    f(buf_->data(), localIndices_->size() * width_);
   }
 
   void init(size_t height, size_t width);
@@ -188,25 +175,23 @@ protected:
       globalIndices_[id] = kUnusedId_;
     }
     localIndices_->clear();
-    rowStore_.clear();
+    buf_->clear();
   }
 
   inline void checkStoreSize() {
-    if (storeMat_.getData()) {
-      CHECK_LE(localIndices_->size(), storeMat_.getHeight());
-    } else if (!FLAGS_allow_inefficient_sparse_update) {
-      if (localIndices_->size() > 0.5 * height_) {
-        LOG(WARNING)
-            << "There are more than 0.5*height (" << localIndices_->size()
-            << ") rows are used for sparse "
-            << "update, which is not efficient. Considering not use "
-            << "sparse_update or set --allow_inefficient_sparse_update=true";
+    if (buf_->isAutoGrowth()) {
+      if (buf_->getRowCount() > 0.5 * height_) {
+        LOG(WARNING) << "There are more than 0.5*height ("
+                     << localIndices_->size() << ") rows are used for sparse "
+                     << "update, which is not efficient. Considering not use "
+                     << "sparse_update.";
       }
+    } else {
+      CHECK_LE(localIndices_->size(), buf_->getRowCount());
     }
   }
 
-  CpuMatrix storeMat_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  std::unique_ptr<RowBuffer> buf_;
   IndexDictPtr indexDictHandle_;
   std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
   unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 56e5442394b04230c22d668aa734dc0fa44004c2..7ce17a3207becb176a852a16fca52376009db9ee 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "Storage.h"
 #include "Allocator.h"
+#include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
 DEFINE_int32(pool_limit_size,
@@ -62,7 +63,7 @@ PoolAllocator* StorageEngine::getGpuAllocator(int deviceId) {
     }
     if (gpuAllocator_[deviceId] == nullptr) {
       std::string name =
-          "gpu" + std::to_string(deviceId) + std::string("_pool");
+          "gpu" + str::to_string(deviceId) + std::string("_pool");
       gpuAllocator_[deviceId] =
           new PoolAllocator(new GpuAllocator(), FLAGS_pool_limit_size, name);
     }
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index 06fc10bae7232fb1278e89e8d9cbdf477fc27b60..ceb96b2e250d8e04ffb2b1d8c77ad498dca91cf3 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
 add_simple_unittest(test_SparseMatrix)
+add_simple_unittest(test_RowBuffer)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
index c3020961880484a7944f8cc61377a4f08122e403..713f407f496099c04e5834b2bdcf7b1cf5a86a3a 100644
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
@@ -37,7 +37,7 @@ limitations under the License. */
  *
  *  AutoCompare test;
  *  test.cmpWithoutArg<I...>(function, height, width)
-*/
+ */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index 21918b86e1ad98766ceaf09dea3020d6e8592191..22ce39701fca7b650fc03794cb0701e0987d2dae 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) {
       compare(&BaseMatrix::addRowVector);
       compare(&BaseMatrix::mulRowVector);
       compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::mulColVector);
+      compare(&BaseMatrix::divColVector);
       compare(&BaseMatrix::addP2P);
       compare(&BaseMatrix::invSqrt);
     }
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index 6899769144dd89156b2ffdb644c47ef0025d624b..1c21da5b76e95603258a5006d0c57b00126e65b9 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -181,28 +181,6 @@ TEST(Matrix, copyByRowIndex) {
   }
 }
 
-void testCosSim(int heightX, int heightY, int width, real scale) {
-  AutoCompare test(heightX, 1);
-  CpuMatrix arg1(heightX, width);
-  CpuMatrix arg2(heightY, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg2.add(-0.5);
-  test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale);
-}
-
-TEST(Matrix, cosSim) {
-  for (auto heightX : {10, 100, 1000}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSim(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluForward(int height, int width, int w_height, int w_width) {
   AutoCompare test(height, width);
   CpuMatrix arg1(height, width);
@@ -224,10 +202,11 @@ void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
 }
 
 TEST(Matrix, paramRelu) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
       for (auto w_height : {1, 2}) {
         for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
           testParamReluForward(height, width, w_height, w_width);
           testParamReluBackwardW(height, width, w_height, w_width);
         }
diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8cc4c69a1a4d8afec08bf7fb13408e135a06c09c
--- /dev/null
+++ b/paddle/math/tests/test_RowBuffer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/math/RowBuffer.h"
+
+TEST(RowBuffer, testAutoGrow) {
+  paddle::RowBuffer buf(128);  // width only, no memory handle
+  ASSERT_EQ(128UL, buf.getWidth());
+  ASSERT_TRUE(buf.isAutoGrowth());
+  buf.resize(2);
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;  // element k of the flat buffer holds the value k
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  // Row 2 is one past the current end, so the buffer has to grow to 3 rows.
+  auto data = buf.getWithAutoGrowth(2);
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    data[i] = i;
+  }
+  // Growth must keep rows 0-1 intact and expose the newly written row 2.
+  ASSERT_EQ(3UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getRowCount() - 1; ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5);
+    }
+  }
+  for (size_t i = 0; i < buf.getWidth(); ++i) {
+    ASSERT_NEAR(i, buf.get(2)[i], 1e-5);
+  }
+}
+
+TEST(RowBuffer, testWithMemBuf) {
+  paddle::CpuMemHandlePtr mem =
+      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
+  paddle::RowBuffer buf(mem, 128);  // backed by mem: two rows of 128 reals
+  ASSERT_TRUE(!buf.isAutoGrowth());
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t i = 0; i < buf.getWidth() * 2; ++i) {
+    buf.data()[i] = i;
+  }
+  for (size_t i = 0; i < buf.getRowCount(); ++i) {
+    for (size_t j = 0; j < buf.getWidth(); ++j) {
+      ASSERT_NEAR(i * buf.getWidth() + j, buf.getWithAutoGrowth(i)[j], 1e-5);
+    }
+  }
+  // Row 3 lies outside the two-row handle, so the bound check must abort.
+  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
+}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 9d3fbaef43d719d07577631d5df3ac4656610cc6..c0572dfdbf738a4dfad04811b3a3e1b65487ff6d 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -248,11 +248,13 @@ TEST(Matrix, SparseMatrixTranspose) {
             /*dense matrix transpose*/
             CpuMatrixPtr matC(new CpuMatrix(height, width));
             matC->copyFrom(*matA);
-            CpuMatrixPtr matD(new CpuMatrix(width, height));
+            MatrixPtr matD(new CpuMatrix(width, height));
             matC->transpose(matD, false);
+
             /*check result*/
             checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB), matD);
+                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
+                std::dynamic_pointer_cast<CpuMatrix>(matD));
           }
         }
       }
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 3a780d26c050ac5870824f2ef35c87edc61900a2..5a0dffe086c4e265d17c79dba435b66c0873e3c7 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
@@ -105,6 +106,21 @@ void testMatrixGetSum(int height, int width) {
   EXPECT_LE(fabs(cpuSum - gpuSum), err);
 }
 
+void testMatrixGetMinMax(int height, int width) {  // CPU vs GPU min/max
+  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(height, width);
+  cpuInput->randomizeUniform();
+  gpuInput->copyFrom(*cpuInput);  // GPU gets the same random data as CPU
+  // Reduce the identical data on each device.
+  real cpuMin = cpuInput->getMin();
+  real gpuMin = gpuInput->getMin();
+  real cpuMax = cpuInput->getMax();
+  real gpuMax = gpuInput->getMax();
+  // Compared exactly, unlike the tolerance-based check used for getSum.
+  EXPECT_EQ(cpuMin, gpuMin);
+  EXPECT_EQ(cpuMax, gpuMax);
+}
+
 void testMatrixZeroAtOffset(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
@@ -161,11 +177,29 @@ void testMatrixTranspose(int height, int width) {
   cpu->randomizeUniform();
   gpu->copyFrom(*cpu);
   cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, false);
+  gpu->transpose(gpuT, true);
 
   TensorCheckEqual(*cpuT, *gpuT);
 }
 
+void testMatrixRotate(int height, int width) {  // CPU vs GPU rotate
+  MatrixPtr cpu = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr gpu = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr cpuR = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr gpuR = std::make_shared<GpuMatrix>(width, height);
+  // cpuR/gpuR are created with the rotated shape (width x height).
+  cpu->randomizeUniform();
+  gpu->copyFrom(*cpu);
+  // clockWise = true; memAlloc is false on CPU and true on GPU.
+  cpu->rotate(cpuR, false, true);
+  gpu->rotate(gpuR, true, true);
+  TensorCheckEqual(*cpuR, *gpuR);
+  // clockWise = false, with the memAlloc flags swapped.
+  cpu->rotate(cpuR, true, false);
+  gpu->rotate(gpuR, false, false);
+  TensorCheckEqual(*cpuR, *gpuR);
+}
+
 void testMatrixInverse(int height) {
   MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
   MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
@@ -181,7 +215,7 @@ void testMatrixInverse(int height) {
   cpu->add(*outputCheck);
 
   gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, false);
+  cpu->inverse(cpuI, true);
   gpu->inverse(gpuI, false);
 
   TensorCheckErr(*cpuI, *gpuI);
@@ -200,9 +234,21 @@ TEST(Matrix, unary) {
       testMatrixZeroAtOffset(height, width);
       testMatrixGetSum(height, width);
       testMatrixTranspose(height, width);
+      testMatrixRotate(height, width);
     }
-    // inverse
+#ifdef LAPACK_FOUND
+    // inverse matrix
     testMatrixInverse(height);
+#else
+    LOG(WARNING) << "Cannot run Matrix Inverse Unit Test.\n"
+                 << "Failed to find lapack library in current system.\n"
+                 << "To address this issue, Please adopt one of the following "
+                    "approaches: \n"
+                 << "1. Simply issue `sudo apt-get install liblapacke-dev` to "
+                    "avoid re-build source code. \n"
+                 << "2. Install MKL/Openblas/ATLAS and re-build PaddlePaddle "
+                    "source code.";
+#endif
   }
 }
 
@@ -651,7 +697,7 @@ TEST(SMatrix, topK) {
   }
 }
 
-void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
+void testMatrixSequenceAvg(int batchSize, int inputDim, int mode) {
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
   MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
   cpuInput->randomizeUniform();
@@ -672,70 +718,25 @@ void testMatrixSequenceAvgForward(int batchSize, int inputDim, int mode) {
   gpuOutput->sequenceAvgForward(*gpuInput, *gpuSequence, mode);
 
   TensorCheckErr(*cpuOutput, *gpuOutput);
+
+  MatrixPtr cpuInGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
+  MatrixPtr gpuInGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
+  cpuInGrad->randomizeUniform();
+  gpuInGrad->copyFrom(*cpuInGrad);
+
+  cpuInGrad->sequenceAvgBackward(*cpuOutput, *cpuSequence, mode);
+  gpuInGrad->sequenceAvgBackward(*gpuOutput, *gpuSequence, mode);
+
+  TensorCheckErr(*cpuInGrad, *gpuInGrad);
 }
 
-TEST(Matrix, sequenceAvgForward) {
+TEST(Matrix, sequenceAvg) {
   for (auto batchSize : {10, 128, 6000}) {
     for (auto inputDim : {32, 100, 512}) {
       for (auto mode : {0, 1, 2}) {
         VLOG(3) << " batchSize=" << batchSize << " inputDim=" << inputDim
                 << " mode=" << mode;
-        testMatrixSequenceAvgForward(batchSize, inputDim, mode);
-      }
-    }
-  }
-}
-
-void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
-  MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
-  MatrixPtr grad = CpuMatrix::create(heightX, 1, false, false);
-  MatrixPtr output = CpuMatrix::create(heightX, 1, false, false);
-  MatrixPtr prevGradX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevGradY = CpuMatrix::create(heightY, width, false, false);
-
-  prevOutX->randomizeUniform();
-  prevOutY->randomizeUniform();
-  grad->randomizeUniform();
-  output->randomizeUniform();
-  prevGradX->randomizeUniform();
-  prevGradY->randomizeUniform();
-
-  MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true);
-  MatrixPtr gradGpu = GpuMatrix::create(heightX, 1, false, true);
-  MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true);
-  MatrixPtr prevGradXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevGradYGpu = GpuMatrix::create(heightY, width, false, true);
-
-  prevOutXGpu->copyFrom(*prevOutX);
-  prevOutYGpu->copyFrom(*prevOutY);
-  gradGpu->copyFrom(*grad);
-  outputGpu->copyFrom(*output);
-  prevGradXGpu->copyFrom(*prevGradX);
-  prevGradYGpu->copyFrom(*prevGradY);
-
-  grad->cosSimDerivative(
-      *output, *prevOutX, *prevOutY, *prevGradX, *prevGradY, scale);
-
-  gradGpu->cosSimDerivative(*outputGpu,
-                            *prevOutXGpu,
-                            *prevOutYGpu,
-                            *prevGradXGpu,
-                            *prevGradYGpu,
-                            scale);
-
-  TensorCheckErr(*prevGradX, *prevGradXGpu);
-  TensorCheckErr(*prevGradY, *prevGradYGpu);
-}
-
-TEST(Matrix, cosSimDerivate) {
-  for (auto heightX : {1, 10, 100}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {1, 10, 100}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSimDerivate(heightX, heightY, width, scale);
-        }
+        testMatrixSequenceAvg(batchSize, inputDim, mode);
       }
     }
   }
@@ -773,10 +774,11 @@ void testParamReluBackwardDiff(int height,
 }
 
 TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
       for (auto w_height : {1, 2}) {
         for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
           testParamReluBackwardDiff(height, width, w_height, w_width);
         }
       }
@@ -784,7 +786,7 @@ TEST(Matrix, paramReluBackwardDiff) {
   }
 }
 
-void testClassificationError(int numSamples, int dim) {
+void testClassificationError(int numSamples, int dim, int topkSize) {
   MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
   MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
   MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
@@ -797,17 +799,22 @@ void testClassificationError(int numSamples, int dim) {
   gpuOutput->copyFrom(*cpuOutput);
   gpuLabel->copyFrom(*cpuLabel);
 
-  cpuError->classificationError(*cpuOutput, *cpuLabel);
-  gpuError->classificationError(*gpuOutput, *gpuLabel);
+  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
+  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
 
   TensorCheckEqual(*cpuError, *gpuError);
 }
 
 TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 10, 100, 1000, 70000}) {
-    for (auto dim : {1, 10, 100, 1000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testClassificationError(numSamples, dim);
+  for (auto numSamples : {1, 5, 31, 90, 150, 300}) {
+    for (auto dim :
+         {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) {
+      for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
+        if (topkSize > dim) continue;
+        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
+                << " dim= " << dim;
+        testClassificationError(numSamples, dim, topkSize);
+      }
     }
   }
 }
diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h
index 9aa74b15193723970d80b5d1a4e0ac95341cd45a..47f461474622d13ea2f922a77348c78b450ec37f 100644
--- a/paddle/math/tests/test_matrixUtil.h
+++ b/paddle/math/tests/test_matrixUtil.h
@@ -30,6 +30,17 @@ void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
   }
 }
 
+void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());  // shape and layout must agree first
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+  ASSERT_EQ(a.getFormat(), b.getFormat());
+  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
+  for (size_t r = 0; r < a.getElementCnt(); ++r) {  // values, not indices
+    ASSERT_FLOAT_EQ(a.getValue()[r], b.getValue()[r]);
+  }
+}
+
 void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
                        const CpuSparseMatrixPtr& b) {
   ASSERT_EQ(a->getWidth(), b->getWidth());
@@ -73,6 +84,36 @@ void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
   }
 }
 
+void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+  // Only entries stored in `a` are compared; other cells of `b` are unchecked.
+  if (a.getFormat() == SPARSE_CSC) {  // CSC: walk the entries of each column
+    int* rows = a.getRows();
+    for (size_t i = 0; i < a.getWidth(); i++) {  // i is the column index
+      for (size_t j = a.getColStartIdx(i); j < a.getColStartIdx(i + 1); j++) {
+        if (a.getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(rows[j], i));
+        } else {  // non-float value type: the dense cell must be 1.0
+          ASSERT_FLOAT_EQ(1.0, b.getElement(rows[j], i));
+        }
+      }
+    }
+  } else {  // every other format is walked row by row via getCols()
+    int* cols = a.getCols();
+    for (size_t i = 0; i < a.getHeight(); i++) {  // i is the row index
+      for (size_t j = a.getRowStartIdx(i); j < a.getRowStartIdx(i + 1); j++) {
+        if (a.getValueType() == FLOAT_VALUE) {
+          ASSERT_FLOAT_EQ(a.getValue()[j], b.getElement(i, cols[j]));
+        } else {  // non-float value type: the dense cell must be 1.0
+          ASSERT_FLOAT_EQ(1.0, b.getElement(i, cols[j]));
+        }
+      }
+    }
+  }
+}
+
 void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
                              const CpuMatrixPtr& b) {
   ASSERT_EQ(a->getWidth(), b->getWidth());
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 65d01a15718ae2bebd4869eff0e5407524bc0e7c..6d9365af2d14673146d9e427138bf6dd5f5b41b6 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -123,46 +123,6 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest,
   }
 }
 
-static void resizeAndCopy(UserDefinedVectorPtr& dest,
-                          const UserDefinedVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    CHECK(!useGpu) << "not implemented";
-    size_t height = src->size();
-    if (!dest) {
-      dest = std::make_shared<std::vector<void*>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin(), height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
-static void resizeAndCopy(UserDefinedVectorPtr& dest,
-                          const UserDefinedVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK(!useGpu) << "not implemented";
-    CHECK_LE((size_t)startPos + copySize, src->size());
-
-    size_t height = copySize;
-    if (!dest) {
-      dest = std::make_shared<std::vector<void*>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin() + startPos, height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-
 static void resizeAndCopy(SVectorPtr& dest,
                           const SVectorPtr& src,
                           bool useGpu,
@@ -223,7 +183,6 @@ void Argument::resizeAndCopyFrom(const Argument& src,
                   false /* useGpu */,
                   stream);
   }
-  resizeAndCopy(udp, src.udp, useGpu, stream);
   resizeAndCopy(strs, src.strs, useGpu, stream);
   frameWidth = src.frameWidth;
   frameHeight = src.frameHeight;
@@ -255,7 +214,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
     resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream);
     resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream);
     resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream);
-    resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream);
     resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
     return copySize;
   } else {
@@ -268,7 +226,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
     resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream);
     resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream);
     resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream);
     resizeAndCopy(sequenceStartPositions,
                   src.sequenceStartPositions,
                   startSeq,
@@ -583,7 +540,7 @@ void Argument::checkSubset() const {
   }
 }
 
-void Argument::degradeSequence(const Argument& input, bool useGpu) {
+void Argument::degradeSequence(const Argument& input) {
   CHECK_EQ(input.hasSubseq(), 1UL);
   size_t numSequences = input.getNumSequences();
   size_t numSubSequences = input.getNumSubSequences();
@@ -602,6 +559,87 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) {
   tgtBuf[numSequences] = numSubSequences;
 }
 
+void Argument::poolSequenceWithStride(const Argument& input,
+                                      size_t stride,
+                                      IVectorPtr* stridePositions,
+                                      bool reversed) {
+  // Example: input.sequenceStartPositions = [0, 9, 14, 17, 30], stride = 5
+  //   -> sequenceStartPositions = [0, 2, 3, 4, 7]
+  //   reversed = false: *stridePositions = [0, 5, 9, 14, 17, 22, 27, 30]
+  //   reversed = true:  *stridePositions = [0, 4, 9, 14, 17, 20, 25, 30]
+  // Each non-empty sequence is cut into ceil(length / stride) windows.
+  CHECK(input.sequenceStartPositions);
+  CHECK_EQ(input.hasSubseq(), 0UL);
+  CHECK_GT(stride, 0UL) << "stride must larger than 0";
+  size_t numSequences = input.getNumSequences();
+  ICpuGpuVector::resizeOrCreate(
+      sequenceStartPositions, numSequences + 1, false);
+  const int* starts = input.sequenceStartPositions->getData(false);
+  int* tgtBuf = sequenceStartPositions->getMutableData(false);
+  // first index of target sequence and stride positions are both 0
+  tgtBuf[0] = 0;
+  std::vector<int> stridePos;
+  for (size_t seqId = 0; seqId < numSequences; ++seqId) {
+    size_t seqLength = starts[seqId + 1] - starts[seqId];
+    if (seqLength == 0) {
+      // empty sequence: no window, so no stride position is recorded
+      tgtBuf[seqId + 1] = tgtBuf[seqId];
+    } else {
+      stridePos.emplace_back(starts[seqId]);
+      int size = (seqLength - 1) / stride + 1;  // exact integer ceil
+      tgtBuf[seqId + 1] = tgtBuf[seqId] + size;
+      for (int i = 0; i < size - 1; ++i) {
+        int cur = reversed ? starts[seqId + 1] - (size - 1 - i) * stride
+                           : stridePos.back() + stride;
+        stridePos.emplace_back(cur);
+      }
+    }
+  }
+  stridePos.emplace_back(starts[numSequences]);
+  int size = stridePos.size();
+  CHECK_EQ(size - 1, tgtBuf[numSequences]);
+  IVector::resizeOrCreate(*stridePositions, size, false);
+  (*stridePositions)->copyFrom(stridePos.data(), size);
+}
+
+void Argument::getValueString(
+    std::unordered_map<std::string, std::string>* out) const {
+  if (value) {  // each field gets an entry in *out only when it is set
+    std::ostringstream os;
+    value->print(os);
+    out->insert({"value", os.str()});
+  }
+  if (ids) {
+    std::ostringstream os;
+    ids->print(os, ids->getSize());
+    out->insert({"ids", os.str()});  // NOTE: the key is "ids" (plural)
+  }
+  if (sequenceStartPositions) {
+    std::ostringstream os;
+    sequenceStartPositions->getVector(false)->print(
+        os, sequenceStartPositions->getSize());
+    out->insert({"sequence pos", os.str()});
+  }
+  if (subSequenceStartPositions) {
+    std::ostringstream os;
+    subSequenceStartPositions->getVector(false)->print(
+        os, subSequenceStartPositions->getSize());
+    out->insert({"sub-sequence pos", os.str()});
+  }
+}
+
+void Argument::printValueString(std::ostream& stream,
+                                const std::string& prefix) const {
+  std::unordered_map<std::string, std::string> out;
+  getValueString(&out);  // holds entries only for the fields that are set
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
+    auto it = out.find(field);  // names must match getValueString()'s keys
+    if (it != out.end()) {
+      stream << prefix << field << ":\n" << it->second;
+    }
+  }
+}
+
 void Argument::subArgFrom(const Argument& input,
                           size_t offset,
                           size_t height,
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index afd2de0202bf0f14ec3d4c5b856455a3488e41f6..91aca98e186aef0ad6b345cf4791ef80c616e3fe 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -24,8 +24,6 @@ limitations under the License. */
 
 namespace paddle {
 
-// vector of user defined pointers
-typedef std::shared_ptr<std::vector<void*>> UserDefinedVectorPtr;
 typedef std::shared_ptr<std::vector<std::string>> SVectorPtr;
 
 struct Argument {
@@ -40,7 +38,6 @@ struct Argument {
         sequenceStartPositions(nullptr),
         subSequenceStartPositions(nullptr),
         cpuSequenceDims(nullptr),
-        udp(nullptr),
         deviceId(-1),
         allCount(0),
         valueCount(0),
@@ -63,7 +60,6 @@ struct Argument {
     sequenceStartPositions = argument.sequenceStartPositions;
     subSequenceStartPositions = argument.subSequenceStartPositions;
     cpuSequenceDims = argument.cpuSequenceDims;
-    udp = argument.udp;
     deviceId = argument.deviceId;
     allCount = argument.allCount;
     frameHeight = argument.frameHeight;
@@ -96,8 +92,6 @@ struct Argument {
   // dimension of sequence, stored only in CPU
   IVectorPtr cpuSequenceDims;
 
-  UserDefinedVectorPtr udp;  // user defined pointer
-
   int deviceId;            // the GPU device id which the argument in
   int allCount;            // the number of output layers using this argument
   mutable int valueCount;  // waiting this member when layer do forward
@@ -137,7 +131,6 @@ struct Argument {
     if (ids) return ids->getSize();
     if (grad) return grad->getHeight();
     if (in) return in->getHeight();
-    if (udp) return udp->size();
     if (strs) return strs->size();
     return 0;
   }
@@ -163,7 +156,7 @@ struct Argument {
                        : sequenceStartPositions->getData(false);
   }
 
-  static inline real sumCosts(const std::vector<Argument>& arguments) {
+  static inline real sum(const std::vector<Argument>& arguments) {
     real cost = 0;
     for (auto& arg : arguments) {
       if (arg.value) {
@@ -296,7 +289,33 @@ struct Argument {
   /*
    sequence has sub-sequence degrades to a sequence.
    */
-  void degradeSequence(const Argument& input, bool useGpu);
+  void degradeSequence(const Argument& input);
+
+  /*
+   After pooling with stride n (n is smaller than sequence length),
+   a long sequence will be shortened.
+   This function is invalid for sequences that have sub-sequences.
+   */
+  void poolSequenceWithStride(const Argument& input,
+                              size_t stride,
+                              IVectorPtr* stridePositions,
+                              bool reversed = false);
+  /**
+   * @brief getValueString returns the argument's outputs as strings. There
+   * are several kinds of output. The keys of the output dictionary are
+   * 'value', 'ids', 'sequence pos', 'sub-sequence pos'.
+   * @param out [out]: the return values.
+   */
+  void getValueString(std::unordered_map<std::string, std::string>* out) const;
+
+  /**
+   * @brief printValueString will print the argument's output in order of
+   * 'value', 'ids', 'sequence pos', 'sub-sequence pos'.
+   * @param stream: Output stream
+   * @param prefix: line prefix for printing.
+   */
+  void printValueString(std::ostream& stream,
+                        const std::string& prefix = "") const;
 };
 
 }  // namespace paddle
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 095019b74f4f667991a0d4c5d5511e371889539f..caa78acd98ea4b35fc69643689cfce23026275e0 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -126,7 +126,7 @@ protected:
 /*
  * AdaDelta Optimization.
  * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
-*/
+ */
 class AdaDeltaParameterOptimizer : public ParameterOptimizer {
 public:
   explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp
deleted file mode 100644
index cea77e5b1787c25ecb9ccd42e948bf90973fd4cb..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParallelParameter.cpp
+++ /dev/null
@@ -1,209 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <fstream>
-#include "paddle/utils/Logging.h"
-
-#include "ParallelParameter.h"
-
-namespace paddle {
-
-UpdateFunction paramUpdateFunctions[UPDATE_TYPE_NUM] = {
-    nullptr,  // &ParallelParameter::singleUpdate,  /* single thread */
-    nullptr,  // &ParallelParameter::controlUpdate,    /* controller thread */
-    &ParallelParameter::majorUpdate, /* major thread */
-    &ParallelParameter::minorUpdate, /* minor thread */
-
-    nullptr,                         /* master */
-    &ParallelParameter::slaveUpdate, /* slave */
-};
-ParallelParameterPtr ParallelParameter::create(TrainerRole role,
-                                               ParameterPtr localParam,
-                                               int asyncCount) {
-  ParallelParameterPtr ptr = nullptr;
-  switch (role) {
-    case TRAINER_ROLE_CONTROL:
-    case TRAINER_ROLE_MAJOR:
-    case TRAINER_ROLE_MINOR:
-      ptr = std::make_shared<SyncParameter>(role, localParam);
-      break;
-    case TRAINER_ROLE_MASTER:
-    case TRAINER_ROLE_SLAVE:
-      ptr = std::make_shared<AsyncParameter>(role, asyncCount, localParam);
-      break;
-    default:
-      LOG(FATAL) << "unknown role " << role << "\n";
-  }
-  return ptr;
-}
-void ParallelParameter::syncUpdate(TrainerRole role, real learnRate) {
-  if (paramUpdateFunctions[role]) {
-    (this->*paramUpdateFunctions[role])(learnRate);
-  }
-}
-
-void SyncParameter::attachControlParam(ParallelParameterPtr controler) {
-  controlParam_ = controler;
-}
-
-void SyncParameter::attachMajorParam(ParallelParameterPtr partner) {
-  majorPartners_.push_back(partner);
-  if (role_ == TRAINER_ROLE_CONTROL) {
-    localParam_->setSharedCount(majorPartners_.size());
-  }
-  // partnerParam_ = partner;
-}
-
-void SyncParameter::attachMinorParam(ParallelParameterPtr partner,
-                                     int deviceId) {
-  minorPartners_.push_back(partner);
-  minorDeviceIds_.push_back(deviceId);
-  // partnerParam_ = partner;
-}
-
-void SyncParameter::waitAllMajorGradReady() {
-  for (size_t i = 0; i < majorPartners_.size(); i++) {
-    majorPartners_[i]->waitGradReady();
-    partnerParam_ = majorPartners_[i]->getLocalParameter();
-    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
-    VectorPtr patnrGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
-    if (FLAGS_use_gpu) hl_set_device(minorDeviceIds_[i]);
-    localGrad->add(*patnrGrad);
-  }
-}
-
-void SyncParameter::synchronizeParamter() {
-  valueSem_->wait();
-  if (role_ == TRAINER_ROLE_MINOR) {
-    /* copy the value from controller */
-    VectorPtr cntrlVec =
-        (controlParam_->getLocalParameter())->getBuf(PARAMETER_VALUE);
-    VectorPtr localVec = localParam_->getBuf(PARAMETER_VALUE);
-    localVec->copyFrom(*cntrlVec);
-
-    /* dispatch the value to major */
-    for (size_t i = 0; i < majorPartners_.size(); i++) {
-      VectorPtr majorVec =
-          (majorPartners_[i]->getLocalParameter())->getBuf(PARAMETER_VALUE);
-      majorVec->copyFrom(*localVec);
-      majorPartners_[i]->postValueReady();
-    }
-  }
-}
-
-void SyncParameter::singleUpdate(real learnRate) {
-  CHECK(role_ == TRAINER_ROLE_SINGLE);
-  localParam_->updateWithGradient(learnRate);
-}
-
-void SyncParameter::controlUpdate(const UpdateCallback &callBack) {
-  CHECK(role_ == TRAINER_ROLE_CONTROL);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-  CHECK(majorPartners_.size());
-
-  /* update */
-  if (callBack) {
-    callBack(localParam_.get());
-    localParam_->clearGradient();
-  }
-
-  for (size_t i = 0; i < minorPartners_.size(); i++) {
-    minorPartners_[i]->postValueReady();
-  }
-}
-
-void SyncParameter::majorUpdate(real learnRate) {
-  (void)learnRate;
-  CHECK(role_ == TRAINER_ROLE_MAJOR);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-  CHECK(minorPartners_.size() && controlParam_);
-
-  /* wait the minor-Gradient is ready */
-  for (size_t i = 0; i < minorPartners_.size(); i++) {
-    minorPartners_[i]->waitGradReady();
-    partnerParam_ = minorPartners_[i]->getLocalParameter();
-    VectorPtr localGrad = localParam_->getBuf(PARAMETER_GRADIENT);
-    VectorPtr minorGrad = partnerParam_->getBuf(PARAMETER_GRADIENT);
-    localGrad->add(*minorGrad);
-  }
-
-  /* notice the controller that the gradient is ready */
-  gradSem_->post();
-}
-
-void SyncParameter::minorUpdate(real learnRate) {
-  (void)learnRate;
-  CHECK(role_ == TRAINER_ROLE_MINOR);
-  CHECK(gradSem_ != NULL && valueSem_ != NULL);
-
-  // notice the major that the gradient is ready
-  gradSem_->post();
-}
-
-AsyncParameter::AsyncParameter(TrainerRole role,
-                               int asyncCount,
-                               ParameterPtr localParam)
-    : ParallelParameter(role, localParam) {
-  asyncCount_ = asyncCount;
-  accumCounter_ = 0;
-  gradientAccum_ = Vector::create(localParam->getSize(), localParam->useGpu());
-  gradientAccum_->zeroMem();
-}
-
-void AsyncParameter::slaveUpdate(real learnRate) {
-  /* increase the accumCounter_ */
-  accumCounter_++;
-
-  /* accumulate the gradient to the buffer */
-  VectorPtr grad = localParam_->getBuf(PARAMETER_GRADIENT);
-  gradientAccum_->add(*grad);
-
-  /* if need to be synchronized with the master */
-  if (accumCounter_ == asyncCount_) {
-    gradSem_->post();
-    // accumCounter_ = 0; NOTICE: the upper-function need to reset the counter
-  } else {  // self update
-    localParam_->updateWithGradient(learnRate);
-  }
-  localParam_->clearGradient();
-}
-
-bool AsyncParameter::masterUpdate(ParallelParameterPtr slaveParam,
-                                  const UpdateCallback &callback) {
-  CHECK(slaveParam && callback);
-
-  /* wait the slave is ready */
-  if (!slaveParam->timeWaitGradReady(5)) {
-    return false;
-  }
-
-  AsyncParameter *asyncParam = dynamic_cast<AsyncParameter *>(slaveParam.get());
-
-  /* get the accum-gradient to update local parameter */
-  VectorPtr slaveVec = asyncParam->getAccum();
-  localParam_->getBuf(PARAMETER_GRADIENT)->copyFrom(*slaveVec);
-  callback(localParam_.get());
-  // slaveVec->zeroMem();
-
-  /* copy the newest parameter-value to the slave */
-  slaveVec = (slaveParam->getLocalParameter())->getBuf(PARAMETER_VALUE);
-  slaveVec->copyFrom(*(localParam_->getBuf(PARAMETER_VALUE)));
-
-  /* release the semphore */
-  slaveParam->postValueReady();
-
-  return true;
-}
-
-}  // namespace paddle
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
deleted file mode 100644
index 2e7c18b8084dc25b9f2f7630390bb4553ac703c9..0000000000000000000000000000000000000000
--- a/paddle/parameter/ParallelParameter.h
+++ /dev/null
@@ -1,244 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stdint.h>
-
-#include <sys/time.h>
-#include <unistd.h>
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "hl_gpu.h"
-#include "paddle/math/Vector.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
-#include "paddle/utils/Common.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
-
-#include "ParameterConfig.pb.h"
-
-namespace paddle {
-
-class ParallelParameter;
-class SyncParameter;
-class AsyncParameter;
-
-typedef std::shared_ptr<ParallelParameter> ParallelParameterPtr;
-
-const int UPDATE_TYPE_NUM = 32;
-
-/**
- * TrainRole denotes the role of current training, different roles have
- * different jobs.
- *
- * control, major, minor are three kinds of role to support mutiple GPUs
- * parallel SGD training. SM on GPU card has two groups, each group
- * consist of a major and a minor.
- *
- * @param    single  single GPU card single thread training.
- *
- *
- * @param    control current parameter updates via control role,
- *                   not participate in real training. control role is
- *                   responsible for merging all major's gradient and
- *                   update parameter value.
- *
- * @param    major   major role paticipates in real training, when local
- *                   gradient is ready, merge its corresponding minor's
- *                   gradient and notify controller: this group's gradient
- *                   is already ready.
- *
- * @param    minor   minor role participates in real training, when local
- *                   gradient is ready, only notify its corresponding major.
- *                   In order to maximum apportion jobs, after controller
- *                   updates the paramemter value, each group's minior
- *                   reponses to dispatch the latest model into local and
- *                   major.
- */
-enum TrainerRole {
-  TRAINER_ROLE_SINGLE,
-  TRAINER_ROLE_CONTROL,
-  TRAINER_ROLE_MAJOR,
-  TRAINER_ROLE_MINOR,
-  TRAINER_ROLE_MASTER,
-  TRAINER_ROLE_SLAVE
-};
-typedef void (ParallelParameter::*UpdateFunction)(real learnRate);
-
-class ParallelParameter {
-public:
-  static ParallelParameterPtr create(TrainerRole role,
-                                     ParameterPtr localParam,
-                                     int asyncCount = 1);
-
-  ParallelParameter(TrainerRole role, ParameterPtr localParam) {
-    role_ = role;
-    gradSem_.reset(new Semaphore(0));
-    valueSem_.reset(new Semaphore(0));
-    localParam_ = localParam;
-  }
-
-  virtual ~ParallelParameter() {}
-
-  ParameterPtr getLocalParameter() { return localParam_; }
-  bool timeWaitGradReady(int sec) {
-    struct timespec ts;
-    ts.tv_nsec = 0;
-    ts.tv_sec = time(NULL) + sec;
-    return gradSem_->timeWait(&ts);
-  }
-  void waitGradReady() { gradSem_->wait(); }
-  void postValueReady() { valueSem_->post(); }
-
-  void syncUpdate(TrainerRole role, real learnRate);
-
-  virtual void synchronizeParamter() = 0;
-
-  /**
-   * for synchronous
-   */
-  virtual void singleUpdate(real learnRate) { (void)learnRate; }
-
-  virtual void controlUpdate(const UpdateCallback& callback) { (void)callback; }
-
-  virtual void majorUpdate(real learnRate) { (void)learnRate; }
-
-  virtual void minorUpdate(real learnRate) { (void)learnRate; }
-
-  /**
-   * for asynchronous
-   */
-  virtual void slaveUpdate(real learnRate) { (void)learnRate; }
-
-protected:
-  TrainerRole role_;
-  ParameterPtr localParam_;
-  std::unique_ptr<Semaphore>
-      gradSem_;  /// wether the local parameter-gradient is ready
-  std::unique_ptr<Semaphore>
-      valueSem_;  /// wether the local parameter-value is updated
-};
-
-/**
- * this class is designed for multi-threading training.
- *
- * "Synchronous" means multiple GPUs calculate 1/4 mini-Batch,
- * but will get only one gradient
- */
-class SyncParameter : public ParallelParameter {
-public:
-  SyncParameter(TrainerRole role, ParameterPtr localParam)
-      : ParallelParameter(role, localParam) {
-    controlParam_ = nullptr;
-    majorPartners_.clear();
-    minorPartners_.clear();
-  }
-  ~SyncParameter() {
-    majorPartners_.clear();
-    minorPartners_.clear();
-  }
-  void attachControlParam(ParallelParameterPtr controler);
-
-  void attachMajorParam(ParallelParameterPtr partner);
-
-  void attachMinorParam(ParallelParameterPtr partner, int deviceId);
-
-  void waitAllMajorGradReady();
-
-  void synchronizeParamter();
-
-  void singleUpdate(real learnRate);
-
-  void controlUpdate(const UpdateCallback& callback);
-
-  void majorUpdate(real learnRate);
-
-  void minorUpdate(real learnRate);
-
-  std::vector<ParallelParameterPtr>& getMajorPartners() {
-    return majorPartners_;
-  }
-
-  std::vector<ParallelParameterPtr>& getMinorPartners() {
-    return minorPartners_;
-  }
-
-private:
-  // The following variables are used in a multithreaded training situation
-  // partnerParam_ is local-parameter's partner
-  // controlParam_ is the controller-thread 's parameter
-  ParameterPtr partnerParam_;
-  std::vector<ParallelParameterPtr> majorPartners_;
-  std::vector<ParallelParameterPtr> minorPartners_;
-  std::vector<int> minorDeviceIds_;
-  ParallelParameterPtr controlParam_;
-};
-
-class AsyncParameter : public ParallelParameter {
-public:
-  AsyncParameter(TrainerRole role, int asyncCount, ParameterPtr localParam);
-
-  void clearCounter() { accumCounter_ = 0; }
-
-  VectorPtr getAccum() { return gradientAccum_; }
-
-  void synchronizeParamter() {
-    if (accumCounter_ == asyncCount_) {
-      valueSem_->wait();
-      clearCounter();
-      gradientAccum_->zeroMem();
-    }
-  }
-
-  /**
-   * When asynchronous training, update strategy including slave and master.
-   *
-   * slave: If in range asyncCount, adopting self-update method.
-   *        If beyond asyncCount, waiting for master to update.
-   */
-  void slaveUpdate(real learnRate);
-
-  /**
-   * When asynchronous training, update strategy including slave and master.
-   *
-   * master: it only polls slaves, do not training data.
-   *         If slave's gradient is ready, fetch it.
-   *         Update master's parameter, then copy it into
-   *         corresponding slave.
-   */
-  bool masterUpdate(ParallelParameterPtr slaveParam,
-                    const UpdateCallback& callback);
-
-private:
-  /**
-   * When asynchronous training, every aysnc trainer needs to
-   * accumulate a number of batch gradient.
-   *
-   * gradientAccum_ is used to save the sum of gradients.
-   */
-  VectorPtr gradientAccum_;
-
-  /// Asynchronous count.
-  int asyncCount_;
-  /// Accumulate counter of current gradients.
-  int accumCounter_;
-};
-
-typedef std::map<std::string, ParallelParameterPtr> ParallelParameterMap;
-
-}  // namespace paddle
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 29d6e20dc16968cdda3e79b66b0c81aaaf303ef4..b8efabbe2a0b54edec64f6cee62b44c76ca7bf10 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -271,55 +271,6 @@ SparsePrefetchRowCpuMatrix* Parameter::getPrefetchMatrix() {
   return nullptr;
 }
 
-void Parameter::updateWithGradient(real learningRate) {
-  sgdUpdate(learningRate * config_.learning_rate(),
-            config_.momentum(),
-            config_.decay_rate(),
-            bufs_[PARAMETER_VALUE].get(),
-            bufs_[PARAMETER_GRADIENT].get(),
-            bufs_[PARAMETER_MOMENTUM].get());
-}
-
-void Parameter::updateWithGradient(real learningRate,
-                                   MatrixPtr gradMat,
-                                   IVectorPtr t0,
-                                   int currentTime,
-                                   bool fini) {
-  SparseRowCpuMatrix* sparseMat =
-      dynamic_cast<SparseRowCpuMatrix*>(gradMat.get());
-  CHECK(sparseMat);
-  CHECK_EQ(config_.momentum(), 0.0f)
-      << "not support momentum in sparse input sgd";
-  bool useL1 = (config_.decay_rate_l1() != 0.0f);
-  sparseMat->sgdUpdate(*bufs_[PARAMETER_VALUE],
-                       *t0,
-                       learningRate * config_.learning_rate(),
-                       currentTime,
-                       useL1 ? config_.decay_rate_l1() : config_.decay_rate(),
-                       useL1,
-                       fini);
-}
-
-void Parameter::updateWithGradient(real learningRate,
-                                   VectorPtr gradVec,
-                                   bool normalUpdate) {
-  if (normalUpdate) {
-    sgdUpdate(learningRate * config_.learning_rate(),
-              config_.momentum(),
-              config_.decay_rate(),
-              bufs_[PARAMETER_VALUE].get(),
-              gradVec.get(),
-              bufs_[PARAMETER_MOMENTUM].get());
-  } else {
-    size_t size = gradVec->getSize();
-    real* mom = bufs_[PARAMETER_MOMENTUM]->getData();
-    real* grad = gradVec->getData();
-    real* value = bufs_[PARAMETER_VALUE]->getData();
-    hl_matrix_add(mom, grad, mom, 1, size, 1.0f, learningRate);
-    hl_matrix_add(value, grad, value, 1, size, 1.0f, learningRate);
-  }
-}
-
 void Parameter::incUpdate(const UpdateCallback& callback) {
   // Static parameter is fixed, and does not need to be updated
   if (isStatic()) {
@@ -375,10 +326,6 @@ bool Parameter::load(const std::string& filename) {
   std::ifstream fs(filename, std::ios_base::binary);
   if (!fs) {
     LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
-    if (isStatic()) {
-      LOG(FATAL) << getName() << " is static but missing, not allowed.";
-      return false;
-    }
     if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
       LOG(FATAL) << getName() << " missing, not allowed.";
       return false;
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 72c8336799133ad3f5855b0c1aa06639179ff70a..36d2b65f3bd1056a4ac6a1029000fe4cce6420ce 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -223,29 +223,6 @@ public:
 
   bool isValueUpdated() const { return updated_; }
 
-  /**
-   * Update bufs_[PARAMETER_VALUE] using bufs_[PARAMETER_GRADIENT]
-   */
-  void updateWithGradient(real learningRate);
-
-  /**
-   * Update bufs_[PARAMETER_VALUE] using sparse row grad matrix.
-   *
-   * @see SparseRowCpuMatrix::sgdUpdate for more information.
-   */
-  void updateWithGradient(real learningRate,
-                          MatrixPtr gradMat,
-                          IVectorPtr t0,
-                          int currentTime,
-                          bool fini = false);
-
-  /**
-   * This function is used to calculate multiple gpus, but only as a candidate
-   */
-  void updateWithGradient(real learningRate,
-                          VectorPtr grad,
-                          bool normalUpdate = true);
-
   /**
    * Save parameter value to a file
    */
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index b230e170c15f1b004c5357fb7d0ad2204d01f44b..6265c828a1a254d01dc975b0155e7ac69df49a31 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -55,7 +55,7 @@ public:
   // between startBatch() and finishBatch(), update() will be called
   // by the trainer multiple times, each time for updating one Parameter
   // with its gradient in PARAMETER_GRADIENT
-  virtual void update(Parameter* para) {
+  void update(Parameter* para) {
     SetDevice setDevice(para->getDeviceId());
     para->updateHook();
     this->updateImpl(para);
diff --git a/paddle/parameter/tests/CMakeLists.txt b/paddle/parameter/tests/CMakeLists.txt
index cab264db8e5000e8eb61830ec07e9f590c103119..181ccdc1f099e8d61a44c1741116abe7afe0f11d 100644
--- a/paddle/parameter/tests/CMakeLists.txt
+++ b/paddle/parameter/tests/CMakeLists.txt
@@ -1 +1,2 @@
 add_simple_unittest(test_common)
+add_simple_unittest(test_argument)
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..81fe4ee397351a013c8616ad08fb8cb4b8dae4d0
--- /dev/null
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -0,0 +1,57 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <paddle/parameter/Argument.h>
+
+using namespace paddle;  // NOLINT
+
+TEST(Argument, poolSequenceWithStride) {
+  Argument input, output;
+  ICpuGpuVector::resizeOrCreate(input.sequenceStartPositions, 5, false);
+  int* inStart = input.sequenceStartPositions->getMutableData(false);
+  inStart[0] = 0;
+  inStart[1] = 9;
+  inStart[2] = 14;
+  inStart[3] = 17;
+  inStart[4] = 30;
+
+  // Expected values for stride 5; the output starts do not depend on reversed.
+  const int strideResult[] = {0, 5, 9, 14, 17, 22, 27, 30};
+  const int strideResultReversed[] = {0, 4, 9, 14, 17, 20, 25, 30};
+  const int outStartResult[] = {0, 2, 3, 4, 7};
+
+  for (auto reversed : {false, true}) {
+    IVectorPtr stridePositions;
+    output.poolSequenceWithStride(
+        input, 5 /* stride */, &stridePositions, reversed);
+    // gtest assertions report a failure instead of aborting like CHECK_EQ.
+    const int* outStart = output.sequenceStartPositions->getData(false);
+    for (int i = 0; i < 5; i++) {
+      EXPECT_EQ(outStartResult[i], outStart[i]);
+    }
+    // Stop this test if the size is wrong so the loop below stays in bounds.
+    ASSERT_EQ(8, static_cast<int>(stridePositions->getSize()));
+    const int* result = reversed ? strideResultReversed : strideResult;
+    for (int i = 0; i < 8; i++) {
+      EXPECT_EQ(result[i], stridePositions->getData()[i]);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index 11d7a147bf749ba2de0772b5efd5f73ab0ccdb1a..667bc451d16aa1436ac5d74dd96edbd70556edd0 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -30,9 +30,6 @@ namespace paddle {
  * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/
  * recvJobQueue_. the second solution use some shared thread pool to manage
  * connections.
- * In addition to pserver, metric learning also uses network to exchange
- * features within multi-machines, so this class just abstracts some basic
- * threads and queue buffer creation for them
  */
 class BaseClient {
 protected:
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index 1c1e1964b8d3fd83c801f3988760a72dfc032e7f..b7f85ea1a6dfda2a37c315ba15c6ca1979cf4131 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -24,13 +24,15 @@ set(PSERVER_SOURCES
     BaseClient.cpp
     ParameterClient2.cpp
     ParameterServer2.cpp
-    SparseParameterDistribution.cpp)
+    SparseParameterDistribution.cpp
+    ParameterServerController.cpp)
 
 set(PSERVER_HEADERS
     BaseClient.h
     ParameterClient2.h
     ParameterServer2.h
-    SparseParameterDistribution.h)
+    SparseParameterDistribution.h
+    ParameterServerController.h)
 
 add_library(paddle_pserver STATIC
     ${PSERVER_SOURCES})
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index 856fa0ad1ab30e3fc554ac96dd3bed71b1548579..19ff40ba7e9584f772043f939bcb31caf666163d 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -29,6 +29,7 @@ limitations under the License. */
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
 
 DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
 DEFINE_double(async_lagged_ratio_min,
@@ -218,7 +219,8 @@ void ParameterServer2::setConfig(const SetConfigRequest& request,
   callback(response);
 
   /// always defined, barrier slowest node function need it.
-  statSet_.reset(new StatSet("ParameterServer" + std::to_string(serverId_)));
+  statSet_.reset(new StatSet("ParameterServer" +
+                             str::to_string(static_cast<int>(serverId_))));
 }
 
 real bufferSum(const std::vector<ParameterServer2::Buffer>& buffers) {
@@ -367,11 +369,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
                                    std::vector<Buffer>* outputBuffers) {
   VLOG(1) << "pserver: addGradient";
 
-/// forwardbackward delta from all trainers
-/// indicate the fluctuation caused by forwardbackward.
-#ifndef PADDLE_METRIC_LEARNING
-  // @TODO(yanfei):
-  // add support tuning forwardbackward balance for metric learning
+  // forwardbackward delta from all trainers
+  // indicate the fluctuation caused by forwardbackward.
   if (!numPassFinishClients_) {
     REGISTER_BARRIER_DELTA_SERVER_SET(
         *statSet_,
@@ -381,7 +380,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request,
         request.forwardbackward_time(),
         isSparseServer_ ? "_sparseUpdater" : "_denseUpdater");
   }
-#endif
 
   {
     /// approximately pure network overhead
diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp
index ffc521f2c143d95ff07c3825e0a746cb31743d9b..845a2c27e242cfbe31679fea6eae13d2b400ec81 100644
--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fstream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "ParameterServer2.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/Flags.h"
+#include "ParameterServerController.h"
 
 using namespace paddle;  // NOLINT
 
 int main(int argc, char** argv) {
   initMain(argc, argv);
 
-  std::vector<std::string> devices;
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
-
-  // round robin to loadbalance RDMA server ENGINE
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-  if (FLAGS_nics.empty()) {
-    pservers.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (FLAGS_rdma_tcp == "rdma") {
-        pservers[i].reset(
-            new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      }
-      CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                 << FLAGS_port + i;
-      LOG(INFO) << "pserver started : " << FLAGS_port + i;
-      pservers[i]->start();
-    }
-  } else {
-    str::split(FLAGS_nics, ',', &devices);
-    pservers.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-        }
-        CHECK(pservers[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server" << devices[j]
-            << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << devices[j] << ":"
-                  << FLAGS_port + i;
-        pservers[i * devices.size() + j]->start();
-      }
-    }
-  }
-
-  for (auto& pserver : pservers) {
-    pserver->join();
-  }
+  std::unique_ptr<ParameterServerController> parameterServerPtr(
+      paddle::ParameterServerController::createFromGflags());
+  parameterServerPtr->start();
+  parameterServerPtr->wait();
 
   return 0;
 }
diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/pserver/ParameterServerController.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d11a2e1acbc0f091901f3854ca99490d89ebe36
--- /dev/null
+++ b/paddle/pserver/ParameterServerController.cpp
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterServerController.h"
+
+namespace paddle {
+
+ParameterServerController::ParameterServerController(
+    const ParameterServerConfig& config) {
+  // Create and init() one ParameterServer2 per port (per port and NIC when
+  // nics is set); rdmaCpu is assigned round robin to load balance RDMA.
+  std::vector<std::string> devices;
+  int rdmaCpu = 0;
+  int onlineCpus = rdma::numCpus();
+  int numPorts = config.ports_num() + config.ports_num_for_sparse();
+  if (config.nics().empty()) {
+    parameterServers_.resize(numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      if (config.rdma_tcp() == "rdma") {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
+        rdmaCpu = rdmaCpu % onlineCpus;
+      } else {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i));
+      }
+      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
+                                             "server on port "
+                                          << config.port() + i;
+    }
+  } else {
+    str::split(config.nics(), ',', &devices);
+    parameterServers_.resize(devices.size() * numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      for (size_t j = 0; j < devices.size(); ++j) {
+        if (config.rdma_tcp() == "rdma") {
+          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
+              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
+          rdmaCpu = rdmaCpu % onlineCpus;
+        } else {
+          parameterServers_[i * devices.size() + j].reset(
+              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
+        }
+        CHECK(parameterServers_[i * devices.size() + j]->init())
+            << "Fail to initialize parameter server with device " << devices[j]
+            << " on port " << config.port() + i;
+      }
+    }
+  }
+}
+
+ParameterServerController::~ParameterServerController() { this->wait(); }
+
+ParameterServerController* ParameterServerController::createFromGflags() {
+  ParameterServerConfig config;
+  // Copy the pserver-related gflags into the ParameterServerConfig message.
+  config.set_nics(FLAGS_nics);
+  config.set_rdma_tcp(FLAGS_rdma_tcp);
+  config.set_port(FLAGS_port);
+  config.set_ports_num(FLAGS_ports_num);
+  config.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
+  // create() returns a raw pointer allocated with new; the caller owns it.
+  return create(config);
+}
+
+ParameterServerController* ParameterServerController::create(
+    const ParameterServerConfig& config) {
+  return new ParameterServerController(config);
+}
+
+void ParameterServerController::start() {
+  // Report how many instances exist, then start them in creation order,
+  // logging the index of each one before its start() call.
+  const size_t serverCount = parameterServers_.size();
+  LOG(INFO) << "number of parameterServer instances: " << serverCount;
+  for (size_t idx = 0; idx < serverCount; ++idx) {
+    LOG(INFO) << "Starting parameterServer[" << idx << "]";
+    parameterServers_[idx]->start();
+  }
+}
+
+void ParameterServerController::wait() {
+  // Join the instances in creation order. NOTE(review): the destructor also
+  // calls wait(); confirm join() tolerates a second call on the same server.
+  for (size_t idx = 0; idx < parameterServers_.size(); ++idx) {
+    LOG(INFO) << "Waiting parameterServer[" << idx << "]";
+    parameterServers_[idx]->join();
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe9bb0b4d02339d0d31d5bc2942932e1f876098b
--- /dev/null
+++ b/paddle/pserver/ParameterServerController.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterServer2.h"
+#include "ParameterServerConfig.pb.h"
+#include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+
+namespace paddle {
+
+/**
+ * @brief ParameterServerController creates, initializes and manages multiple
+ * ParameterServer2 instances. The number of instances is
+ * (ports_num + ports_num_for_sparse) times the number of network devices in
+ * `nics` (one per port when `nics` is empty).
+ */
+class ParameterServerController final {
+public:
+  DISABLE_COPY(ParameterServerController);
+
+  /**
+   * @brief Ctor. Creates and init()s every ParameterServer2 from the config.
+   */
+  explicit ParameterServerController(const ParameterServerConfig& config);
+
+  /**
+   * @brief Dtor. Calls wait(), so it joins all instances before returning.
+   */
+  ~ParameterServerController();
+
+  /**
+   * @brief Builds a ParameterServerConfig from gflags and calls create(). Kept
+   * for compatibility with gflags-based configuration. Caller owns the result.
+   */
+  static ParameterServerController* createFromGflags();
+
+  /**
+   * @brief Creates a ParameterServerController from a ParameterServerConfig,
+   * so that no gflags are needed. All ParameterServer2 instances are
+   * initialized according to the config. The returned raw pointer is
+   * heap-allocated with new; the caller is responsible for deleting it.
+   */
+  static ParameterServerController* create(const ParameterServerConfig& config);
+
+  /**
+   * @brief Starts all ParameterServer2 instances held by this
+   * ParameterServerController.
+   */
+  void start();
+
+  /**
+   * @brief Joins every ParameterServer2 instance held by this
+   * ParameterServerController, one after another.
+   */
+  void wait();
+
+private:
+  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
+};
+
+}  // namespace paddle
diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt
index 64654f67d0c2c82f05a5038fb33b220f3cff0f39..6e8f9c37f64b70921e09241089a5a480fd8ca47f 100644
--- a/paddle/pserver/test/CMakeLists.txt
+++ b/paddle/pserver/test/CMakeLists.txt
@@ -10,9 +10,11 @@ add_test(NAME socket_test
 add_unittest_without_exec(test_ProtoServer
     test_ProtoServer.cpp)
 
-add_test(NAME test_ProtoServer
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
-        ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
+IF(NOT ON_TRAVIS)
+    add_test(NAME test_ProtoServer
+        COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+            ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
+ENDIF(NOT ON_TRAVIS)
 
 # TODO(yuyang18): Run test_ProtoServer when with rdma
 # add_test(NAME test_ProtoServerRDMA
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 9f86ee80f4e5cc99ea3597b3ed37a387578f032a..04236fda2fb62b928b5c06ff38acfd3eb7217b08 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-
 #include <gtest/gtest.h>
-
+#include <memory>
 #include "ParameterService.pb.h"
 #include "paddle/math/Vector.h"
 #include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
 DEFINE_string(server_addr, "127.0.0.1", "Server address");
 DEFINE_int64(dim, 50000000, "Data size");
@@ -162,18 +161,9 @@ TEST(ProtoServer, extended) {
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
-
-  MyServer* server;
-  if (FLAGS_rdma_tcp == "rdma") {
-    server = new MyServer(FLAGS_port, 0);
-  } else {
-    server = new MyServer(FLAGS_port);
-  }
-
-  server->start();
+  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
+  server.start();
   usleep(10000);
 
-  int ret = RUN_ALL_TESTS();
-
-  exit(ret);
+  return RUN_ALL_TESTS();
 }
diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore
index 9e8ad4bf1638a69ab7ef19badfbf867e116548d2..80d1f76fbc05627e21e334af55d63a4a534434c6 100644
--- a/paddle/py_paddle/.gitignore
+++ b/paddle/py_paddle/.gitignore
@@ -1 +1,2 @@
 swig_paddle.py
+_swig_paddle.so
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 981d10afda2671be9e8f0da1a4bee755f7aa9d61..7c6b83541002071d6e9d00c17be97b6ce4bf8528 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -16,38 +16,105 @@ import paddle.trainer.PyDataProvider2 as dp2
 import collections
 import swig_paddle
 import numpy
+import itertools
 
 __all__ = ['DataProviderConverter']
 
 
 class IScanner(object):
+    """
+    The scanner scans the Python objects in two passes and converts them to
+    Paddle's argument.
+
+    In the first pass, `pre_scan` is invoked for every data instance, and then
+    `finish_pre_scan` is invoked with the arguments. The second pass does the
+    same thing, except that the functions are `scan` and `finish_scan`.
+
+    During the first pass, a scanner may count the shape of the input matrix
+    and allocate memory for this argument. It then fills the data into this
+    argument in the second pass.
+    """
+
     def __init__(self, input_type, pos):
         self.input_type = input_type
-        assert isinstance(self.input_type, dp2.InputType)
+        if not isinstance(self.input_type, dp2.InputType):
+            raise ValueError("input type should be dataprovider2.InputType")
         self.pos = pos
+        # data_in_gpu is used to indicate whether to create argument on GPU
+        # or not in GPU mode. Now if using one thread (trainer_count=1),
+        # trainer uses NeuralNetwork which needs to create argument on GPU
+        # before calling forward function. So, set data_in_gpu to True.
+        # Otherwise, trainer uses MultiGradientMachine which will transfer
+        # data from CPU to GPU in the forward function, set data_in_gpu to
+        # False in this case.
+        self.data_in_gpu = swig_paddle.isUsingGpu(
+        ) and swig_paddle.getTrainerCount() == 1
+
+    def pre_scan(self, dat):
+        """
+        First pass scan method. During this method, the scanner could count the
+        data number, and get the total memory size this batch would use.
+
+        :param dat: The python object.
+        """
+        pass
+
+    def finish_pre_scan(self, argument):
+        """
+        Finish first scan pass. Allocate the memory.
+
+        :param argument: Output arguments object.
+        :type argument: swig_paddle.Arguments
+        :return:
+        """
+        pass
 
     def scan(self, dat):
+        """
+        Second pass scan method. Copy the data to arguments.
+
+        :param dat: The python object.
+        """
         pass
 
     def finish_scan(self, argument):
+        """
+        Finish second pass. Finalize the resources, etc.
+
+        :param argument: Output arguments object.
+        :type argument: swig_paddle.Arguments
+        """
         pass
 
 
 class DenseScanner(IScanner):
+    """
+    :type __mat__: numpy.ndarray
+    """
+
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
+        self.__height__ = 0
+
+    def pre_scan(self, dat):
+        self.__height__ += 1
+
+    def finish_pre_scan(self, argument):
+        self.__mat__ = numpy.ndarray(
+            shape=(self.__height__, self.input_type.dim), dtype=numpy.float32)
+        self.__height__ = 0
 
     def scan(self, dat):
-        if self.__mat__ is None:
-            self.__mat__ = numpy.array([dat], dtype='float32')
-        else:
-            self.__mat__ = numpy.append(self.__mat__, [dat], axis=0)
+        self.__mat__[self.__height__] = dat
+        self.__height__ += 1
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
+        if self.__mat__.dtype != numpy.float32:
+            self.__mat__ = self.__mat__.astype(numpy.float32)
+        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True,
+                                                    self.data_in_gpu)
         argument.setSlotValue(self.pos, m)
 
 
@@ -57,7 +124,6 @@ class SparseBinaryScanner(IScanner):
         self.__rows__ = [0]
         self.__cols__ = []
         self.__height__ = 0
-        self.__nnz__ = 0
         self.__value__ = []
 
     def scan(self, dat):
@@ -70,11 +136,13 @@ class SparseBinaryScanner(IScanner):
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createSparse(self.__height__,
-                                            self.input_type.dim,
-                                            len(self.__cols__),
-                                            len(self.__value__) == 0)
+        m = swig_paddle.Matrix.createSparse(
+            self.__height__,
+            self.input_type.dim,
+            len(self.__cols__),
+            len(self.__value__) == 0,
+            False,  # trans
+            False)  # TODO: support GPU
         assert isinstance(m, swig_paddle.Matrix)
         m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
         argument.setSlotValue(self.pos, m)
@@ -92,13 +160,22 @@ class SparseFloatScanner(SparseBinaryScanner):
 class IndexScanner(IScanner):
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
-        self.__ids__ = []
+        self.__ids__ = None
+        self.__idx__ = 0
+
+    def pre_scan(self, dat):
+        self.__idx__ += 1
+
+    def finish_pre_scan(self, argument):
+        self.__ids__ = [0] * self.__idx__
+        self.__idx__ = 0
 
     def scan(self, dat):
-        self.__ids__.append(dat)
+        self.__ids__[self.__idx__] = dat
+        self.__idx__ += 1
 
     def finish_scan(self, argument):
-        ids = swig_paddle.IVector.create(self.__ids__)
+        ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
         assert isinstance(argument, swig_paddle.Arguments)
         argument.setSlotIds(self.pos, ids)
 
@@ -110,6 +187,13 @@ class SequenceScanner(IScanner):
         self.__inner_scanner__ = inner_scanner
         self.__setter__ = setter
 
+    def pre_scan(self, dat):
+        for each in dat:
+            self.__inner_scanner__.pre_scan(each)
+
+    def finish_pre_scan(self, argument):
+        self.__inner_scanner__.finish_pre_scan(argument)
+
     def scan(self, dat):
         self.__seq__.append(self.__seq__[-1] + self.get_size(dat))
         for each in dat:
@@ -146,7 +230,14 @@ class DataProviderConverter(object):
         ]
 
         for each_sample in dat:
-            for each_step, scanner in zip(each_sample, scanners):
+            for each_step, scanner in itertools.izip(each_sample, scanners):
+                scanner.pre_scan(each_step)
+
+        for scanner in scanners:
+            scanner.finish_pre_scan(argument)
+
+        for each_sample in dat:
+            for each_step, scanner in itertools.izip(each_sample, scanners):
                 scanner.scan(each_step)
 
         for scanner in scanners:
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
index ce105d249aaf3e838443d3e0cf5996fe8c783a22..3ae8dbf964c68c6f01ba30cb3ac69fb6c2f08c30 100644
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
@@ -83,13 +83,17 @@ def __arguments_to_numpy__(i, arg):
     assert isinstance(arg, swig_paddle.Arguments)
     value = arg.getSlotValue(i)
     ids = arg.getSlotIds(i)
+    prob = arg.getSlotIn(i)
     if value is not None:
         assert isinstance(value, swig_paddle.Matrix)
         value = value.copyToNumpyMat()
     if ids is not None:
         assert isinstance(ids, swig_paddle.IVector)
         ids = ids.copyToNumpyArray()
-    return {"value": value, "id": ids}
+    if prob is not None:
+        assert isinstance(prob, swig_paddle.Matrix)
+        prob = prob.copyToNumpyMat()
+    return {"value": value, "id": ids, "prob": prob}
 
 
 def __monkeypatch_gradient_machine__():
@@ -195,6 +199,12 @@ def __monkeypatch_gradient_machine__():
 
     swig_paddle.GradientMachine.getParameters = getParameters
 
+    def getNonStaticParameters(self):
+        return (self.getNonStaticParameter(i)
+                for i in xrange(self.getNonStaticParameterSize()))
+
+    swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters
+
     def getLayerOutputs(self, layerNames):
         """
         getLayerOutputs. get outputs of layers and return a numpy matrix dict.
@@ -208,7 +218,7 @@ def __monkeypatch_gradient_machine__():
 
         output = dict()
         for name in layerNames:
-            output[name] = __matrix_to_numpy__(self.getLayerOutput(name))
+            output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name))
         return output
 
     swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index 1bae396a18688cd53e164774df07660ccc2451d7..66a46e1883a49d491f0cb3056a7039407d72e337 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in
     submit_local.sh
     @ONLY)
 
-
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
             GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
         RENAME paddle)
+
+configure_file(tools/usage_stat/usage.sh
+    usage.sh
+    @ONLY)
+
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh DESTINATION opt/paddle/bin
+        PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+            GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
+        RENAME paddle_usage)
diff --git a/paddle/scripts/cpplint.py b/paddle/scripts/cpplint.py
index 157ce7b44ac3cfe3a8ca5eda78e959cf7be4cc5b..dff4339ea33b72e22104a56183e3302067dc583d 100644
--- a/paddle/scripts/cpplint.py
+++ b/paddle/scripts/cpplint.py
@@ -58,6 +58,7 @@ _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
                    [--linelength=digits]
+                   [--write-success=success_status_file]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
@@ -499,6 +500,8 @@ _line_length = 80
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+_write_success = None
+
 
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
     """Updates the global list of error-suppressions.
@@ -6337,7 +6340,7 @@ def ParseArguments(args):
     try:
         (opts, filenames) = getopt.getopt(args, '', [
             'help', 'output=', 'verbose=', 'counting=', 'filter=', 'root=',
-            'linelength=', 'extensions='
+            'linelength=', 'extensions=', 'write-success='
         ])
     except getopt.GetoptError:
         PrintUsage('Invalid arguments.')
@@ -6382,6 +6385,9 @@ def ParseArguments(args):
                 _valid_extensions = set(val.split(','))
             except ValueError:
                 PrintUsage('Extensions must be comma seperated list.')
+        elif opt == '--write-success':
+            global _write_success
+            _write_success = val
 
     if not filenames:
         PrintUsage('No files were specified.')
@@ -6408,6 +6414,10 @@ def main():
         ProcessFile(filename, _cpplint_state.verbose_level)
     _cpplint_state.PrintErrorCounts()
 
+    if _cpplint_state.error_count == 0 and _write_success is not None:
+        with open(_write_success, 'a'):
+            os.utime(_write_success, None)
+
     sys.exit(_cpplint_state.error_count > 0)
 
 
diff --git a/paddle/scripts/deb/build_scripts/.gitignore b/paddle/scripts/deb/build_scripts/.gitignore
deleted file mode 100644
index 1521c8b7652b1eec8ed4fe50877aae880c758ee3..0000000000000000000000000000000000000000
--- a/paddle/scripts/deb/build_scripts/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-dist
diff --git a/paddle/scripts/deb/build_scripts/Dockerfile b/paddle/scripts/deb/build_scripts/Dockerfile
deleted file mode 100644
index db365a65b7d33429dc1260b2ce69d6dc46abe487..0000000000000000000000000000000000000000
--- a/paddle/scripts/deb/build_scripts/Dockerfile
+++ /dev/null
@@ -1,5 +0,0 @@
-FROM paddledev/paddle:gpu-latest
-MAINTAINER PaddlePaddle Dev Team <paddle-dev@baidu.com>
-COPY build.sh /root/
-CMD cd /root/ && bash build.sh
-
diff --git a/paddle/scripts/deb/build_scripts/build.sh b/paddle/scripts/deb/build_scripts/build.sh
deleted file mode 100755
index d13dea514841b110c304b8aa0e65ad16e42c75f3..0000000000000000000000000000000000000000
--- a/paddle/scripts/deb/build_scripts/build.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-set -e
-apt-get update
-apt-get install -y dh-make
-cd ~
-mkdir -p ~/dist/gpu
-mkdir -p ~/dist/cpu
-mkdir -p ~/dist/cpu-noavx
-mkdir -p ~/dist/gpu-noavx
-cd paddle
-mkdir build
-cd build
-cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=ON
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/cpu
-
-rm -rf *
-cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=ON -DCUDNN_ROOT=/usr/
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/gpu
-
-
-rm -rf *
-cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=OFF
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/cpu-noavx
-
-rm -rf *
-cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/
-make -j `nproc`
-cpack -D CPACK_GENERATOR='DEB' ..
-mv *.deb ~/dist/gpu-noavx
diff --git a/paddle/scripts/deb/build_scripts/build_deb.sh b/paddle/scripts/deb/build_scripts/build_deb.sh
deleted file mode 100755
index c38c6299f840345b7f6f6e0aad7482241d36198a..0000000000000000000000000000000000000000
--- a/paddle/scripts/deb/build_scripts/build_deb.sh
+++ /dev/null
@@ -1,8 +0,0 @@
-#!/bin/bash
-set -e
-docker build -t build_paddle_deb .
-rm -rf dist
-mkdir -p dist
-docker run -v$PWD/dist:/root/dist -v $PWD/../../../..:/root/paddle --name tmp_build_deb_container build_paddle_deb
-docker rm tmp_build_deb_container
-docker rmi build_paddle_deb
diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst
index 1d2dd3171a132966832d87ae758d4e620475aed1..91620b1ee7569cd17927f44112dfa9279ddbdd32 100644
--- a/paddle/scripts/deb/postinst
+++ b/paddle/scripts/deb/postinst
@@ -3,5 +3,4 @@ set -e
 echo "Post install paddle debian package."
 echo "Install some python package used for paddle. You can run "
 echo "  pip install /usr/opt/paddle/share/wheels/*.whl to install them."
-pip install /usr/opt/paddle/share/wheels/*.whl
-
+find /usr/ -name '*paddle*.whl' -print0 | xargs -0 pip install  # NUL-delimited so paths containing whitespace stay intact
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
deleted file mode 100644
index 1522be023f6de32f86fc8a367867bbe2f1c9aeb6..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile
+++ /dev/null
@@ -1,55 +0,0 @@
-FROM ubuntu:14.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG DEBIAN_FRONTEND=noninteractive
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark jupyter
-
-ARG WITH_AVX
-ARG WITH_DOC
-ARG WITH_SWIG_PY
-ARG WITH_STYLE_CHECK
-
-ENV WITH_GPU=OFF
-ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-
-RUN mkdir /paddle
-COPY . /paddle/
-RUN /paddle/paddle/scripts/docker/build.sh
-VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
-
-RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
-RUN paddle version  # print version after build
-
-# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
-
-# Jupyter Notebook directory.
-RUN mkdir /notes/
-WORKDIR "/notes"
-EXPOSE 8888
-
-RUN mkdir -p /opt/bin
-COPY ./paddle/scripts/docker/entrypoint /opt/bin/
-
-CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
deleted file mode 100644
index 09f07043e2172319de257cc952fb81ba53ce89a5..0000000000000000000000000000000000000000
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ /dev/null
@@ -1,55 +0,0 @@
-FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
-MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-
-ARG DEBIAN_FRONTEND=noninteractive
-ARG UBUNTU_MIRROR
-RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark jupyter
-
-ARG WITH_AVX
-ARG WITH_DOC
-ARG WITH_SWIG_PY
-ARG WITH_STYLE_CHECK
-
-ENV WITH_GPU=ON
-ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
-ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
-
-RUN mkdir /paddle
-COPY . /paddle/
-RUN /paddle/paddle/scripts/docker/build.sh
-VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
-
-RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
-RUN paddle version  # print version after build
-
-# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-EXPOSE 22
-
-# Jupyter Notebook directory.
-RUN mkdir /notes/
-WORKDIR "/notes"
-EXPOSE 8888
-
-RUN mkdir -p /opt/bin
-COPY ./paddle/scripts/docker/entrypoint /opt/bin/
-
-CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..76bc30e59b869d705b6188592b2983ed01114046
--- /dev/null
+++ b/paddle/scripts/docker/README.md
@@ -0,0 +1,183 @@
+# Building PaddlePaddle
+
+## Goals
+
+We want the building procedure to generate Docker images so that we can run PaddlePaddle applications on Kubernetes clusters.
+
+We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker.
+
+We want to minimize the size of generated Docker images and .deb packages so as to reduce the download time.
+
+We want to encapsulate building tools and dependencies in a *development* Docker image so as to ease the installation of tools for developers.
+
+Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container.
+
+We want the procedure and tools to also work with testing, continuous integration, and releasing.
+
+
+## Docker Images
+
+So we need two Docker images for each version of PaddlePaddle:
+
+1. `paddle:<version>-dev`
+
+   This is a development image that contains only the development tools and standardizes the building procedure.  Users include:
+
+   - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer).
+   - release engineers -- use this to build the official release from a certain branch/tag on GitHub.com.
+   - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code.  We need tools to extract the information, typeset, and generate Web pages.
+
+   Of course, developers can install building tools on their development computers.  But different versions of PaddlePaddle might require different sets or versions of building tools.  Also, it makes collaborative debugging easier if all developers use a unified development environment.
+
+   The development image should include the following tools:
+
+   - gcc/clang
+   - nvcc
+   - Python
+   - sphinx
+   - woboq
+   - sshd
+
+   Many developers work on a remote computer with GPU; they could ssh into the computer and  `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly.
+
+1. `paddle:<version>`
+
+   This is the production image, generated using the development image. This image might have multiple variants:
+
+   - GPU/AVX   `paddle:<version>-gpu`
+   - GPU/no-AVX  `paddle:<version>-gpu-noavx`
+   - no-GPU/AVX  `paddle:<version>`
+   - no-GPU/no-AVX  `paddle:<version>-noavx`
+
+   We allow users to choose between GPU and no-GPU because the GPU version image is much larger than the no-GPU version.
+
+   We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs.
+
+
+## Development Environment
+
+Here we describe how to use the above two images.  We start by considering our daily development environment.
+
+Developers work on a computer, which is usually a laptop or desktop:
+
+<img src="doc/paddle-development-environment.png" width=500 />
+
+or, they might rely on a more sophisticated box (like with GPUs):
+
+<img src="doc/paddle-development-environment-gpu.png" width=500 />
+
+A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion.
+
+
+## Usages
+
+### Build the Development Docker Image
+
+The following commands check out the source code to the host and build the development image `paddle:dev`:
+
+```bash
+git clone https://github.com/PaddlePaddle/Paddle paddle
+cd paddle
+docker build -t paddle:dev .
+```
+
+The `docker build` command assumes that `Dockerfile` is in the root source tree.  Note that in this design, this `Dockerfile` is the only one in our repo.
+
+Users can specify a Ubuntu mirror server for faster downloading:
+
+```bash
+docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt .
+```
+
+### Build PaddlePaddle from Source Code
+
+Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
+
+```bash
+docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=OFF" -e "RUN_TEST=OFF" paddle:dev
+```
+
+This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it actually writes to `$PWD/build` on the host.
+
+`build.sh` builds the following:
+
+- PaddlePaddle binaries,
+- `$PWD/build/paddle-<version>.deb` for production installation, and
+- `$PWD/build/Dockerfile`, which builds the production Docker image.
+
+Users can specify the following environment variables (passed with `docker run -e`) with either an "ON" or "OFF" value:
+- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
+- `WITH_AVX`: ***Required***. Setting it to "OFF" prevents AVX instructions from being generated. If you don't know what AVX is, you might want to set it to "ON".
+- `WITH_TESTING`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run these tests manually with the following command:
+  ```bash
+    docker run --rm -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coverall"
+  ```
+- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. You can't run unit tests without building them.
+
+### Build the Production Docker Image
+
+The following command builds the production image:
+
+```bash
+docker build -t paddle -f build/Dockerfile ./build
+```
+
+This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime.
+
+### Run PaddlePaddle Applications
+
+Again the development happens on the host.  Suppose that we have a simple application program in `a.py`, we can test and run it using the production image:
+
+```bash
+docker run --rm -it -v $PWD:/work paddle /work/a.py
+```
+
+But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with more dependencies installed.
+
+### Build and Run PaddlePaddle Applications
+
+We need a Dockerfile in https://github.com/paddlepaddle/book that builds Docker image `paddlepaddle/book:<version>`, based on the PaddlePaddle production image:
+
+```
+FROM paddlepaddle/paddle:<version>
+RUN pip install -U matplotlib jupyter ...
+COPY . /book
+EXPOSE 8080
+CMD ["jupyter"]
+```
+
+The book image is an example of a PaddlePaddle application image.  We can build it with:
+
+```bash
+git clone https://github.com/paddlepaddle/book
+cd book
+docker build -t book .
+```
+
+### Build and Run Distributed Applications
+
+In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster.  This API needs to build a PaddlePaddle application into a Docker image as above and call kubectl to run it on the cluster.  This API might need to generate a Dockerfile that looks like the one above and call `docker build`.
+
+Of course, we can manually build an application image and launch the job using the kubectl tool:
+
+```bash
+docker build -f some/Dockerfile -t myapp .
+docker tag myapp me/myapp
+docker push me/myapp
+kubectl ...
+```
+
+### Reading source code with woboq codebrowser
+For developers who are interested in the C++ source code, please use -e "WOBOQ=ON" to enable the building of C++ source code into HTML pages using [Woboq codebrowser](https://github.com/woboq/woboq_codebrowser).
+
+- The following command builds PaddlePaddle, generates HTML pages from C++ source code, and writes HTML pages into `$HOME/woboq_out` on the host:
+
+```bash
+docker run -v $PWD:/paddle -v $HOME/woboq_out:/woboq_out -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TESTING=ON" -e "WOBOQ=ON" paddle:dev
+```
+
+- You can open the generated HTML files in your Web browser. Or, if you want to run a Nginx container to serve them for a wider audience, you can run:
+
+```
+docker run -v $HOME/woboq_out:/usr/share/nginx/html -d -p 8080:80 nginx
+```
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
old mode 100755
new mode 100644
index 7edba3dd09cdc594383597ac7cf7913d50e9f6e1..101b44e6c62ecf0b84d65ee7b6e90e64bd7b3272
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -1,49 +1,158 @@
 #!/bin/bash
 
-function abort(){
-    echo "An error occurred. Exiting..." 1>&2
-    exit 1
-}
+set -xe
 
-trap 'abort' 0
-set -e
+# Pick the production base image; an unset WITH_GPU is treated as OFF.
+if [ "${WITH_GPU:-OFF}" == "ON" ]; then
+  BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu16.04"
+else
+  BASE_IMAGE="ubuntu:16.04"
+fi
 
-if [ ${WITH_GPU} == 'ON' ]; then
-  ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
+DOCKERFILE_GPU_ENV=""    # extra ENV line for the generated Dockerfile (GPU builds only)
+DOCKERFILE_CUDNN_DSO=""  # extra RUN line that creates the unversioned libcudnn.so symlink
+if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then  # the escaped \${...} below is expanded by Docker, not by this script
+    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:\${LD_LIBRARY_PATH}"
+    DOCKERFILE_CUDNN_DSO="RUN ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so.5 /usr/lib/x86_64-linux-gnu/libcudnn.so"
 fi
 
-mkdir -p /paddle/build # -p means no error if exists
+mkdir -p /paddle/build
 cd /paddle/build
+
+# build script will not fail if *.deb does not exist
+rm *.deb 2>/dev/null || true
+
+cat <<EOF
+========================================
+Configuring cmake in /paddle/build ...
+      -DCMAKE_BUILD_TYPE=Release
+      -DWITH_DOC=OFF
+      -DWITH_GPU=${WITH_GPU:-OFF}
+      -DWITH_AVX=${WITH_AVX:-OFF}
+      -DWITH_SWIG_PY=ON
+      -DCUDNN_ROOT=/usr/
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
+      -DWITH_TESTING=${WITH_TESTING:-OFF}
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+========================================
+EOF
 cmake .. \
-      -DWITH_DOC=ON \
-      -DWITH_GPU=${WITH_GPU} \
-      -DWITH_AVX=${WITH_AVX} \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_DOC=OFF \
+      -DWITH_GPU=${WITH_GPU:-OFF} \
+      -DWITH_AVX=${WITH_AVX:-OFF} \
       -DWITH_SWIG_PY=ON \
       -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=OFF \
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
+      -DWITH_TESTING=${WITH_TESTING:-OFF} \
       -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+
+cat <<EOF
+========================================
+Building in /paddle/build ...
+   Build unit tests: ${WITH_TESTING:-OFF}
+========================================
+EOF
 make -j `nproc`
+if [ ${WITH_TESTING:-OFF} == "ON" ] && [ ${RUN_TEST:-OFF} == "ON" ] ; then
+    pip uninstall -y py-paddle paddle || true
+    ctest -V
+fi
+
+
+cat <<EOF
+========================================
+Installing ...
+========================================
+EOF
 make install
+pip install /usr/local/opt/paddle/share/wheels/*.whl
+paddle version
 
-# Install woboq_codebrowser.
-git clone https://github.com/woboq/woboq_codebrowser /woboq
-cd /woboq
-cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-      -DCMAKE_BUILD_TYPE=Release \
-      .
-make
-
-export WOBOQ_OUT=/usr/share/nginx/html/paddle
-export BUILD_DIR=/paddle/build
-mkdir -p $WOBOQ_OUT
-cp -rv /woboq/data $WOBOQ_OUT/../data
-/woboq/generator/codebrowser_generator \
-    -b /paddle/build \
-    -a \
-    -o $WOBOQ_OUT \
-    -p paddle:/paddle
-/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-cd /woboq
-make clean
-rm -rf /paddle/build
-trap : 0
+
+# Documentation is built in a second cmake tree (/paddle/build_doc) after
+# PaddlePaddle has been installed.  This awkwardness is due to
+# https://github.com/PaddlePaddle/Paddle/issues/1854, which also describes
+# a solution.  An unset WITH_DOC is treated as OFF.
+if [ "${WITH_DOC:-OFF}" == "ON" ]; then
+    cat <<EOF
+========================================
+Building documentation ...
+   In /paddle/build_doc
+========================================
+EOF
+    mkdir -p /paddle/build_doc
+    pushd /paddle/build_doc
+    cmake .. \
+          -DWITH_DOC=ON \
+          -DWITH_GPU=OFF \
+          -DWITH_AVX=${WITH_AVX:-ON} \
+          -DWITH_SWIG_PY=ON \
+          -DWITH_STYLE_CHECK=OFF
+    make paddle_docs paddle_docs_cn
+    popd
+fi
+
+
+if [[ ${WOBOQ:-OFF} == 'ON' ]]; then  # opt-in step: skipped unless WOBOQ=ON
+    cat <<EOF
+========================================
+Converting C++ source code into HTML ...
+========================================
+EOF
+    export WOBOQ_OUT=/paddle/build/woboq_out  # NOTE(review): README mounts /woboq_out, but output goes here -- confirm
+    mkdir -p $WOBOQ_OUT
+    cp -rv /woboq/data $WOBOQ_OUT/../data  # resolves to /paddle/build/data, a sibling of the output dir
+    /woboq/generator/codebrowser_generator \
+        -b /paddle/build \
+        -a \
+        -o $WOBOQ_OUT \
+        -p paddle:/paddle
+    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT  # second pass over the generated output directory
+fi
+
+# generate deb package for current build
+# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
+cat <<EOF
+========================================
+Generating .deb package ...
+========================================
+EOF
+cpack -D CPACK_GENERATOR='DEB' ..
+
+
+cat <<EOF
+========================================
+Generate /paddle/build/Dockerfile ...
+========================================
+EOF
+
+cat > /paddle/build/Dockerfile <<EOF
+FROM ${BASE_IMAGE}
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+ENV HOME /root
+ENV LANG en_US.UTF-8
+# Use Fix locales to en_US.UTF-8
+EOF
+
+if [[ -n ${APT_MIRROR} ]]; then
+cat >> /paddle/build/Dockerfile <<EOF
+RUN sed -i '${APT_MIRROR}' /etc/apt/sources.list
+EOF
+fi
+
+cat >> /paddle/build/Dockerfile <<EOF
+# Use different deb file when building different type of images
+ADD *.deb /
+# run paddle version to install python packages first
+RUN apt-get update &&\
+    apt-get install -y python-pip && pip install -U pip && \
+    dpkg -i /*.deb ; apt-get install -f -y && \
+    apt-get clean -y && \
+    rm -f /*.deb && \
+    paddle version
+${DOCKERFILE_CUDNN_DSO}
+${DOCKERFILE_GPU_ENV}
+# default command shows the paddle version and exit
+CMD ["paddle", "version"]
+EOF
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..4629f9b9da7ababdafa0b964db18a98a819c6a9e
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.png b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..61a96d7198d013f08f0f9c269cc352da5f7dd2e9
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.graffle b/paddle/scripts/docker/doc/paddle-development-environment.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..5b164c4832809de94ead7309af49c579135d7f48
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.graffle differ
diff --git a/paddle/scripts/docker/doc/paddle-development-environment.png b/paddle/scripts/docker/doc/paddle-development-environment.png
new file mode 100644
index 0000000000000000000000000000000000000000..707ed45a335a981c23b3533984045f53848b55e2
Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.png differ
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
index 87083467f50acd689ce57b86951f5f7a03c6a58b..bc194bd909aa308fd5fe920c9319f62a0ec2dac7 100755
--- a/paddle/scripts/docker/entrypoint
+++ b/paddle/scripts/docker/entrypoint
@@ -1,8 +1,4 @@
 #!/bin/bash
-LOG=/var/log/all
 
-touch $LOG
-
-/usr/sbin/sshd -D >> $LOG &
-jupyter notebook --ip=0.0.0.0 /notes/ >> $LOG &
-tail -f $LOG
+/usr/sbin/sshd -D &  # sshd is started as a background job of this script
+jupyter notebook --ip=0.0.0.0 /paddle/book/  # runs in the foreground, bound to all interfaces
diff --git a/paddle/scripts/docker/root/.bashrc b/paddle/scripts/docker/root/.bashrc
new file mode 100755
index 0000000000000000000000000000000000000000..4b3024e4e81a0fa206a796c12a8b9d72f1a8f5d9
--- /dev/null
+++ b/paddle/scripts/docker/root/.bashrc
@@ -0,0 +1,46 @@
+# Locales: use en_US.UTF-8 for every locale setting.
+
+export LC_ALL=en_US.UTF-8
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US.UTF-8
+
+# Aliases: make rm/cp/mv interactive (-i) and add ls shortcuts.
+
+alias rm='rm -i'
+alias cp='cp -i'
+alias mv='mv -i'
+
+alias ls='ls -hFG'  # NOTE(review): replaced by the second ls alias further down
+alias l='ls -lF'
+alias ll='ls -alF'
+alias lt='ls -ltrF'
+alias ll='ls -alF'  # NOTE(review): duplicate of the ll alias two lines above
+alias lls='ls -alSrF'
+alias llt='ls -altrF'
+
+# Colorize directory listing (this definition of ls is the one that takes effect).
+
+alias ls="ls -ph --color=auto"
+
+# Colorize grep, but only if this grep accepts --color=auto.
+
+if echo hello|grep --color=auto l >/dev/null 2>&1; then
+  export GREP_OPTIONS="--color=auto" GREP_COLOR="1;31"  # NOTE(review): GREP_OPTIONS is deprecated in newer GNU grep -- confirm
+fi
+
+# Shell: prompt colours and the PS1 prompt string.
+
+export CLICOLOR="1"
+
+YELLOW="\[\033[1;33m\]"
+NO_COLOUR="\[\033[0m\]"
+GREEN="\[\033[1;32m\]"
+WHITE="\[\033[1;37m\]"
+
+source ~/.scripts/git-prompt.sh  # presumably defines __git_ps1, which PS1 below calls -- verify
+
+export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w$YELLOW\$(__git_ps1 \" \[\033[35m\]{\[\033[36m\]%s\[\033[35m\]}\")$NO_COLOUR "
+
+# Git: load command-line completion for git.
+
+source ~/.scripts/git-completion.sh
diff --git a/paddle/scripts/docker/root/.gitconfig b/paddle/scripts/docker/root/.gitconfig
new file mode 100755
index 0000000000000000000000000000000000000000..6c249803a50403b9b79e36a13abe7fe88a35729d
--- /dev/null
+++ b/paddle/scripts/docker/root/.gitconfig
@@ -0,0 +1,43 @@
+[user]
+  name =
+  email =
+
+[alias]
+  st = status --branch --short
+  ci = commit
+  br = branch
+  co = checkout
+  df = diff
+  l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
+  ll = log --stat
+
+[merge]
+  tool = vimdiff
+
+[core]
+  excludesfile = ~/.gitignore
+  editor = vim
+
+[color]
+  branch = auto
+  diff = auto
+  status = auto
+
+[color "branch"]
+  current = yellow reverse
+  local = yellow
+  remote = green
+
+[color "diff"]
+  meta = yellow bold
+  frag = magenta bold
+  old = red bold
+  new = green bold
+
+[color "status"]
+  added = yellow
+  changed = green
+  untracked = cyan
+
+[push]
+  default = matching
\ No newline at end of file
diff --git a/paddle/scripts/docker/root/.scripts/git-completion.sh b/paddle/scripts/docker/root/.scripts/git-completion.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bdddef5ac2faf50b47dd03539dae8912bec8a16c
--- /dev/null
+++ b/paddle/scripts/docker/root/.scripts/git-completion.sh
@@ -0,0 +1,2663 @@
+#!bash
+#
+# bash/zsh completion support for core Git.
+#
+# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
+# Conceptually based on gitcompletion (http://gitweb.hawaga.org.uk/).
+# Distributed under the GNU General Public License, version 2.0.
+#
+# The contained completion routines provide support for completing:
+#
+#    *) local and remote branch names
+#    *) local and remote tag names
+#    *) .git/remotes file names
+#    *) git 'subcommands'
+#    *) tree paths within 'ref:path/to/file' expressions
+#    *) file paths within current working directory and index
+#    *) common --long-options
+#
+# To use these routines:
+#
+#    1) Copy this file to somewhere (e.g. ~/.git-completion.sh).
+#    2) Add the following line to your .bashrc/.zshrc:
+#        source ~/.git-completion.sh
+#    3) Consider changing your PS1 to also show the current branch,
+#       see git-prompt.sh for details.
+
+case "$COMP_WORDBREAKS" in
+*:*) : great ;;
+*)   COMP_WORDBREAKS="$COMP_WORDBREAKS:"
+esac
+
+# __gitdir [<location>]: print the path of the git directory.  With no
+# argument it tries $__git_dir, $GIT_DIR, ./.git, then `git rev-parse --git-dir`.
+__gitdir ()
+{
+  if [ -z "${1-}" ]; then
+    if [ -n "${__git_dir-}" ]; then
+      echo "$__git_dir"
+    elif [ -n "${GIT_DIR-}" ]; then
+      test -d "${GIT_DIR-}" || return 1  # GIT_DIR is set but is not a directory
+      echo "$GIT_DIR"
+    elif [ -d .git ]; then
+      echo .git
+    else
+      git rev-parse --git-dir 2>/dev/null
+    fi
+  elif [ -d "$1/.git" ]; then
+    echo "$1/.git"
+  else
+    echo "$1"  # no .git subdirectory: the argument is echoed back unchanged
+  fi
+}
+
+# The following function is based on code from:
+#
+#   bash_completion - programmable completion functions for bash 3.2+
+#
+#   Copyright © 2006-2008, Ian Macdonald <ian@caliban.org>
+#             © 2009-2010, Bash Completion Maintainers
+#                     <bash-completion-devel@lists.alioth.debian.org>
+#
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2, or (at your option)
+#   any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software Foundation,
+#   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+#   The latest version of this software can be obtained here:
+#
+#   http://bash-completion.alioth.debian.org/
+#
+#   RELEASE: 2.x
+
+# This function can be used to access a tokenized list of words
+# on the command line:
+#
+# __git_reassemble_comp_words_by_ref '=:'
+# if test "${words_[cword_-1]}" = -w
+# then
+#   ...
+# fi
+#
+# The argument should be a collection of characters from the list of
+# word completion separators (COMP_WORDBREAKS) to treat as ordinary
+# characters.
+#
+# This is roughly equivalent to going back in time and setting
+# COMP_WORDBREAKS to exclude those characters.  The intent is to
+# make option types like --date=<type> and <rev>:<path> easy to
+# recognize by treating each shell word as a single token.
+#
+# It is best not to set COMP_WORDBREAKS directly because the value is
+# shared with other completion scripts.  By the time the completion
+# function gets called, COMP_WORDS has already been populated so local
+# changes to COMP_WORDBREAKS have no effect.
+#
+# Output: words_, cword_, cur_.
+
+__git_reassemble_comp_words_by_ref()
+{
+  local exclude i j first
+  # Which word separators to exclude?  Keep only the characters of $1
+  # that actually appear in COMP_WORDBREAKS.
+  exclude="${1//[^$COMP_WORDBREAKS]}"
+  cword_=$COMP_CWORD
+  if [ -z "$exclude" ]; then
+    words_=("${COMP_WORDS[@]}")  # nothing to exclude: copy COMP_WORDS as is
+    return
+  fi
+  # List of word completion separators has shrunk;
+  # re-assemble words to complete.
+  for ((i=0, j=0; i < ${#COMP_WORDS[@]}; i++, j++)); do
+    # Append each nonempty word consisting of just
+    # word separator characters to the current word.
+    first=t
+    while
+      [ $i -gt 0 ] &&
+      [ -n "${COMP_WORDS[$i]}" ] &&
+      # word consists of excluded word separators
+      [ "${COMP_WORDS[$i]//[^$exclude]}" = "${COMP_WORDS[$i]}" ]
+    do
+      # Attach to the previous token,
+      # unless the previous token is the command name.
+      if [ $j -ge 2 ] && [ -n "$first" ]; then
+        ((j--))
+      fi
+      first=  # step back at most once per pass through the outer loop
+      words_[$j]=${words_[j]}${COMP_WORDS[i]}
+      if [ $i = $COMP_CWORD ]; then
+        cword_=$j  # the cursor word now lives in slot j
+      fi
+      if (($i < ${#COMP_WORDS[@]} - 1)); then
+        ((i++))
+      else
+        # Done: that was the last word of COMP_WORDS.
+        return
+      fi
+    done
+    words_[$j]=${words_[j]}${COMP_WORDS[i]}
+    if [ $i = $COMP_CWORD ]; then
+      cword_=$j
+    fi
+  done
+}
+
+if ! type _get_comp_words_by_ref >/dev/null 2>&1; then  # define only when no such command exists yet
+_get_comp_words_by_ref ()
+{
+  local exclude cur_ words_ cword_
+  if [ "$1" = "-n" ]; then  # optional "-n <chars>": separators to treat as ordinary characters
+    exclude=$2
+    shift 2
+  fi
+  __git_reassemble_comp_words_by_ref "$exclude"  # fills words_ and cword_
+  cur_=${words_[cword_]}
+  while [ $# -gt 0 ]; do  # each remaining argument names a variable to set for the caller
+    case "$1" in
+    cur)
+      cur=$cur_
+      ;;
+    prev)
+      prev=${words_[$cword_-1]}
+      ;;
+    words)
+      words=("${words_[@]}")
+      ;;
+    cword)
+      cword=$cword_
+      ;;
+    esac
+    shift
+  done
+}
+fi
+
+__gitcompadd ()
+{
+  local x i=0  # x was previously undeclared and leaked into the interactive shell
+  for x in $1; do  # unquoted on purpose: $1 is split using the IFS in effect
+    if [[ "$x" == "$3"* ]]; then  # keep only candidates that start with $3
+      COMPREPLY[i++]="$2$x$4"  # stored as prefix ($2) + word + suffix ($4)
+    fi
+  done
+}
+
+# Generates completion reply, appending a space to possible completion words,
+# if necessary.
+# It accepts 1 to 4 arguments:
+# 1: List of possible completion words (split on space, tab and newline).
+# 2: A prefix to be added to each possible completion word (optional).
+# 3: Generate possible completion matches for this word (optional, default $cur).
+# 4: A suffix to be appended to each possible completion word (optional).
+__gitcomp ()
+{
+  local cur_="${3-$cur}"
+
+  case "$cur_" in
+  --*=)  # word being completed is "--option=": produce no candidates
+    ;;
+  *)
+    local c i=0 IFS=$' \t\n'
+    for c in $1; do
+      c="$c${4-}"
+      if [[ $c == "$cur_"* ]]; then
+        case $c in
+        --*=*|*.) ;;  # no trailing space after "--opt=..." or a word ending in "."
+        *) c="$c " ;;
+        esac
+        COMPREPLY[i++]="${2-}$c"
+      fi
+    done
+    ;;
+  esac
+}
+
+# Generates completion reply from newline-separated possible completion words
+# by appending a space to all of them.
+# It accepts 1 to 4 arguments:
+# 1: List of possible completion words, separated by a single newline.
+# 2: A prefix to be added to each possible completion word (optional).
+# 3: Generate possible completion matches for this word (optional).
+# 4: A suffix to be appended to each possible completion word instead of
+#    the default space (optional).  If specified but empty, nothing is
+#    appended.
+__gitcomp_nl ()
+{
+  local IFS=$'\n'
+  __gitcompadd "$1" "${2-}" "${3-$cur}" "${4- }"
+}
+
+# Generates completion reply from newline-separated possible completion
+# filenames and asks readline to treat the reply as filenames.
+# It accepts 1 to 3 arguments:
+# 1: List of possible completion filenames, separated by a single newline.
+# 2: A directory prefix to be added to each possible completion filename
+#    (optional).
+# 3: Generate possible completion matches for this word (optional, "$cur").
+__gitcomp_file ()
+{
+  local IFS=$'\n'
+
+  # XXX does not work when the directory prefix contains a tilde,
+  # since tilde expansion is not applied.
+  # This means that COMPREPLY will be empty and Bash default
+  # completion will be used.
+  __gitcompadd "$1" "${2-}" "${3-$cur}" ""
+
+  # enable file mode; when compopt fails, fall back to a compgen hack (bash < 4)
+  compopt -o filenames +o nospace 2>/dev/null ||
+  compgen -f /non-existing-dir/ > /dev/null
+}
+
+# Run 'git ls-files' -- or, when $2 is --committable, 'git diff-index' to
+# find the files that can be committed -- inside the directory $1.  It returns
+# paths relative to $1 and prints nothing if $1 cannot be entered.
+# Any other $2 is passed (unquoted, hence word-split) as ls-files options.
+__git_ls_files_helper ()
+{
+  (
+    test -n "${CDPATH+set}" && unset CDPATH
+    cd -- "$1" || exit    # never list some other directory by accident
+    if [ "$2" = "--committable" ]; then
+      git diff-index --name-only --relative HEAD
+    else
+      # NOTE: $2 is not quoted in order to support multiple options
+      git ls-files --exclude-standard $2
+    fi
+  ) 2>/dev/null
+}
+
+
+# __git_index_files accepts 1 or 2 arguments:
+# 1: Options to pass to ls-files (required).
+# 2: A directory path with a trailing slash (optional, default ".").
+#    Only entries directly inside that directory are printed: anything
+#    deeper is reduced to the name of its first path component, and the
+#    result is sorted with duplicates removed.
+__git_index_files ()
+{
+  local gitdir="$(__gitdir)" base="${2-.}" entry
+
+  [ -d "$gitdir" ] || return 0
+
+  __git_ls_files_helper "$base" "$1" |
+  while read -r entry; do
+    case "$entry" in
+    ?*/*) echo "${entry%%/*}" ;;
+    *)    echo "$entry" ;;
+    esac
+  done | sort -u
+}
+
+# Prints the short name of every ref under refs/heads, one per line.
+# Prints nothing when __gitdir does not yield a directory.
+__git_heads ()
+{
+  local gitdir="$(__gitdir)"
+  [ -d "$gitdir" ] || return 0
+  git --git-dir="$gitdir" for-each-ref --format='%(refname:short)' \
+    refs/heads
+}
+
+# Prints the short name of every ref under refs/tags, one per line.
+# Prints nothing when __gitdir does not yield a directory.
+__git_tags ()
+{
+  local gitdir="$(__gitdir)"
+  [ -d "$gitdir" ] || return 0
+  git --git-dir="$gitdir" for-each-ref --format='%(refname:short)' \
+    refs/tags
+}
+
+# __git_refs accepts 0, 1 (to pass to __gitdir), or 2 arguments.
+# Presence of a non-empty 2nd argument means: also use the guess heuristic
+# employed by checkout for tracking branches.  Prints one ref per line.
+__git_refs ()
+{
+  local i hash dir="$(__gitdir "${1-}")" track="${2-}"
+  local format refs
+  if [ -d "$dir" ]; then  # $dir is a directory: query it with for-each-ref
+    case "$cur" in
+    refs|refs/*)
+      format="refname"
+      refs="${cur%/*}"
+      track=""
+      ;;
+    *)
+      for i in HEAD FETCH_HEAD ORIG_HEAD MERGE_HEAD; do
+        if [ -e "$dir/$i" ]; then echo $i; fi
+      done
+      format="refname:short"
+      refs="refs/tags refs/heads refs/remotes"
+      ;;
+    esac
+    git --git-dir="$dir" for-each-ref --format="%($format)" \
+      $refs
+    if [ -n "$track" ]; then
+      # employ the heuristic used by git checkout
+      # Try to find a remote branch that matches the completion word
+      # but only output if the branch name is unique
+      local ref entry
+      git --git-dir="$dir" for-each-ref --shell --format="ref=%(refname:short)" \
+        "refs/remotes/" | \
+      while read -r entry; do
+        eval "$entry"  # sets $ref from the "ref=..." line produced with --shell
+        ref="${ref#*/}"  # drop everything up to the first slash
+        if [[ "$ref" == "$cur"* ]]; then
+          echo "$ref"
+        fi
+      done | sort | uniq -u  # keep only names that occur exactly once
+    fi
+    return
+  fi
+  case "$cur" in  # $dir is not a directory: it is handed to git as a remote
+  refs|refs/*)
+    git ls-remote "$dir" "$cur*" 2>/dev/null | \
+    while read -r hash i; do
+      case "$i" in
+      *^{}) ;;  # skip entries whose name ends in ^{}
+      *) echo "$i" ;;
+      esac
+    done
+    ;;
+  *)
+    echo "HEAD"
+    git for-each-ref --format="%(refname:short)" -- "refs/remotes/$dir/" | sed -e "s#^$dir/##"
+    ;;
+  esac
+}
+
+# __git_refs2 requires 1 argument (to pass to __git_refs); prints "<ref>:<ref>" per ref
+__git_refs2 ()
+{
+  local i
+  for i in $(__git_refs "$1"); do
+    echo "$i:$i"
+  done
+}
+
+# __git_refs_remotes requires 1 argument (to pass to ls-remote); prints one refspec per remote head
+__git_refs_remotes ()
+{
+  local i hash
+  git ls-remote "$1" 'refs/heads/*' 2>/dev/null | \
+  while read -r hash i; do
+    echo "$i:refs/remotes/$1/${i#refs/heads/}"  # <full ref>:refs/remotes/<$1>/<branch>
+  done
+}
+
+__git_remotes ()
+{
+  local i IFS=$'\n' d="$(__gitdir)"
+  test -d "$d/remotes" && ls -1 "$d/remotes"  # entries of a "remotes" directory, if any
+  for i in $(git --git-dir="$d" config --get-regexp 'remote\..*\.url' 2>/dev/null); do
+    i="${i#remote.}"  # strip the leading "remote."
+    echo "${i/.url*/}"  # strip ".url" and the value after it, leaving the name
+  done
+}
+
+# Prints the strategy list from 'git merge -s help', trimmed of any blanks.
+__git_list_merge_strategies ()
+{
+  git merge -s help 2>&1 | sed -n -e '/[Aa]vailable strategies are: /,/^$/{
+    s/\.$//
+    s/.*://
+    s/^[[:space:]]*//
+    s/[[:space:]]*$//
+    p
+  }'
+}
+
+__git_merge_strategies=
+# Cache of the merge strategy list.  Detection relies on
+# 'git merge -s help', which fails outside of a git working tree;
+# the cache then stays empty and the lookup is simply retried the
+# next time the list is needed.
+__git_compute_merge_strategies ()
+{
+  if [ -z "$__git_merge_strategies" ]; then
+    __git_merge_strategies=$(__git_list_merge_strategies)
+  fi
+}
+
+# Complete "$cur" as a revision, a range (A..B, A...B) or a tree path (REV:path).
+# REV:path candidates come from 'git ls-tree'; files get " ", directories "/".
+__git_complete_revlist_file ()
+{
+  local pfx ls ref cur_="$cur"
+  case "$cur_" in
+  *..?*:*) return ;;
+  ?*:*)
+    ref="${cur_%%:*}"
+    cur_="${cur_#*:}"
+    case "$cur_" in
+    ?*/*)
+      pfx="${cur_%/*}"
+      cur_="${cur_##*/}"
+      ls="$ref:$pfx"
+      pfx="$pfx/"
+      ;;
+    *) ls="$ref" ;;
+    esac
+
+    case "$COMP_WORDBREAKS" in
+    *:*) : great ;;
+    *)   pfx="$ref:$pfx" ;;
+    esac
+
+    # ls-tree separates the path from the metadata with a TAB.  It is spelled
+    # \t in an ANSI-C quoted script so no literal TAB byte is needed here.
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" ls-tree "$ls" 2>/dev/null \
+        | sed $'/^100... blob /{
+                   s,^.*\t,,
+                   s,$, ,
+               }
+               /^120000 blob /{
+                   s,^.*\t,,
+                   s,$, ,
+               }
+               /^040000 tree /{
+                   s,^.*\t,,
+                   s,$,/,
+               }
+               s/^.*\t//')" \
+      "$pfx" "$cur_" ""
+    ;;
+  *...*)
+    pfx="${cur_%...*}..."
+    cur_="${cur_#*...}"
+    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    ;;
+  *..*)
+    pfx="${cur_%..*}.."
+    cur_="${cur_#*..}"
+    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+
+# __git_complete_index_file requires 1 argument:
+# 1: the options to pass to ls-files
+#
+# The exception is --committable, which finds the files appropriate to commit.
+__git_complete_index_file ()
+{
+  local pfx="" cur_="$cur"
+
+  case "$cur_" in
+  ?*/*)
+    pfx="${cur_%/*}"  # directory part of the word being completed
+    cur_="${cur_##*/}"  # what is left after the last slash
+    pfx="${pfx}/"
+    ;;
+  esac
+
+  __gitcomp_file "$(__git_index_files "$1" "$pfx")" "$pfx" "$cur_"
+}
+
+__git_complete_file ()
+{
+  __git_complete_revlist_file  # plain alias: same behaviour under another name
+}
+
+__git_complete_revlist ()
+{
+  __git_complete_revlist_file  # plain alias: same behaviour under another name
+}
+
+__git_complete_remote_or_refspec ()
+{
+  local cur_="$cur" cmd="${words[1]}"
+  local i c=2 remote="" pfx="" lhs=1 no_complete_refspec=0
+  if [ "$cmd" = "remote" ]; then
+    ((c++))
+  fi
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    --mirror) [ "$cmd" = "push" ] && no_complete_refspec=1 ;;
+    --all)
+      case "$cmd" in
+      push) no_complete_refspec=1 ;;
+      fetch)
+        return
+        ;;
+      *) ;;
+      esac
+      ;;
+    -*) ;;
+    *) remote="$i"; break ;;
+    esac
+    ((c++))
+  done
+  if [ -z "$remote" ]; then
+    __gitcomp_nl "$(__git_remotes)"
+    return
+  fi
+  if [ $no_complete_refspec = 1 ]; then
+    return
+  fi
+  [ "$remote" = "." ] && remote=
+  case "$cur_" in
+  *:*)
+    case "$COMP_WORDBREAKS" in
+    *:*) : great ;;
+    *)   pfx="${cur_%%:*}:" ;;
+    esac
+    cur_="${cur_#*:}"
+    lhs=0
+    ;;
+  +*)
+    pfx="+"
+    cur_="${cur_#+}"
+    ;;
+  esac
+  case "$cmd" in
+  fetch)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs2 "$remote")" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    fi
+    ;;
+  pull|remote)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    fi
+    ;;
+  push)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
+    fi
+    ;;
+  esac
+}
+
+# Completes a merge strategy name after "-s", "--strategy" or
+# "--strategy=".  Returns 0 if it produced a reply, 1 otherwise,
+# so callers can write: __git_complete_strategy && return
+__git_complete_strategy ()
+{
+  __git_compute_merge_strategies
+  if [[ "$prev" == "-s" || "$prev" == "--strategy" ]]; then
+    __gitcomp "$__git_merge_strategies"
+    return 0
+  fi
+  if [[ "$cur" == --strategy=* ]]; then
+    __gitcomp "$__git_merge_strategies" "" "${cur##--strategy=}"
+    return 0
+  fi
+  return 1
+}
+
+__git_commands () {
+  # Print the raw command table; GIT_TESTING_COMMAND_COMPLETION overrides it.
+  if test -n "${GIT_TESTING_COMMAND_COMPLETION:-}"; then
+    printf "%s" "${GIT_TESTING_COMMAND_COMPLETION}"
+  else
+    git help -a | grep -E '^  [a-zA-Z0-9]'  # egrep is obsolescent and warns
+  fi
+}
+
+# Prints every word of the __git_commands output that does not contain
+# "--" (those are treated as helper commands), one per line.
+__git_list_all_commands ()
+{
+  local i IFS=" "$'\n'
+  for i in $(__git_commands); do
+    if [[ $i != *--* ]]; then
+      echo $i
+    fi
+  done
+}
+
+__git_all_commands=  # cache, filled by the first call below
+__git_compute_all_commands ()
+{
+  [ -n "$__git_all_commands" ] && return
+  __git_all_commands=$(__git_list_all_commands)
+}
+
+__git_list_porcelain_commands ()
+{
+  local i IFS=" "$'\n'
+  __git_compute_all_commands
+  for i in $__git_all_commands
+  do
+    case $i in
+    *--*)             : helper pattern;;
+    applymbox)        : ask gittus;;
+    applypatch)       : ask gittus;;
+    archimport)       : import;;
+    cat-file)         : plumbing;;
+    check-attr)       : plumbing;;
+    check-ignore)     : plumbing;;
+    check-mailmap)    : plumbing;;
+    check-ref-format) : plumbing;;
+    checkout-index)   : plumbing;;
+    commit-tree)      : plumbing;;
+    count-objects)    : infrequent;;
+    credential-cache) : credentials helper;;
+    credential-store) : credentials helper;;
+    cvsexportcommit)  : export;;
+    cvsimport)        : import;;
+    cvsserver)        : daemon;;
+    daemon)           : daemon;;
+    diff-files)       : plumbing;;
+    diff-index)       : plumbing;;
+    diff-tree)        : plumbing;;
+    fast-import)      : import;;
+    fast-export)      : export;;
+    fsck-objects)     : plumbing;;
+    fetch-pack)       : plumbing;;
+    fmt-merge-msg)    : plumbing;;
+    for-each-ref)     : plumbing;;
+    hash-object)      : plumbing;;
+    http-*)           : transport;;
+    index-pack)       : plumbing;;
+    init-db)          : deprecated;;
+    local-fetch)      : plumbing;;
+    lost-found)       : infrequent;;
+    ls-files)         : plumbing;;
+    ls-remote)        : plumbing;;
+    ls-tree)          : plumbing;;
+    mailinfo)         : plumbing;;
+    mailsplit)        : plumbing;;
+    merge-*)          : plumbing;;
+    mktree)           : plumbing;;
+    mktag)            : plumbing;;
+    pack-objects)     : plumbing;;
+    pack-redundant)   : plumbing;;
+    pack-refs)        : plumbing;;
+    parse-remote)     : plumbing;;
+    patch-id)         : plumbing;;
+    peek-remote)      : plumbing;;
+    prune)            : plumbing;;
+    prune-packed)     : plumbing;;
+    quiltimport)      : import;;
+    read-tree)        : plumbing;;
+    receive-pack)     : plumbing;;
+    remote-*)         : transport;;
+    repo-config)      : deprecated;;
+    rerere)           : plumbing;;
+    rev-list)         : plumbing;;
+    rev-parse)        : plumbing;;
+    runstatus)        : plumbing;;
+    sh-setup)         : internal;;
+    shell)            : daemon;;
+    show-ref)         : plumbing;;
+    send-pack)        : plumbing;;
+    show-index)       : plumbing;;
+    ssh-*)            : transport;;
+    stripspace)       : plumbing;;
+    symbolic-ref)     : plumbing;;
+    tar-tree)         : deprecated;;
+    unpack-file)      : plumbing;;
+    unpack-objects)   : plumbing;;
+    update-index)     : plumbing;;
+    update-ref)       : plumbing;;
+    update-server-info) : daemon;;
+    upload-archive)   : plumbing;;
+    upload-pack)      : plumbing;;
+    write-tree)       : plumbing;;
+    var)              : infrequent;;
+    verify-pack)      : infrequent;;
+    verify-tag)       : plumbing;;
+    *) echo $i;;
+    esac
+  done
+}
+
+__git_porcelain_commands=  # cache, filled by the first call below
+__git_compute_porcelain_commands ()
+{
+  __git_compute_all_commands
+  [ -n "$__git_porcelain_commands" ] && return
+  __git_porcelain_commands=$(__git_list_porcelain_commands)
+}
+
+# Prints the <name> of every "pretty.<name>" configuration entry,
+# one per line (the text after the first space, i.e. the value,
+# is dropped).
+__git_pretty_aliases ()
+{
+  local entry IFS=$'\n'
+  for entry in $(git --git-dir="$(__gitdir)" config --get-regexp "pretty\..*" 2>/dev/null); do
+    [[ $entry == pretty.* ]] || continue
+    entry="${entry#pretty.}"
+    echo "${entry%% *}"
+  done
+}
+
+# Prints the <name> of every "alias.<name>" configuration entry,
+# one per line (the text after the first space, i.e. the value,
+# is dropped).
+__git_aliases ()
+{
+  local entry IFS=$'\n'
+  for entry in $(git --git-dir="$(__gitdir)" config --get-regexp "alias\..*" 2>/dev/null); do
+    [[ $entry == alias.* ]] || continue
+    entry="${entry#alias.}"
+    echo "${entry%% *}"
+  done
+}
+
+# __git_aliased_command requires 1 argument: the alias name (looked up as "alias.$1")
+__git_aliased_command ()
+{
+  local word cmdline=$(git --git-dir="$(__gitdir)" \
+    config --get "alias.$1")
+  for word in $cmdline; do  # print the first word that is not skipped below
+    case "$word" in
+    \!gitk|gitk)
+      echo "gitk"
+      return
+      ;;
+    \!*)  : shell command alias ;;
+    -*) : option ;;
+    *=*)  : setting env ;;
+    git)  : git itself ;;
+    *)
+      echo "$word"
+      return
+    esac
+  done
+}
+
+# __git_find_on_cmdline requires 1 argument: a whitespace-separated list
+# of candidate words.  It prints the first entry of words[1..cword-1]
+# that equals one of the candidates, or nothing if there is none.
+__git_find_on_cmdline ()
+{
+  local typed candidate idx
+  for ((idx = 1; idx < cword; idx++)); do
+    typed="${words[idx]}"
+    for candidate in $1; do
+      [ "$candidate" != "$typed" ] && continue
+      echo "$candidate"
+      return
+    done
+  done
+}
+
+# Returns 0 if a bare "--" appears among words[1..cword-1], i.e.
+# before the word being completed, and 1 otherwise.  Reads the
+# words and cword variables set up by the caller.
+__git_has_doubledash ()
+{
+  local idx
+  for ((idx = 1; idx < cword; idx++)); do
+    [ "${words[idx]}" != "--" ] || return 0
+  done
+  return 1
+}
+
+# Try to count non option arguments passed on the command line for the
+# specified git command.
+# When options are used, it is necessary to use the special -- option to
+# tell the implementation where non-option arguments begin.
+# XXX this can not be improved, since options can appear everywhere, as
+# an example:
+# git mv x -n y
+#
+# __git_count_arguments requires 1 argument: the git command executed.
+__git_count_arguments ()
+{
+  local word i c=0
+
+  # Skip "git" (first argument)
+  for ((i=1; i < ${#words[@]}; i++)); do
+    word="${words[i]}"
+
+    case "$word" in
+      --)
+        # Good; we can assume that the following are only non
+        # option arguments.
+        ((c = 0))
+        ;;
+      "$1")
+        # Skip the specified git command and discard git
+        # main options
+        ((c = 0))
+        ;;
+      ?*)
+        ((c++))
+        ;;
+    esac
+  done
+
+  printf "%d" $c
+}
+
+__git_whitespacelist="nowarn warn error error-all fix"
+
+_git_am ()
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir"/rebase-apply ]; then
+    __gitcomp "--skip --continue --resolved --abort"
+    return
+  fi
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --3way --committer-date-is-author-date --ignore-date
+      --ignore-whitespace --ignore-space-change
+      --interactive --keep --no-utf8 --signoff --utf8
+      --whitespace= --scissors
+      "
+    return
+  esac
+}
+
+_git_apply ()
+{
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --stat --numstat --summary --check --index
+      --cached --index-info --reverse --reject --unidiff-zero
+      --apply --no-add --exclude=
+      --ignore-whitespace --ignore-space-change
+      --whitespace= --inaccurate-eof --verbose
+      "
+    return
+  esac
+}
+
+_git_add ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --interactive --refresh --patch --update --dry-run
+      --ignore-errors --intent-to-add
+      "
+    return
+  esac
+
+  # XXX should we check for --update and --all options ?
+  __git_complete_index_file "--others --modified"
+}
+
+_git_archive ()
+{
+  case "$cur" in
+  --format=*)
+    __gitcomp "$(git archive --list)" "" "${cur##--format=}"
+    return
+    ;;
+  --remote=*)
+    __gitcomp_nl "$(__git_remotes)" "" "${cur##--remote=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --format= --list --verbose
+      --prefix= --remote= --exec=
+      "
+    return
+    ;;
+  esac
+  __git_complete_file
+}
+
+_git_bisect ()
+{
+  __git_has_doubledash && return
+
+  local subcommands="start bad good skip reset visualize replay log run"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    if [ -f "$(__gitdir)"/BISECT_START ]; then
+      __gitcomp "$subcommands"
+    else
+      __gitcomp "replay start"
+    fi
+    return
+  fi
+
+  case "$subcommand" in
+  bad|good|reset|skip|start)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  *)
+    ;;
+  esac
+}
+
+_git_branch ()
+{
+  local i c=1 only_local_ref="n" has_r="n"
+  # Delete/rename options (short, forced and long forms) act on local heads.
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    -d|-D|--delete|-m|-M|--move)  only_local_ref="y" ;;
+    -r) has_r="y" ;;
+    esac
+    ((c++))
+  done
+  # Complete an option value, an option name, or a ref name, in that order.
+  case "$cur" in
+  --set-upstream-to=*)
+    __gitcomp "$(__git_refs)" "" "${cur##--set-upstream-to=}"
+    ;;
+  --*)
+    __gitcomp "
+      --color --no-color --verbose --abbrev= --no-abbrev
+      --track --no-track --contains --merged --no-merged
+      --set-upstream-to= --edit-description --list
+      --unset-upstream
+      "
+    ;;
+  *)
+    if [ $only_local_ref = "y" ] && [ $has_r = "n" ]; then
+      __gitcomp_nl "$(__git_heads)"
+    else
+      __gitcomp_nl "$(__git_refs)"
+    fi
+    ;;
+  esac
+}
+
+_git_bundle ()
+{
+  local cmd="${words[2]}"
+  case "$cword" in
+  2)
+    __gitcomp "create list-heads verify unbundle"
+    ;;
+  3)
+    # looking for a file
+    ;;
+  *)
+    case "$cmd" in
+      create)
+        __git_complete_revlist
+      ;;
+    esac
+    ;;
+  esac
+}
+
+_git_checkout ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --conflict=*)
+    __gitcomp "diff3 merge" "" "${cur##--conflict=}"
+    ;;
+  --*)
+    __gitcomp "
+      --quiet --ours --theirs --track --no-track --merge
+      --conflict= --orphan --patch
+      "
+    ;;
+  *)
+    # check if --track, --no-track, or --no-guess was specified
+    # if so, disable DWIM mode
+    local flags="--track --no-track --no-guess" track=1
+    if [ -n "$(__git_find_on_cmdline "$flags")" ]; then
+      track=''
+    fi
+    __gitcomp_nl "$(__git_refs '' $track)"
+    ;;
+  esac
+}
+
+_git_cherry ()
+{
+  __gitcomp "$(__git_refs)"  # offer every ref printed by __git_refs
+}
+
+_git_cherry_pick ()
+{
+  local dir="$(__gitdir)"
+  if [ -f "$dir"/CHERRY_PICK_HEAD ]; then
+    __gitcomp "--continue --quit --abort"
+    return
+  fi
+  case "$cur" in
+  --*)
+    __gitcomp "--edit --no-commit --signoff --strategy= --mainline"
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+_git_clean ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--dry-run --quiet"
+    return
+    ;;
+  esac
+
+  # XXX should we check for -x option ?
+  __git_complete_index_file "--others"
+}
+
+_git_clone ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --local
+      --no-hardlinks
+      --shared
+      --reference
+      --quiet
+      --no-checkout
+      --bare
+      --mirror
+      --origin
+      --upload-pack
+      --template=
+      --depth
+      --single-branch
+      --branch
+      "
+    return
+    ;;
+  esac
+}
+
+_git_commit ()
+{
+  case "$prev" in
+  -c|-C)
+    __gitcomp_nl "$(__git_refs)" "" "${cur}"
+    return
+    ;;
+  esac
+
+  case "$cur" in
+  --cleanup=*)
+    __gitcomp "default strip verbatim whitespace
+      " "" "${cur##--cleanup=}"
+    return
+    ;;
+  --reuse-message=*|--reedit-message=*|\
+  --fixup=*|--squash=*)
+    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
+    return
+    ;;
+  --untracked-files=*)
+    __gitcomp "all no normal" "" "${cur##--untracked-files=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --all --author= --signoff --verify --no-verify
+      --edit --no-edit
+      --amend --include --only --interactive
+      --dry-run --reuse-message= --reedit-message=
+      --reset-author --file= --message= --template=
+      --cleanup= --untracked-files --untracked-files=
+      --verbose --quiet --fixup= --squash=
+      "
+    return
+  esac
+
+  if git rev-parse --verify --quiet HEAD >/dev/null; then
+    __git_complete_index_file "--committable"
+  else
+    # This is the first commit
+    __git_complete_index_file "--cached"
+  fi
+}
+
+_git_describe ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --all --tags --contains --abbrev= --candidates=
+      --exact-match --debug --long --match --always
+      "
+    return
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+__git_diff_algorithms="myers minimal patience histogram"
+
+__git_diff_common_options="--stat --numstat --shortstat --summary
+      --patch-with-stat --name-only --name-status --color
+      --no-color --color-words --no-renames --check
+      --full-index --binary --abbrev --diff-filter=
+      --find-copies-harder
+      --text --ignore-space-at-eol --ignore-space-change
+      --ignore-all-space --exit-code --quiet --ext-diff
+      --no-ext-diff
+      --no-prefix --src-prefix= --dst-prefix=
+      --inter-hunk-context=
+      --patience --histogram --minimal
+      --raw --word-diff
+      --dirstat --dirstat= --dirstat-by-file
+      --dirstat-by-file= --cumulative
+      --diff-algorithm=
+"
+
+_git_diff ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --diff-algorithm=*)
+    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
+      --base --ours --theirs --no-index
+      $__git_diff_common_options
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+__git_mergetools_common="diffuse ecmerge emerge kdiff3 meld opendiff
+      tkdiff vimdiff gvimdiff xxdiff araxis p4merge bc3 codecompare
+"
+
+_git_difftool ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --tool=*)
+    __gitcomp "$__git_mergetools_common kompare" "" "${cur##--tool=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
+      --base --ours --theirs
+      --no-renames --diff-filter= --find-copies-harder
+      --relative --ignore-submodules
+      --tool="
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+__git_fetch_options="
+  --quiet --verbose --append --upload-pack --force --keep --depth=
+  --tags --no-tags --all --prune --dry-run
+"
+
+_git_fetch ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "$__git_fetch_options"
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+__git_format_patch_options="
+  --stdout --attach --no-attach --thread --thread= --no-thread
+  --numbered --start-number --numbered-files --keep-subject --signoff
+  --signature --no-signature --in-reply-to= --cc= --full-index --binary
+  --not --all --cover-letter --no-prefix --src-prefix= --dst-prefix=
+  --inline --suffix= --ignore-if-in-upstream --subject-prefix=
+  --output-directory --reroll-count --to= --quiet --notes
+"
+
+_git_format_patch ()
+{
+  case "$cur" in
+  --thread=*)
+    __gitcomp "
+      deep shallow
+      " "" "${cur##--thread=}"
+    return
+    ;;
+  --*)
+    __gitcomp "$__git_format_patch_options"
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+_git_fsck ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --tags --root --unreachable --cache --no-reflogs --full
+      --strict --verbose --lost-found
+      "
+    return
+    ;;
+  esac
+}
+
+_git_gc ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--prune --aggressive"
+    return
+    ;;
+  esac
+}
+
+_git_gitk ()
+{
+  _gitk  # delegates to _gitk, which is defined outside this section
+}
+
+__git_match_ctag() {  # print the first field of each line of file $2 whose first field starts with the literal text $1
+  awk -v pfx="${1//\\/\\\\}" 'substr($1, 1, length(pfx)) == pfx { print $1 }' "$2"
+}
+
+_git_grep ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --cached
+      --text --ignore-case --word-regexp --invert-match
+      --full-name --line-number
+      --extended-regexp --basic-regexp --fixed-strings
+      --perl-regexp
+      --files-with-matches --name-only
+      --files-without-match
+      --max-depth
+      --count
+      --and --or --not --all-match
+      "
+    return
+    ;;
+  esac
+
+  case "$cword,$prev" in
+  2,*|*,-*)
+    if test -r tags; then
+      __gitcomp_nl "$(__git_match_ctag "$cur" tags)"
+      return
+    fi
+    ;;
+  esac
+
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_help ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--all --info --man --web"
+    return
+    ;;
+  esac
+  __git_compute_all_commands
+  __gitcomp "$__git_all_commands $(__git_aliases)
+    attributes cli core-tutorial cvs-migration
+    diffcore gitk glossary hooks ignore modules
+    namespaces repository-layout tutorial tutorial-2
+    workflows
+    "
+}
+
+_git_init ()
+{
+  case "$cur" in
+  --shared=*)
+    __gitcomp "
+      false true umask group all world everybody
+      " "" "${cur##--shared=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--quiet --bare --template= --shared --shared="
+    return
+    ;;
+  esac
+}
+
+_git_ls_files ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--cached --deleted --modified --others --ignored
+      --stage --directory --no-empty-directory --unmerged
+      --killed --exclude= --exclude-from=
+      --exclude-per-directory= --exclude-standard
+      --error-unmatch --with-tree= --full-name
+      --abbrev --exclude-per-directory
+      "
+    return
+    ;;
+  esac
+
+  # XXX ignore options like --modified and always suggest all cached
+  # files.  (--ignored used to be listed twice in the option table above.)
+  __git_complete_index_file "--cached"
+}
+
+_git_ls_remote ()
+{
+  __gitcomp_nl "$(__git_remotes)"  # offer the names printed by __git_remotes
+}
+
+_git_ls_tree ()
+{
+  __git_complete_file  # revisions and REV:path, via __git_complete_revlist_file
+}
+
+# Options that go well for log, shortlog and gitk
+__git_log_common_options="
+  --not --all
+  --branches --tags --remotes
+  --first-parent --merges --no-merges
+  --max-count=
+  --max-age= --since= --after=
+  --min-age= --until= --before=
+  --min-parents= --max-parents=
+  --no-min-parents --no-max-parents
+"
+# Options that go well for log and gitk (not shortlog)
+__git_log_gitk_options="
+  --dense --sparse --full-history
+  --simplify-merges --simplify-by-decoration
+  --left-right --notes --no-notes
+"
+# Options that go well for log and shortlog (not gitk)
+__git_log_shortlog_options="
+  --author= --committer= --grep=
+  --all-match
+"
+
+__git_log_pretty_formats="oneline short medium full fuller email raw format:"
+__git_log_date_formats="relative iso8601 rfc2822 short local default raw"
+
+_git_log ()
+{
+  __git_has_doubledash && return
+  # Locate the repository through __gitdir, like the sibling completers do.
+  local g="$(__gitdir)"
+  local merge=""
+  if [ -f "$g/MERGE_HEAD" ]; then
+    merge="--merge"  # only offered while a MERGE_HEAD file exists
+  fi
+  case "$cur" in
+  --pretty=*|--format=*)
+    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
+      " "" "${cur#*=}"
+    return
+    ;;
+  --date=*)
+    __gitcomp "$__git_log_date_formats" "" "${cur##--date=}"
+    return
+    ;;
+  --decorate=*)
+    __gitcomp "long short" "" "${cur##--decorate=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_shortlog_options
+      $__git_log_gitk_options
+      --root --topo-order --date-order --reverse
+      --follow --full-diff
+      --abbrev-commit --abbrev=
+      --relative-date --date=
+      --pretty= --format= --oneline
+      --cherry-pick
+      --graph
+      --decorate --decorate=
+      --walk-reflogs
+      --parents --children
+      $merge
+      $__git_diff_common_options
+      --pickaxe-all --pickaxe-regex
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+__git_merge_options="
+  --no-commit --no-stat --log --no-log --squash --strategy
+  --commit --stat --no-squash --ff --no-ff --ff-only --edit --no-edit
+"
+
+_git_merge ()
+{
+  __git_complete_strategy && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "$__git_merge_options"
+    return
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_mergetool ()
+{
+  case "$cur" in
+  --tool=*)
+    __gitcomp "$__git_mergetools_common tortoisemerge" "" "${cur##--tool=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--tool="
+    return
+    ;;
+  esac
+}
+
+_git_merge_base ()
+{
+  __gitcomp_nl "$(__git_refs)"  # offer every ref printed by __git_refs
+}
+
+_git_mv ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--dry-run"
+    return
+    ;;
+  esac
+
+  if [ $(__git_count_arguments "mv") -gt 0 ]; then
+    # We need to show both cached and untracked files (including
+    # empty directories) since this may not be the last argument.
+    __git_complete_index_file "--cached --others --directory"
+  else
+    __git_complete_index_file "--cached"
+  fi
+}
+
+_git_name_rev ()
+{
+  __gitcomp "--tags --all --stdin"  # fixed option list; no refs are offered
+}
+
+_git_notes ()
+{
+  local subcommands='add append copy edit list prune remove show'
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+
+  case "$subcommand,$cur" in
+  ,--*)
+    __gitcomp '--ref'
+    ;;
+  ,*)
+    case "$prev" in
+    --ref)
+      __gitcomp_nl "$(__git_refs)"
+      ;;
+    *)
+      __gitcomp "$subcommands --ref"
+      ;;
+    esac
+    ;;
+  add,--reuse-message=*|append,--reuse-message=*|\
+  add,--reedit-message=*|append,--reedit-message=*)
+    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
+    ;;
+  add,--*|append,--*)
+    __gitcomp '--file= --message= --reedit-message=
+        --reuse-message='
+    ;;
+  copy,--*)
+    __gitcomp '--stdin'
+    ;;
+  prune,--*)
+    __gitcomp '--dry-run --verbose'
+    ;;
+  prune,*)
+    ;;
+  *)
+    case "$prev" in
+    -m|-F)
+      ;;
+    *)
+      __gitcomp_nl "$(__git_refs)"
+      ;;
+    esac
+    ;;
+  esac
+}
+
+_git_pull ()
+{
+  __git_complete_strategy && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --rebase --no-rebase
+      $__git_merge_options
+      $__git_fetch_options
+    "
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+_git_push ()
+{
+  case "$prev" in
+  --repo)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+  esac
+  case "$cur" in
+  --repo=*)
+    __gitcomp_nl "$(__git_remotes)" "" "${cur##--repo=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --all --mirror --tags --dry-run --force --verbose
+      --receive-pack= --repo= --set-upstream
+    "
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+_git_rebase ()  # completion for the "rebase" command
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir"/rebase-apply ] || [ -d "$dir"/rebase-merge ]; then
+    __gitcomp "--continue --skip --abort"
+    return  # a rebase state directory exists: offer nothing else
+  fi
+  __git_complete_strategy && return
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --onto --merge --strategy --interactive
+      --preserve-merges --stat --no-stat
+      --committer-date-is-author-date --ignore-date
+      --ignore-whitespace --whitespace=
+      --autosquash
+      "
+    return
+    ;;
+  esac
+  __gitcomp_nl "$(__git_refs)"  # anything else: complete from __git_refs
+}
+
+_git_reflog ()
+{
+  # Offer the subcommand names until one is found on the command line;
+  # after that, complete from __git_refs.
+  local subcommands="show delete expire"
+  if [ -n "$(__git_find_on_cmdline "$subcommands")" ]; then
+    __gitcomp_nl "$(__git_refs)"
+  else
+    __gitcomp "$subcommands"
+  fi
+}
+
+# Value lists shared by the _git_send_email and _git_config completions.
+__git_send_email_confirm_options="always never auto cc compose"
+__git_send_email_suppresscc_options="author self cc bodycc sob cccmd body all"
+_git_send_email ()
+{
+  # "--opt=<value>" forms complete the value; other "--" words the option.
+  case "$cur" in
+  --confirm=*)
+    __gitcomp "
+      $__git_send_email_confirm_options
+      " "" "${cur#*=}"
+    return
+    ;;
+  --suppress-cc=*)
+    __gitcomp "
+      $__git_send_email_suppresscc_options
+      " "" "${cur#*=}"
+    return
+    ;;
+  --smtp-encryption=*)
+    __gitcomp "ssl tls" "" "${cur#*=}"
+    return
+    ;;
+  --thread=*)
+    __gitcomp "
+      deep shallow
+      " "" "${cur#*=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--annotate --bcc --cc --cc-cmd --chain-reply-to
+      --compose --confirm= --dry-run --envelope-sender
+      --from --identity
+      --in-reply-to --no-chain-reply-to --no-signed-off-by-cc
+      --no-suppress-from --no-thread --quiet
+      --signed-off-by-cc --smtp-pass --smtp-server
+      --smtp-server-port --smtp-encryption= --smtp-user
+      --subject --suppress-cc= --suppress-from --thread --to
+      --validate --no-validate
+      $__git_format_patch_options"
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+_git_stage ()  # completion for the "stage" command
+{
+  _git_add  # reuse the completion logic of "add" unchanged
+}
+
+__git_config_get_set_variables ()
+{
+  # Walk backwards from the cursor looking for a config-file selector.
+  local prevword word config_file= c=$cword
+  while [ $c -gt 1 ]; do
+    word="${words[c]}"
+    case "$word" in
+    --system|--global|--local|--file=*)
+      config_file="$word"
+      break
+      ;;
+    -f|--file)
+      config_file="$word $prevword"
+      break
+      ;;
+    esac
+    prevword=$word
+    c=$((c - 1))
+  done
+  # Print the name (text before "=") of each "section.key=value" line.
+  git --git-dir="$(__gitdir)" config $config_file --list 2>/dev/null |
+  while read -r line; do
+    case "$line" in
+    *.*=*)
+      echo "${line%%=*}"
+      ;;
+    esac
+  done
+}
+
+_git_config ()  # completion for the "config" command
+{
+  case "$prev" in  # previous word decides what the current word may be
+  branch.*.remote|branch.*.pushremote)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+    ;;
+  branch.*.merge)
+    __gitcomp_nl "$(__git_refs)"
+    return
+    ;;
+  branch.*.rebase)
+    __gitcomp "false true"
+    return
+    ;;
+  remote.pushdefault)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+    ;;
+  remote.*.fetch)
+    local remote="${prev#remote.}"
+    remote="${remote%.fetch}"
+    if [ -z "$cur" ]; then
+      __gitcomp_nl "refs/heads/" "" "" ""
+      return
+    fi
+    __gitcomp_nl "$(__git_refs_remotes "$remote")"
+    return
+    ;;
+  remote.*.push)
+    local remote="${prev#remote.}"
+    remote="${remote%.push}"
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" \
+      for-each-ref --format='%(refname):%(refname)' \
+      refs/heads)"
+    return
+    ;;
+  pull.twohead|pull.octopus)
+    __git_compute_merge_strategies
+    __gitcomp "$__git_merge_strategies"
+    return
+    ;;
+  color.branch|color.diff|color.interactive|\
+  color.showbranch|color.status|color.ui)
+    __gitcomp "always never auto"
+    return
+    ;;
+  color.pager)
+    __gitcomp "false true"
+    return
+    ;;
+  color.*.*)
+    __gitcomp "
+      normal black red green yellow blue magenta cyan white
+      bold dim ul blink reverse
+      "
+    return
+    ;;
+  diff.submodule)
+    __gitcomp "log short"
+    return
+    ;;
+  help.format)
+    __gitcomp "man info web html"
+    return
+    ;;
+  log.date)
+    __gitcomp "$__git_log_date_formats"
+    return
+    ;;
+  sendemail.aliasfiletype)  # same spelling as in the name list below
+    __gitcomp "mutt mailrc pine elm gnus"
+    return
+    ;;
+  sendemail.confirm)
+    __gitcomp "$__git_send_email_confirm_options"
+    return
+    ;;
+  sendemail.suppresscc)
+    __gitcomp "$__git_send_email_suppresscc_options"
+    return
+    ;;
+  --get|--get-all|--unset|--unset-all)  # names of variables already set
+    __gitcomp_nl "$(__git_config_get_set_variables)"
+    return
+    ;;
+  *.*)  # value of any other variable: nothing to offer
+    return
+    ;;
+  esac
+  case "$cur" in  # current word: an option or a partly typed variable name
+  --*)
+    __gitcomp "
+      --system --global --local --file=
+      --list --replace-all
+      --get --get-all --get-regexp
+      --add --unset --unset-all
+      --remove-section --rename-section
+      "
+    return
+    ;;
+  branch.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "remote pushremote merge mergeoptions rebase" "$pfx" "$cur_"
+    return
+    ;;
+  branch.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __gitcomp_nl "$(__git_heads)" "$pfx" "$cur_" "."
+    return
+    ;;
+  guitool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "
+      argprompt cmd confirm needsfile noconsole norescan
+      prompt revprompt revunmerged title
+      " "$pfx" "$cur_"
+    return
+    ;;
+  difftool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path" "$pfx" "$cur_"
+    return
+    ;;
+  man.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path" "$pfx" "$cur_"
+    return
+    ;;
+  mergetool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path trustExitCode" "$pfx" "$cur_"
+    return
+    ;;
+  pager.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __git_compute_all_commands
+    __gitcomp_nl "$__git_all_commands" "$pfx" "$cur_"
+    return
+    ;;
+  remote.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "
+      url proxy fetch push mirror skipDefaultUpdate
+      receivepack uploadpack tagopt pushurl
+      " "$pfx" "$cur_"
+    return
+    ;;
+  remote.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __gitcomp_nl "$(__git_remotes)" "$pfx" "$cur_" "."
+    return
+    ;;
+  url.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "insteadOf pushInsteadOf" "$pfx" "$cur_"
+    return
+    ;;
+  esac  # nothing matched: fall back to the static list of variable names
+  __gitcomp "
+    add.ignoreErrors
+    advice.commitBeforeMerge
+    advice.detachedHead
+    advice.implicitIdentity
+    advice.pushNonFastForward
+    advice.resolveConflict
+    advice.statusHints
+    alias.
+    am.keepcr
+    apply.ignorewhitespace
+    apply.whitespace
+    branch.autosetupmerge
+    branch.autosetuprebase
+    browser.
+    clean.requireForce
+    color.branch
+    color.branch.current
+    color.branch.local
+    color.branch.plain
+    color.branch.remote
+    color.decorate.HEAD
+    color.decorate.branch
+    color.decorate.remoteBranch
+    color.decorate.stash
+    color.decorate.tag
+    color.diff
+    color.diff.commit
+    color.diff.frag
+    color.diff.func
+    color.diff.meta
+    color.diff.new
+    color.diff.old
+    color.diff.plain
+    color.diff.whitespace
+    color.grep
+    color.grep.context
+    color.grep.filename
+    color.grep.function
+    color.grep.linenumber
+    color.grep.match
+    color.grep.selected
+    color.grep.separator
+    color.interactive
+    color.interactive.error
+    color.interactive.header
+    color.interactive.help
+    color.interactive.prompt
+    color.pager
+    color.showbranch
+    color.status
+    color.status.added
+    color.status.changed
+    color.status.header
+    color.status.nobranch
+    color.status.untracked
+    color.status.updated
+    color.ui
+    commit.status
+    commit.template
+    core.abbrev
+    core.askpass
+    core.attributesfile
+    core.autocrlf
+    core.bare
+    core.bigFileThreshold
+    core.compression
+    core.createObject
+    core.deltaBaseCacheLimit
+    core.editor
+    core.eol
+    core.excludesfile
+    core.fileMode
+    core.fsyncobjectfiles
+    core.gitProxy
+    core.ignoreStat
+    core.ignorecase
+    core.logAllRefUpdates
+    core.loosecompression
+    core.notesRef
+    core.packedGitLimit
+    core.packedGitWindowSize
+    core.pager
+    core.preferSymlinkRefs
+    core.preloadindex
+    core.quotepath
+    core.repositoryFormatVersion
+    core.safecrlf
+    core.sharedRepository
+    core.sparseCheckout
+    core.symlinks
+    core.trustctime
+    core.warnAmbiguousRefs
+    core.whitespace
+    core.worktree
+    diff.autorefreshindex
+    diff.external
+    diff.ignoreSubmodules
+    diff.mnemonicprefix
+    diff.noprefix
+    diff.renameLimit
+    diff.renames
+    diff.statGraphWidth
+    diff.submodule
+    diff.suppressBlankEmpty
+    diff.tool
+    diff.wordRegex
+    diff.algorithm
+    difftool.
+    difftool.prompt
+    fetch.recurseSubmodules
+    fetch.unpackLimit
+    format.attach
+    format.cc
+    format.headers
+    format.numbered
+    format.pretty
+    format.signature
+    format.signoff
+    format.subjectprefix
+    format.suffix
+    format.thread
+    format.to
+    gc.
+    gc.aggressiveWindow
+    gc.auto
+    gc.autopacklimit
+    gc.packrefs
+    gc.pruneexpire
+    gc.reflogexpire
+    gc.reflogexpireunreachable
+    gc.rerereresolved
+    gc.rerereunresolved
+    gitcvs.allbinary
+    gitcvs.commitmsgannotation
+    gitcvs.dbTableNamePrefix
+    gitcvs.dbdriver
+    gitcvs.dbname
+    gitcvs.dbpass
+    gitcvs.dbuser
+    gitcvs.enabled
+    gitcvs.logfile
+    gitcvs.usecrlfattr
+    guitool.
+    gui.blamehistoryctx
+    gui.commitmsgwidth
+    gui.copyblamethreshold
+    gui.diffcontext
+    gui.encoding
+    gui.fastcopyblame
+    gui.matchtrackingbranch
+    gui.newbranchtemplate
+    gui.pruneduringfetch
+    gui.spellingdictionary
+    gui.trustmtime
+    help.autocorrect
+    help.browser
+    help.format
+    http.lowSpeedLimit
+    http.lowSpeedTime
+    http.maxRequests
+    http.minSessions
+    http.noEPSV
+    http.postBuffer
+    http.proxy
+    http.sslCAInfo
+    http.sslCAPath
+    http.sslCert
+    http.sslCertPasswordProtected
+    http.sslKey
+    http.sslVerify
+    http.useragent
+    i18n.commitEncoding
+    i18n.logOutputEncoding
+    imap.authMethod
+    imap.folder
+    imap.host
+    imap.pass
+    imap.port
+    imap.preformattedHTML
+    imap.sslverify
+    imap.tunnel
+    imap.user
+    init.templatedir
+    instaweb.browser
+    instaweb.httpd
+    instaweb.local
+    instaweb.modulepath
+    instaweb.port
+    interactive.singlekey
+    log.date
+    log.decorate
+    log.showroot
+    mailmap.file
+    man.
+    man.viewer
+    merge.
+    merge.conflictstyle
+    merge.log
+    merge.renameLimit
+    merge.renormalize
+    merge.stat
+    merge.tool
+    merge.verbosity
+    mergetool.
+    mergetool.keepBackup
+    mergetool.keepTemporaries
+    mergetool.prompt
+    notes.displayRef
+    notes.rewrite.
+    notes.rewrite.amend
+    notes.rewrite.rebase
+    notes.rewriteMode
+    notes.rewriteRef
+    pack.compression
+    pack.deltaCacheLimit
+    pack.deltaCacheSize
+    pack.depth
+    pack.indexVersion
+    pack.packSizeLimit
+    pack.threads
+    pack.window
+    pack.windowMemory
+    pager.
+    pretty.
+    pull.octopus
+    pull.twohead
+    push.default
+    rebase.autosquash
+    rebase.stat
+    receive.autogc
+    receive.denyCurrentBranch
+    receive.denyDeleteCurrent
+    receive.denyDeletes
+    receive.denyNonFastForwards
+    receive.fsckObjects
+    receive.unpackLimit
+    receive.updateserverinfo
+    remote.pushdefault
+    remotes.
+    repack.usedeltabaseoffset
+    rerere.autoupdate
+    rerere.enabled
+    sendemail.
+    sendemail.aliasesfile
+    sendemail.aliasfiletype
+    sendemail.bcc
+    sendemail.cc
+    sendemail.cccmd
+    sendemail.chainreplyto
+    sendemail.confirm
+    sendemail.envelopesender
+    sendemail.from
+    sendemail.identity
+    sendemail.multiedit
+    sendemail.signedoffbycc
+    sendemail.smtpdomain
+    sendemail.smtpencryption
+    sendemail.smtppass
+    sendemail.smtpserver
+    sendemail.smtpserveroption
+    sendemail.smtpserverport
+    sendemail.smtpuser
+    sendemail.suppresscc
+    sendemail.suppressfrom
+    sendemail.thread
+    sendemail.to
+    sendemail.validate
+    showbranch.default
+    status.relativePaths
+    status.showUntrackedFiles
+    status.submodulesummary
+    submodule.
+    tar.umask
+    transfer.unpackLimit
+    url.
+    user.email
+    user.name
+    user.signingkey
+    web.browser
+    branch. remote.
+  "
+}
+
+_git_remote ()
+{
+  # No subcommand typed yet: offer the list of subcommands.
+  local subcommands="add rename remove set-head set-branches set-url show prune update"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"
+    return
+  fi
+
+  case "$subcommand" in
+  rename|remove|set-url|show|prune)
+    __gitcomp_nl "$(__git_remotes)"
+    ;;
+  set-head|set-branches)
+    __git_complete_remote_or_refspec
+    ;;
+  update)
+    # Offer the <group> part of every configured "remotes.<group>" entry.
+    local i c='' IFS=$'\n'
+    for i in $(git --git-dir="$(__gitdir)" config --get-regexp "remotes\..*" 2>/dev/null); do
+      i="${i#remotes.}"
+      c="$c ${i%% *}"
+    done
+    __gitcomp "$c"
+    ;;
+  esac
+}
+
+_git_replace ()  # completion for the "replace" command
+{
+  __gitcomp_nl "$(__git_refs)"  # every word completes from __git_refs
+}
+
+_git_reset ()
+{
+  # Stop when __git_has_doubledash succeeds; otherwise offer the mode
+  # options for a "--" prefix and __git_refs for anything else.
+  __git_has_doubledash && return
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--merge --mixed --hard --soft --patch"
+    return
+  fi
+
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_revert ()
+{
+  # Long options for a "--" prefix, __git_refs for everything else.
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--edit --mainline --no-edit --no-commit --signoff"
+    return
+  fi
+
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_rm ()
+{
+  # Long options for a "--" prefix; otherwise delegate to
+  # __git_complete_index_file with "--cached".
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--cached --dry-run --ignore-unmatch --quiet"
+    return
+  fi
+
+  __git_complete_index_file "--cached"
+}
+
+_git_shortlog ()
+{
+  # Stop when __git_has_doubledash succeeds.
+  __git_has_doubledash && return
+
+  # Log/shortlog options for a "--" prefix, a revision list otherwise.
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_shortlog_options
+      --numbered --summary
+      "
+    return
+  fi
+  __git_complete_revlist
+}
+
+_git_show ()
+{
+  # Stop when __git_has_doubledash succeeds.
+  __git_has_doubledash && return
+
+  # "--opt=<value>" forms complete the value after "="; other "--" words
+  # complete option names; everything else goes to
+  # __git_complete_revlist_file.
+  if [[ "$cur" == --pretty=* || "$cur" == --format=* ]]; then
+    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
+      " "" "${cur#*=}"
+    return
+  elif [[ "$cur" == --diff-algorithm=* ]]; then
+    __gitcomp "$__git_diff_algorithms" "" "${cur#*=}"
+    return
+  elif [[ "$cur" == --* ]]; then
+    __gitcomp "--pretty= --format= --abbrev-commit --oneline
+      $__git_diff_common_options
+      "
+    return
+  fi
+  __git_complete_revlist_file
+}
+
+_git_show_branch ()
+{
+  # Long options for a "--" prefix, a revision list for everything
+  # else.
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      --all --remotes --topo-order --current --more=
+      --list --independent --merge-base --no-name
+      --color --no-color
+      --sha1-name --sparse --topics --reflog
+      "
+    return
+  fi
+  __git_complete_revlist
+}
+
+_git_stash ()
+{
+  local save_opts='--keep-index --no-keep-index --quiet --patch'
+  local subcommands='save list show apply clear drop pop create branch'
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    # No subcommand yet: "--" words get the save options; otherwise list
+    # the subcommands unless a save option is already on the command line.
+    if [[ "$cur" == --* ]]; then
+      __gitcomp "$save_opts"
+    elif [ -z "$(__git_find_on_cmdline "$save_opts")" ]; then
+      __gitcomp "$subcommands"
+    fi
+    return
+  fi
+
+  # A subcommand is present: choose by "<subcommand>,<current word>".
+  # Combinations not listed here complete nothing.
+  case "$subcommand,$cur" in
+  save,--*)
+    __gitcomp "$save_opts"
+    ;;
+  apply,--*|pop,--*)
+    __gitcomp "--index --quiet"
+    ;;
+  show,--*|drop,--*|branch,--*)
+    # Nothing offered; this arm also keeps "--" words from reaching the
+    # stash-name arm below.
+    ;;
+  show,*|apply,*|drop,*|pop,*|branch,*)
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" stash list \
+        | sed -n -e 's/:.*//p')"
+    ;;
+  esac
+}
+
+_git_submodule ()
+{
+  # Stop when __git_has_doubledash succeeds.
+  __git_has_doubledash && return
+
+  # Only the first word is completed: once a subcommand is present on
+  # the command line nothing further is offered.
+  local subcommands="add status init deinit update summary foreach sync"
+  [ -n "$(__git_find_on_cmdline "$subcommands")" ] && return
+
+  # Two fixed options for a "--" prefix, subcommand names otherwise.
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--quiet --cached"
+  else
+    __gitcomp "$subcommands"
+  fi
+}
+
+_git_svn ()  # completion for the "svn" command
+{
+  local subcommands="
+    init fetch clone rebase dcommit log find-rev
+    set-tree commit-diff info create-ignore propget
+    proplist show-ignore show-externals branch tag blame
+    migrate mkdirs reset gc
+    "
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"  # no subcommand typed yet: offer the list
+  else
+    local remote_opts="--username= --config-dir= --no-auth-cache"  # reused in fc_opts and init_opts
+    local fc_opts="
+      --follow-parent --authors-file= --repack=
+      --no-metadata --use-svm-props --use-svnsync-props
+      --log-window-size= --no-checkout --quiet
+      --repack-flags --use-log-author --localtime
+      --ignore-paths= --include-paths= $remote_opts
+      "
+    local init_opts="
+      --template= --shared= --trunk= --tags=
+      --branches= --stdlayout --minimize-url
+      --no-metadata --use-svm-props --use-svnsync-props
+      --rewrite-root= --prefix= --use-log-author
+      --add-author-from $remote_opts
+      "
+    local cmt_opts="
+      --edit --rmdir --find-copies-harder --copy-similarity=
+      "
+
+    case "$subcommand,$cur" in  # options are offered per subcommand, for "--" words only
+    fetch,--*)
+      __gitcomp "--revision= --fetch-all $fc_opts"
+      ;;
+    clone,--*)
+      __gitcomp "--revision= $fc_opts $init_opts"
+      ;;
+    init,--*)
+      __gitcomp "$init_opts"
+      ;;
+    dcommit,--*)
+      __gitcomp "
+        --merge --strategy= --verbose --dry-run
+        --fetch-all --no-rebase --commit-url
+        --revision --interactive $cmt_opts $fc_opts
+        "
+      ;;
+    set-tree,--*)
+      __gitcomp "--stdin $cmt_opts $fc_opts"
+      ;;
+    create-ignore,--*|propget,--*|proplist,--*|show-ignore,--*|\
+    show-externals,--*|mkdirs,--*)
+      __gitcomp "--revision="
+      ;;
+    log,--*)
+      __gitcomp "
+        --limit= --revision= --verbose --incremental
+        --oneline --show-commit --non-recursive
+        --authors-file= --color
+        "
+      ;;
+    rebase,--*)
+      __gitcomp "
+        --merge --verbose --strategy= --local
+        --fetch-all --dry-run $fc_opts
+        "
+      ;;
+    commit-diff,--*)
+      __gitcomp "--message= --file= --revision= $cmt_opts"
+      ;;
+    info,--*)
+      __gitcomp "--url"
+      ;;
+    branch,--*)
+      __gitcomp "--dry-run --message --tag"
+      ;;
+    tag,--*)
+      __gitcomp "--dry-run --message"
+      ;;
+    blame,--*)
+      __gitcomp "--git-format"
+      ;;
+    migrate,--*)
+      __gitcomp "
+        --config-dir= --ignore-paths= --minimize
+        --no-auth-cache --username=
+        "
+      ;;
+    reset,--*)
+      __gitcomp "--revision= --parent"
+      ;;
+    *)  # any other subcommand/word combination: nothing is offered
+      ;;
+    esac
+  fi
+}
+
+_git_tag ()  # completion for the "tag" command
+{
+  local i c=1 f=0
+  while [ $c -lt $cword ]; do  # scan the words before the cursor
+    i="${words[c]}"
+    case "$i" in
+    -d|-v)
+      __gitcomp_nl "$(__git_tags)"  # -d/-v seen: complete from __git_tags
+      return
+      ;;
+    -f)
+      f=1  # remember that -f was given
+      ;;
+    esac
+    ((c++))
+  done
+
+  case "$prev" in  # the previous word decides what the current one can be
+  -m|-F)  # nothing is offered for the word after -m or -F
+    ;;
+  -*|tag)
+    if [ $f = 1 ]; then  # tag names are offered here only when -f was seen
+      __gitcomp_nl "$(__git_tags)"
+    fi
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+_git_whatchanged ()  # completion for the "whatchanged" command
+{
+  _git_log  # reuse the completion logic of "log" unchanged
+}
+
+__git_main ()  # completion entry point for "git" (see "__git_complete git __git_main")
+{
+  local i c=1 command __git_dir
+
+  while [ $c -lt $cword ]; do  # look for the command word before the cursor
+    i="${words[c]}"
+    case "$i" in
+    --git-dir=*) __git_dir="${i#--git-dir=}" ;;
+    --git-dir)   ((c++)) ; __git_dir="${words[c]}" ;;
+    --bare)      __git_dir="." ;;
+    --help) command="help"; break ;;
+    -c|--work-tree|--namespace) ((c++)) ;;  # also skip the option's argument
+    -*) ;;
+    *) command="$i"; break ;;
+    esac
+    ((c++))
+  done
+
+  if [ -z "$command" ]; then  # no command yet: global options or command names
+    case "$cur" in
+    --*)   __gitcomp "
+      --paginate
+      --no-pager
+      --git-dir=
+      --bare
+      --version
+      --exec-path
+      --exec-path=
+      --html-path
+      --man-path
+      --info-path
+      --work-tree=
+      --namespace=
+      --no-replace-objects
+      --help
+      "
+      ;;
+    *)     __git_compute_porcelain_commands
+           __gitcomp "$__git_porcelain_commands $(__git_aliases)" ;;
+    esac
+    return
+  fi
+
+  local completion_func="_git_${command//-/_}"  # "-" in the command becomes "_"
+  # Done if that function exists and succeeds; otherwise try the alias below.
+  declare -f $completion_func >/dev/null && $completion_func && return
+  # Retry with whatever __git_aliased_command reports for the command.
+  local expansion=$(__git_aliased_command "$command")
+  if [ -n "$expansion" ]; then
+    completion_func="_git_${expansion//-/_}"
+    declare -f $completion_func >/dev/null && $completion_func
+  fi
+}
+
+__gitk_main ()
+{
+  # Completion entry point for gitk: log-style options for a "--"
+  # prefix, a revision list otherwise.
+  __git_has_doubledash && return
+
+  # --merge is offered only when $g/MERGE_HEAD exists.
+  local g="$(__gitdir)"
+  local merge=""
+  [ -f "$g/MERGE_HEAD" ] && merge="--merge"
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_gitk_options
+      $merge
+      "
+    return
+  fi
+
+  __git_complete_revlist
+}
+
+if [[ -n ${ZSH_VERSION-} ]]; then  # zsh: redefine the helpers on top of compset/compadd
+  echo "WARNING: this script is deprecated, please see git-completion.zsh" 1>&2
+
+  autoload -U +X compinit && compinit
+  # zsh __gitcomp: $1 word list, $2 prefix, $3 current word (default $cur), $4 suffix.
+  __gitcomp ()
+  {
+    emulate -L zsh
+
+    local cur_="${3-$cur}"
+
+    case "$cur_" in
+    --*=)  # nothing is offered for a bare "--option="
+      ;;
+    *)
+      local c IFS=$' \t\n'
+      local -a array
+      for c in ${=1}; do
+        c="$c${4-}"
+        case $c in
+        --*=*|*.) ;;  # "--opt=..." and words ending in "." get no trailing space
+        *) c="$c " ;;
+        esac
+        array[$#array+1]="$c"
+      done
+      compset -P '*[=:]'
+      compadd -Q -S '' -p "${2-}" -a -- array && _ret=0
+      ;;
+    esac
+  }
+  # zsh __gitcomp_nl: $1 newline-separated words, $2 prefix, $4 suffix (default: a space).
+  __gitcomp_nl ()
+  {
+    emulate -L zsh
+
+    local IFS=$'\n'
+    compset -P '*[=:]'
+    compadd -Q -S "${4- }" -p "${2-}" -- ${=1} && _ret=0
+  }
+  # zsh __gitcomp_file: $1 newline-separated words, $2 prefix; passes -f to compadd.
+  __gitcomp_file ()
+  {
+    emulate -L zsh
+
+    local IFS=$'\n'
+    compset -P '*[=:]'
+    compadd -Q -p "${2-}" -f -- ${=1} && _ret=0
+  }
+  # Entry point given to compdef below; runs __${service}_main under ksh emulation.
+  _git ()
+  {
+    local _ret=1 cur cword prev
+    cur=${words[CURRENT]}
+    prev=${words[CURRENT-1]}
+    let cword=CURRENT-1
+    emulate ksh -c __${service}_main
+    let _ret && _default && _ret=0  # no helper cleared _ret: fall back to _default
+    return _ret
+  }
+
+  compdef _git git gitk
+  return  # stop here under zsh; the rest of the file is the bash setup
+fi
+
+__git_func_wrap ()  # run completion function $1 with the word variables in scope
+{
+  local cur words cword prev
+  _get_comp_words_by_ref -n =: cur words cword prev  # asked to populate the locals above
+  $1
+}
+
+# __git_complete <command> <function>: register <function> as the
+# completion handler for <command>, via a generated wrapper that runs it
+# through __git_func_wrap.  NOT a public function; use at your own risk.
+__git_complete ()
+{
+  local wrapper="__git_wrap${2}"
+  eval "$wrapper () { __git_func_wrap $2 ; }"
+  complete -o bashdefault -o default -o nospace -F $wrapper $1 2>/dev/null \
+    || complete -o default -o nospace -F $wrapper $1  # retry without -o bashdefault
+}
+
+# Backwards-compatible name for the wrapper "__git_complete git __git_main" generates.
+_git ()
+{
+  __git_wrap__git_main
+}
+
+# Backwards-compatible name for the wrapper "__git_complete gitk __gitk_main" generates.
+_gitk ()
+{
+  __git_wrap__gitk_main
+}
+
+__git_complete git __git_main
+__git_complete gitk __gitk_main
+
+# The following is needed only on Cygwin, and only when the user has
+# tab-completed the executable name and consequently included the
+# '.exe' suffix.
+#
+if [ "$(uname -o 2>/dev/null)" = Cygwin ]; then
+  __git_complete git.exe __git_main
+fi
diff --git a/paddle/scripts/docker/root/.scripts/git-prompt.sh b/paddle/scripts/docker/root/.scripts/git-prompt.sh
new file mode 100755
index 0000000000000000000000000000000000000000..576f4ec14c94a24ebffa9e2620acf881e6b5ddaa
--- /dev/null
+++ b/paddle/scripts/docker/root/.scripts/git-prompt.sh
@@ -0,0 +1,445 @@
+# bash/zsh git prompt support
+#
+# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
+# Distributed under the GNU General Public License, version 2.0.
+#
+# This script allows you to see repository status in your prompt.
+#
+# To enable:
+#
+#    1) Copy this file to somewhere (e.g. ~/.git-prompt.sh).
+#    2) Add the following line to your .bashrc/.zshrc:
+#        source ~/.git-prompt.sh
+#    3a) Change your PS1 to call __git_ps1 as
+#        command-substitution:
+#        Bash: PS1='[\u@\h \W$(__git_ps1 " (%s)")]\$ '
+#        ZSH:  setopt PROMPT_SUBST ; PS1='[%n@%m %c$(__git_ps1 " (%s)")]\$ '
+#        the optional argument will be used as format string.
+#    3b) Alternatively, for a slightly faster prompt, __git_ps1 can
+#        be used for PROMPT_COMMAND in Bash or for precmd() in Zsh
+#        with two parameters, <pre> and <post>, which are strings
+#        you would put in $PS1 before and after the status string
+#        generated by the git-prompt machinery.  e.g.
+#        Bash: PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
+#          will show username, at-sign, host, colon, cwd, then
+#          various status string, followed by dollar and SP, as
+#          your prompt.
+#        ZSH:  precmd () { __git_ps1 "%n" ":%~$ " "|%s" }
+#          will show username, pipe, then various status string,
+#          followed by colon, cwd, dollar and SP, as your prompt.
+#        Optionally, you can supply a third argument with a printf
+#        format string to fine-tune the output of the branch status.
+#
+# The repository status will be displayed only if you are currently in a
+# git repository. The %s token is the placeholder for the shown status.
+#
+# The prompt status always includes the current branch name.
+#
+# In addition, if you set GIT_PS1_SHOWDIRTYSTATE to a nonempty value,
+# unstaged (*) and staged (+) changes will be shown next to the branch
+# name.  You can configure this per-repository with the
+# bash.showDirtyState variable, which defaults to true once
+# GIT_PS1_SHOWDIRTYSTATE is enabled.
+#
+# You can also see whether something is currently stashed by setting
+# GIT_PS1_SHOWSTASHSTATE to a nonempty value. If something is stashed,
+# then a '$' will be shown next to the branch name.
+#
+# If you would like to see whether there are untracked files, you can set
+# GIT_PS1_SHOWUNTRACKEDFILES to a nonempty value. If there are untracked
+# files, then a '%' will be shown next to the branch name.  You can
+# configure this per-repository with the bash.showUntrackedFiles
+# variable, which defaults to true once GIT_PS1_SHOWUNTRACKEDFILES is
+# enabled.
+#
+# If you would like to see the difference between HEAD and its upstream,
+# set GIT_PS1_SHOWUPSTREAM="auto".  A "<" indicates you are behind, ">"
+# indicates you are ahead, "<>" indicates you have diverged and "="
+# indicates that there is no difference. You can further control
+# behaviour by setting GIT_PS1_SHOWUPSTREAM to a space-separated list
+# of values:
+#
+#     verbose       show number of commits ahead/behind (+/-) upstream
+#     legacy        don't use the '--count' option available in recent
+#                   versions of git-rev-list
+#     git           always compare HEAD to @{upstream}
+#     svn           always compare HEAD to your SVN upstream
+#
+# By default, __git_ps1 will compare HEAD to your SVN upstream if it can
+# find one, or @{upstream} otherwise.  Once you have set
+# GIT_PS1_SHOWUPSTREAM, you can override it on a per-repository basis by
+# setting the bash.showUpstream config variable.
+#
+# If you would like to see more information about the identity of
+# commits checked out as a detached HEAD, set GIT_PS1_DESCRIBE_STYLE
+# to one of these values:
+#
+#     contains      relative to newer annotated tag (v1.6.3.2~35)
+#     branch        relative to newer tag or branch (master~4)
+#     describe      relative to older annotated tag (v1.6.3.1-13-gdd42c2f)
+#     default       exactly matching tag
+#
+# If you would like a colored hint about the current dirty state, set
+# GIT_PS1_SHOWCOLORHINTS to a nonempty value. The colors are based on
+# the colored output of "git status -sb" and are available only when
+# using __git_ps1 for PROMPT_COMMAND or precmd.
+
+# Stores in $p how far HEAD has diverged from its upstream; the check
+# is configured through GIT_PS1_SHOWUPSTREAM (or bash.showupstream).
+__git_ps1_show_upstream ()
+{
+  local key value option
+  local svn_remote svn_url_pattern count n
+  local upstream=git legacy="" verbose=""
+  local tab=$'\t' # "git rev-list --count" separates its two numbers by a TAB
+  svn_remote=()
+  # get some config options from git-config
+  local output="$(git config -z --get-regexp '^(svn-remote\..*\.url|bash\.showupstream)$' 2>/dev/null | tr '\0\n' '\n ')"
+  while read -r key value; do
+    case "$key" in
+    bash.showupstream)
+      GIT_PS1_SHOWUPSTREAM="$value"
+      if [[ -z "${GIT_PS1_SHOWUPSTREAM}" ]]; then
+        p=""
+        return
+      fi
+      ;;
+    svn-remote.*.url)
+      svn_remote[$((${#svn_remote[@]} + 1))]="$value"
+      svn_url_pattern+="\\|$value"
+      upstream=svn+git # default upstream is SVN if available, else git
+      ;;
+    esac
+  done <<< "$output"
+
+  # parse configuration values
+  for option in ${GIT_PS1_SHOWUPSTREAM}; do
+    case "$option" in
+    git|svn) upstream="$option" ;;
+    verbose) verbose=1 ;;
+    legacy)  legacy=1  ;;
+    esac
+  done
+
+  # Find our upstream
+  case "$upstream" in
+  git)    upstream="@{upstream}" ;;
+  svn*)
+    # get the upstream from the "git-svn-id: ..." in a commit message
+    # (git-svn uses essentially the same procedure internally)
+    local -a svn_upstream
+    svn_upstream=($(git log --first-parent -1 \
+          --grep="^git-svn-id: \(${svn_url_pattern#??}\)" 2>/dev/null))
+    if [[ 0 -ne ${#svn_upstream[@]} ]]; then
+      svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
+      svn_upstream=${svn_upstream%@*}
+      local n_stop="${#svn_remote[@]}"
+      for ((n=1; n <= n_stop; n++)); do
+        svn_upstream=${svn_upstream#${svn_remote[$n]}}
+      done
+
+      if [[ -z "$svn_upstream" ]]; then
+        # default branch name for checkouts with no layout:
+        upstream=${GIT_SVN_ID:-git-svn}
+      else
+        upstream=${svn_upstream#/}
+      fi
+    elif [[ "svn+git" = "$upstream" ]]; then
+      upstream="@{upstream}"
+    fi
+    ;;
+  esac
+
+  # Find how many commits we are ahead/behind our upstream
+  if [[ -z "$legacy" ]]; then
+    count="$(git rev-list --count --left-right \
+        "$upstream"...HEAD 2>/dev/null)"
+  else
+    # produce equivalent output to --count for older versions of git
+    local commits
+    if commits="$(git rev-list --left-right "$upstream"...HEAD 2>/dev/null)"
+    then
+      local commit behind=0 ahead=0
+      for commit in $commits
+      do
+        case "$commit" in
+        "<"*) ((behind++)) ;;
+        *)    ((ahead++))  ;;
+        esac
+      done
+      count="$behind$tab$ahead"
+    else
+      count=""
+    fi
+  fi
+
+  # calculate the result; $count is "<behind><TAB><ahead>" or empty
+  if [[ -z "$verbose" ]]; then
+    case "$count" in
+    "") # no upstream
+      p="" ;;
+    "0${tab}0") # equal to upstream
+      p="=" ;;
+    "0${tab}"*) # ahead of upstream
+      p=">" ;;
+    *"${tab}0") # behind upstream
+      p="<" ;;
+    *)      # diverged from upstream
+      p="<>" ;;
+    esac
+  else
+    case "$count" in
+    "") # no upstream
+      p="" ;;
+    "0${tab}0") # equal to upstream
+      p=" u=" ;;
+    "0${tab}"*) # ahead of upstream
+      p=" u+${count#0${tab}}" ;;
+    *"${tab}0") # behind upstream
+      p=" u-${count%${tab}0}" ;;
+    *)      # diverged from upstream
+      p=" u+${count#*${tab}}-${count%${tab}*}" ;;
+    esac
+  fi
+
+}
+
+# Called from __git_ps1 to decorate the pieces of the status string
+# (c, z, w, i, s, u and r, as set by the caller) with color sequences:
+# %F/%f escapes under zsh, \[...\]-wrapped escape codes otherwise.
+__git_ps1_colorize_gitstring ()
+{
+  local c_red c_green c_lblue c_clear
+  if [[ -z ${ZSH_VERSION-} ]]; then
+    # Using \[ and \] around colors is necessary to prevent
+    # issues with command line editing/browsing/completion!
+    c_red='\[\e[31m\]'
+    c_green='\[\e[32m\]'
+    c_lblue='\[\e[1;34m\]'
+    c_clear='\[\e[0m\]'
+  else
+    c_red='%F{red}'
+    c_green='%F{green}'
+    c_lblue='%F{blue}'
+    c_clear='%f'
+  fi
+  local bad_color=$c_red ok_color=$c_green flags_color="$c_lblue"
+
+  # $c gets the "ok" color when $detached is "no", the "bad" one otherwise.
+  if [ $detached = no ]; then
+    c="$ok_color$c"
+  else
+    c="$bad_color$c"
+  fi
+  z="$c_clear$z"
+
+  # w is colored only when it is exactly "*"; i, s and u only when
+  # they are non-empty.
+  if [ "$w" = "*" ]; then
+    w="$bad_color$w"
+  fi
+  if [ -n "$i" ]; then
+    i="$ok_color$i"
+  fi
+  if [ -n "$s" ]; then
+    s="$flags_color$s"
+  fi
+  if [ -n "$u" ]; then
+    u="$bad_color$u"
+  fi
+  r="$c_clear$r"
+}
+
+# __git_ps1 accepts 0 or 1 arguments (i.e., format string)
+# when called from PS1 using command substitution
+# in this mode it prints text to add to bash PS1 prompt (includes branch name)
+#
+# __git_ps1 requires 2 or 3 arguments when called from PROMPT_COMMAND (pc)
+# in that case it _sets_ PS1. The arguments are parts of a PS1 string.
+# when two arguments are given, the first is prepended and the second appended
+# to the state string when assigned to PS1.
+# The optional third parameter will be used as printf format string to further
+# customize the output of the git-status string.
+# In this mode you can request colored hints using GIT_PS1_SHOWCOLORHINTS=true
+__git_ps1 ()
+{
+  local pcmode=no  # set to yes below when called with 2 or 3 arguments
+  local detached=no  # set to yes below when HEAD is not a symbolic ref
+  local ps1pc_start='\u@\h:\w '
+  local ps1pc_end='\$ '
+  local printf_format=' (%s)'
+
+  case "$#" in
+    2|3)  pcmode=yes
+      ps1pc_start="$1"
+      ps1pc_end="$2"
+      printf_format="${3:-$printf_format}"
+    ;;
+    0|1)  printf_format="${1:-$printf_format}"
+    ;;
+    *)  return
+    ;;
+  esac
+
+  local repo_info rev_parse_exit_code
+  repo_info="$(git rev-parse --git-dir --is-inside-git-dir \
+    --is-bare-repository --is-inside-work-tree \
+    --short HEAD 2>/dev/null)"
+  rev_parse_exit_code="$?"
+
+  if [ -z "$repo_info" ]; then  # rev-parse printed nothing: emit no git state
+    if [ $pcmode = yes ]; then
+      #In PC mode PS1 always needs to be set
+      PS1="$ps1pc_start$ps1pc_end"
+    fi
+    return
+  fi
+
+  local short_sha
+  if [ "$rev_parse_exit_code" = "0" ]; then  # last output line is the short SHA
+    short_sha="${repo_info##*$'\n'}"
+    repo_info="${repo_info%$'\n'*}"
+  fi
+  local inside_worktree="${repo_info##*$'\n'}"
+  repo_info="${repo_info%$'\n'*}"
+  local bare_repo="${repo_info##*$'\n'}"
+  repo_info="${repo_info%$'\n'*}"
+  local inside_gitdir="${repo_info##*$'\n'}"
+  local g="${repo_info%$'\n'*}"
+
+  local r=""
+  local b=""
+  local step=""
+  local total=""
+  if [ -d "$g/rebase-merge" ]; then
+    read b 2>/dev/null <"$g/rebase-merge/head-name"
+    read step 2>/dev/null <"$g/rebase-merge/msgnum"
+    read total 2>/dev/null <"$g/rebase-merge/end"
+    if [ -f "$g/rebase-merge/interactive" ]; then
+      r="|REBASE-i"
+    else
+      r="|REBASE-m"
+    fi
+  else
+    if [ -d "$g/rebase-apply" ]; then
+      read step 2>/dev/null <"$g/rebase-apply/next"
+      read total 2>/dev/null <"$g/rebase-apply/last"
+      if [ -f "$g/rebase-apply/rebasing" ]; then
+        read b 2>/dev/null <"$g/rebase-apply/head-name"
+        r="|REBASE"
+      elif [ -f "$g/rebase-apply/applying" ]; then
+        r="|AM"
+      else
+        r="|AM/REBASE"
+      fi
+    elif [ -f "$g/MERGE_HEAD" ]; then
+      r="|MERGING"
+    elif [ -f "$g/CHERRY_PICK_HEAD" ]; then
+      r="|CHERRY-PICKING"
+    elif [ -f "$g/REVERT_HEAD" ]; then
+      r="|REVERTING"
+    elif [ -f "$g/BISECT_LOG" ]; then
+      r="|BISECTING"
+    fi
+
+    if [ -n "$b" ]; then
+      :  # branch name was already read from rebase-apply/head-name
+    elif [ -h "$g/HEAD" ]; then
+      # symlink symbolic ref
+      b="$(git symbolic-ref HEAD 2>/dev/null)"
+    else
+      local head=""
+      if ! read head 2>/dev/null <"$g/HEAD"; then
+        if [ $pcmode = yes ]; then
+          PS1="$ps1pc_start$ps1pc_end"
+        fi
+        return
+      fi
+      # is it a symbolic ref?
+      b="${head#ref: }"
+      if [ "$head" = "$b" ]; then
+        detached=yes
+        b="$(
+        case "${GIT_PS1_DESCRIBE_STYLE-}" in
+        (contains)
+          git describe --contains HEAD ;;
+        (branch)
+          git describe --contains --all HEAD ;;
+        (describe)
+          git describe HEAD ;;
+        (* | default)
+          git describe --tags --exact-match HEAD ;;
+        esac 2>/dev/null)" ||
+
+        b="$short_sha..."
+        b="($b)"
+      fi
+    fi
+  fi
+
+  if [ -n "$step" ] && [ -n "$total" ]; then
+    r="$r $step/$total"  # append progress of the running operation
+  fi
+
+  local w=""
+  local i=""
+  local s=""
+  local u=""
+  local c=""
+  local p=""
+
+  if [ "true" = "$inside_gitdir" ]; then
+    if [ "true" = "$bare_repo" ]; then
+      c="BARE:"
+    else
+      b="GIT_DIR!"
+    fi
+  elif [ "true" = "$inside_worktree" ]; then
+    if [ -n "${GIT_PS1_SHOWDIRTYSTATE-}" ] &&
+       [ "$(git config --bool bash.showDirtyState)" != "false" ]
+    then
+      git diff --no-ext-diff --quiet --exit-code || w="*"
+      if [ -n "$short_sha" ]; then
+        git diff-index --cached --quiet HEAD -- || i="+"
+      else
+        i="#"
+      fi
+    fi
+    if [ -n "${GIT_PS1_SHOWSTASHSTATE-}" ] &&
+       [ -r "$g/refs/stash" ]; then
+      s="$"  # a readable refs/stash exists
+    fi
+
+    if [ -n "${GIT_PS1_SHOWUNTRACKEDFILES-}" ] &&
+       [ "$(git config --bool bash.showUntrackedFiles)" != "false" ] &&
+       git ls-files --others --exclude-standard --error-unmatch -- '*' >/dev/null 2>/dev/null
+    then
+      u="%${ZSH_VERSION+%}"
+    fi
+
+    if [ -n "${GIT_PS1_SHOWUPSTREAM-}" ]; then
+      __git_ps1_show_upstream
+    fi
+  fi
+
+  local z="${GIT_PS1_STATESEPARATOR-" "}"
+
+  # NO color option unless in PROMPT_COMMAND mode
+  if [ $pcmode = yes ] && [ -n "${GIT_PS1_SHOWCOLORHINTS-}" ]; then
+    __git_ps1_colorize_gitstring
+  fi
+
+  local f="$w$i$s$u"  # all state flags; the separator z is only used if non-empty
+  local gitstring="$c${b##refs/heads/}${f:+$z$f}$r$p"
+
+  if [ $pcmode = yes ]; then
+    if [[ -n ${ZSH_VERSION-} ]]; then
+      gitstring=$(printf -- "$printf_format" "$gitstring")
+    else
+      printf -v gitstring -- "$printf_format" "$gitstring"
+    fi
+    PS1="$ps1pc_start$gitstring$ps1pc_end"  # PROMPT_COMMAND mode: set PS1 directly
+  else
+    printf -- "$printf_format" "$gitstring"  # PS1 mode: print for command substitution
+  fi
+}
diff --git a/paddle/api/test/run_tests.sh b/paddle/scripts/run_python_tests.sh
similarity index 63%
rename from paddle/api/test/run_tests.sh
rename to paddle/scripts/run_python_tests.sh
index bcf06afa86aaa1a3151aeb966b54f69657c541e3..02d2cdb977473c1032b06ffca59544b3ba98d1fa 100755
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/scripts/run_python_tests.sh
@@ -18,19 +18,29 @@ pushd `dirname $0` > /dev/null
 SCRIPTPATH=$PWD
 popd > /dev/null
 
-cd $SCRIPTPATH
+USE_VIRTUALENV_FOR_TEST=$1; shift  # arg 1: non-zero runs the tests in a throwaway virtualenv
+PYTHON=$1; shift  # arg 2: python interpreter; the remaining args are test scripts
 
-$1 -m pip install ../../dist/*.whl
+if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
+   rm -rf .test_env
+   virtualenv .test_env
+   source .test_env/bin/activate
+   PYTHON=python  # replace the caller-supplied interpreter after activation
+fi
 
-test_list="testArguments.py testGradientMachine.py testMatrix.py  testVector.py testTrain.py testTrainer.py"
+export PYTHONPATH=$SCRIPTPATH/../../python/
+$PYTHON -m pip install $SCRIPTPATH/../dist/*.whl requests matplotlib opencv-python ipython==5.3
 
-export PYTHONPATH=$PWD/../../../python/
-
-for fn in $test_list
+for fn in "$@"  # every remaining argument is run as a test script
 do
   echo "test $fn"
-  $1 $fn
+  $PYTHON $fn
   if [ $? -ne 0 ]; then
     exit 1
   fi
 done
+
+if [ $USE_VIRTUALENV_FOR_TEST -ne 0 ]; then
+    deactivate  # leave the virtualenv before deleting it
+    rm -rf .test_env
+fi
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 283fd34a6d8a2268f3800ec69920e128ac75e7dc..12bf629ea920832f96bc5f7cc0b38abfddd34d97 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -21,9 +21,7 @@ function version(){
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_metric_learning: @WITH_METRIC@"
         echo "    with_timer: @WITH_TIMER@"
-        echo "    with_predict_sdk: @WITH_PREDICT_SDK@"
 }
 
 function ver2num() {
@@ -52,7 +50,7 @@ if [ -z "${PADDLE_NO_STAT+x}" ]; then
         -c ${PADDLE_CONF_HOME}/paddle.cookie \
         http://api.paddlepaddle.org/version 2>/dev/null`
     if [ $? -eq 0 ] && [ "$(ver2num @PADDLE_VERSION@)" -lt  $(ver2num $SERVER_VER) ]; then
-      echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org" 
+      echo "Paddle release a new version ${SERVER_VER}, you can get the install package in http://www.paddlepaddle.org"
     fi
 fi
 
@@ -94,16 +92,22 @@ else:
 EOF
 
 if [ $? -eq 1 ]; then  # Older version installed, or not installed at all
-   echo "First time run paddle, need to install some python dependencies."
-   BASEDIR=$(dirname "$0")
-   pip install ${BASEDIR}/../opt/paddle/share/wheels/*-@PADDLE_VERSION@-*.whl
-   if [ $? -ne 0 ]; then
-      echo "pip install wheels failed. "
-      echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
-      echo "PaddlePaddle will install some python dependencies automatically."
-      exit 1
-   fi
-   echo "Python dependencies are installed."
+    echo "First time run paddle, need to install some python dependencies."
+    # setuptools normalizes the package version, so match wheels by the
+    # normalized version; fall back to the raw one if it cannot be computed.
+    PYTHON_PADDLE_VERSION=$(python -c 'import packaging.version
+import setuptools
+print(str(packaging.version.Version("@PADDLE_VERSION@")))
+' 2>/dev/null)
+    BASEDIR=$(dirname "$0")
+    pip install "${BASEDIR}"/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION:-@PADDLE_VERSION@}-*.whl
+    if [ $? -ne 0 ]; then
+        echo "pip install wheels failed. "
+        echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
+        echo "PaddlePaddle will install some python dependencies automatically."
+        exit 1
+    fi
+    echo "Python dependencies are installed."
 fi
 
 case "$1" in
@@ -122,6 +126,9 @@ case "$1" in
     "make_diagram")
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
+    "usage")
+        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        ;;
     "version")
         version
         ;;
diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile
deleted file mode 100644
index 78dc756bd1175019d90fc852635497fea1eb55e2..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/Dockerfile
+++ /dev/null
@@ -1,7 +0,0 @@
-FROM paddledev/paddle:cpu-devel-latest
-COPY build.sh /
-RUN pip install sphinx &&\
-    pip install sphinx_rtd_theme &&\
-    apt install -y doxygen graphviz &&\
-    pip install recommonmark numpy protobuf==2.6.1
-CMD /build.sh
diff --git a/paddle/scripts/tools/build_docs/build.sh b/paddle/scripts/tools/build_docs/build.sh
deleted file mode 100755
index a23b6e61d45926e77015365627bfb7dca303ac65..0000000000000000000000000000000000000000
--- a/paddle/scripts/tools/build_docs/build.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/bash
-set -ex
-
-mkdir -p /build
-cd /build
-cmake /paddle -DWITH_DOC=ON
-make paddle_docs paddle_docs_cn -j `nproc`
-mkdir -p /output/doc
-mkdir -p /output/doc_cn
-cp -r doc/html/* /output/doc/
-cp -r doc_cn/html/* /output/doc_cn/
-cd /
-rm -rf /paddle/build
diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh
index 9f8b80435c8fb17907d7da52c864a448f0d8d136..c6cbbc4eef94fb2e2fc3c1ce71734fbb23fc22d7 100755
--- a/paddle/scripts/tools/build_docs/build_docs.sh
+++ b/paddle/scripts/tools/build_docs/build_docs.sh
@@ -1,4 +1,8 @@
 #!/bin/bash
-set -e
-docker build . -t paddle_build_doc
-docker run --rm -v $PWD/../../../../:/paddle -v $PWD:/output paddle_build_doc
+# Run the Paddle dev image with WITH_DOC=ON; pass another image name as $1 to
+# override the default. The repository root is mounted at /paddle.
+docker run --rm \
+       -v "$(git rev-parse --show-toplevel):/paddle" \
+       -e "WITH_GPU=OFF" -e "WITH_AVX=ON" \
+       -e "WITH_DOC=ON" -e "WOBOQ=ON" \
+       "${1:-paddledev/paddle:dev}"
diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7dbd1f58842f50ea1df0e2476c4a493569b1dda9
--- /dev/null
+++ b/paddle/scripts/tools/usage_stat/usage.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
+KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"  # sentinel cached when the user gives no name at the prompt
+# Paddle config home dir (same path the paddle launcher uses); holds the github_user cache and the curl cookie
+PADDLE_CONF_HOME="$HOME/.config/paddle"
+# Space-separated list of API URLs; they are tried in order until one upload succeeds
+PD_URLS="http://api.paddlepaddle.org/version"
+
+usage()  # Print the command-line help text to stdout.
+{
+    echo "Usage: `basename $0` [options]"
+    echo "Options:"  # NOTE(review): --dry-run is accepted by the parser but not listed here
+    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
+    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
+    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
+    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
+    echo "  -v, -i                            Verbose output and interact with user when necessary"
+    echo " --help                             display this help message"
+}
+
+eval set -- "${ARGPARSE}"  # replace the positional parameters with getopt's output
+while true; do
+    case "$1" in
+        -l|--log-file)
+            log_file=$2
+            shift 2
+            ;;
+        -e|--exit-code)
+            exit_code=$2
+            shift 2
+            ;;
+        -u|--git-user)
+            github_user=$2
+            shift 2
+            ;;
+        -n|--task-name)
+            task=$2
+            shift 2
+            ;;
+        -v|-i)
+            v=1  # both flags enable the same verbose/interactive mode
+            shift
+            ;;
+        --dry-run)
+            dry_run=1  # print the curl command instead of running it
+            shift
+            ;;
+        --)
+            shift
+            break  # end of options: stop parsing
+            ;;
+        --help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Invalid option $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# Estimate the run duration in seconds from the HH:MM:SS timestamps found in field 2 of the log file.
+if [ -s "${log_file}" ]; then
+    duration=`awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
+    {if(index($2,":")==3){
+        t=substr($2,1,8);
+        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
+        if(sec<last_sec-600){day+=1;sec+=86400;}
+        last_sec=sec;
+        if(min_sec==0 || min_sec>sec){min_sec=sec;}
+        if(max_sec==0 || max_sec<sec){max_sec=sec;}
+    }}
+    END{print max_sec-min_sec}' "${log_file}"`
+else
+    duration=-1  # no log file given, or it is missing/empty: duration unknown
+fi
+if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
+
+# Work out who is reporting when --git-user was not given: cache, then git config.
+if [ -z "${github_user}" ]; then
+    # search for cached username
+    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
+        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
+        github_user=`cat "${PADDLE_CONF_HOME}/github_user"`
+    else
+        # search the github-user from git config
+        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
+        git_username=`git config --get user.name 2>/dev/null`
+        git_url=`git config --get remote.origin.url 2>/dev/null`
+        if [ "$(echo "${git_url}" | cut -b 1-19)" = "https://github.com/" ]; then
+            # under a git url, like https://github.com/user_xxx/proj_yyy.git
+            if [ "${v}" = "1" ]; then echo " from github url..."; fi
+            github_user=`echo "${git_url}" | cut -d "/" -f 4`
+            if [ "${github_user}" = "PaddlePaddle" ]; then
+                github_user=  # the upstream organization is not a user identity
+            fi
+        fi
+        if [ -n "${git_username}" ] && [ -z "${github_user}" ]; then
+            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
+            github_user=${git_username}
+        fi
+    fi
+fi
+# Still unknown: in verbose/interactive mode, ask the user directly.
+if [ -z "${github_user}" ] && [ "${v}" = "1" ]; then
+    read -r -p "Please input your github username or email, or just return to keep this feedback anonymous:"
+    github_user=${REPLY}
+    if [ -z "${github_user}" ]; then
+        # empty input, consider as one anonymous user
+        github_user="${KEEP_ANONYMOUS}"
+    fi
+fi
+if [ -n "${github_user}" ] && [ -z "${dry_run}" ]; then
+    # valid user and not in dry-run mode, then save to cache
+    mkdir -p "${PADDLE_CONF_HOME}"
+    echo "${github_user}" >"${PADDLE_CONF_HOME}/github_user"
+fi
+if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
+if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
+    # The sentinel is only for the cache; an anonymous user is reported as empty.
+    github_user=
+fi
+
+# Local Paddle version: from the first "PaddlePaddle" line of `paddle version`, field 2 up to the first comma
+paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
+if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
+
+# Local system time as YYYYmmddHHMMSS
+system_time=`date "+%Y%m%d%H%M%S"`
+if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
+
+# Fall back to a placeholder job name when no task name was given.
+if [ -z "${task}" ]; then
+    task="(unknown_task)"
+fi
+if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
+
+# concat the curl command
+params="content={\"data_type\":\"usage\",\
+\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
+\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
+\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
+}&type=1"
+curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
+ -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
+
+if [ "${dry_run}" = "1" ]; then
+    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
+    echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
+    exit 0
+else
+    for u in ${PD_URLS}; do
+        curl_cmd="${curl_cmd_prefix} ${u}"
+        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
+        ${curl_cmd} >/dev/null 2>&1
+        if [ $? -eq 0 ]; then
+            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
+            exit 0
+        else
+            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
+        fi
+    done
+    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
+    exit 1
+fi
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
deleted file mode 100755
index fd113d313e3140ad11460c1c288927b08fea88c4..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/before_install.osx.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-brew update
-brew tap homebrew/science
-brew install openblas md5sha1sum 
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index ffc48eae66aa615aab1ac007f8987ba6aba3ed8f..f2cbc561652a3c7502de94be37d75783fc40b9c1 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,26 +1,12 @@
 #!/bin/bash
 source ./common.sh
 
-python -c 'import pip; print(pip.pep425tags.get_supported())'
-
-if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  CMAKE_EXTRA="-DWITH_SWIG_PY=OFF"
-else
-  CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
-fi
-
-cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA}
-
 NPROC=1
-if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-  NRPOC=`nproc`
-  make -j $NPROC
-  make coveralls
-  sudo make install
-elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  NPROC=`sysctl -n hw.ncpu`
-  make -j $NPROC
-  env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
-  sudo make install
-  sudo paddle version
-fi
+export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages  # use the Python 2.7.12 tree under /opt/python
+export PYTHONHOME=/opt/python/2.7.12
+export PATH=/opt/python/2.7.12/bin:${PATH}
+cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+NPROC=`nproc`  # override the NPROC=1 default above so make really runs in parallel
+make -j $NPROC
+make coveralls
+sudo make install
diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh
index 9b6e420ca7931f0d17da461c7579bf4dc69e18e0..f05c7530a3b0632948e4b18c477d6dc6aad04c03 100755
--- a/paddle/scripts/travis/common.sh
+++ b/paddle/scripts/travis/common.sh
@@ -2,3 +2,5 @@
 set -e
 mkdir -p ../../../build
 cd ../../../build
+mkdir -p $HOME/third_party  # lives outside the build tree (presumably so CI can cache it — TODO confirm)
+EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party"  # appended to cmake by the scripts that source this file
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index 8690fe1d40c935e119fefbc02f3a228d76d8c0f9..67b89adb4ddb7bb93cb776d64711078cb11a2784 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -2,9 +2,13 @@
 
 # Add set -e, cd to directory.
 source ./common.sh
-
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
+mkdir output  # NOTE(review): deleted again by "rm -rf *" below; looks redundant — confirm
+make -j `nproc`  # pass 1: full build with docs disabled
+find .. -name '*whl' | xargs pip install  # install every wheel found under the parent directory
+rm -rf *  # wipe the build directory before reconfiguring
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
 # check websites for broken links
@@ -25,26 +29,42 @@ TARGET_BRANCH="gh-pages"
 # Only deploy master branch to build latest documentation.
 SOURCE_BRANCH="master"
 
-# If is not a Github pull request, and in master branch.
-if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" != "$SOURCE_BRANCH"  ]; then
-  exit 0
-fi
-
 # Clone the repo to output directory
 git clone $REPO output
 cd output
 
-# checkout github page branch
-git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+function deploy_docs() {  # deploy_docs <branch> <dir>: stage the built docs under <dir> when building <branch>
+  SOURCE_BRANCH=$1  # not declared local, so this overwrites the global SOURCE_BRANCH
+  DIR=$2  # destination directory, relative to the current (cloned output) directory
+  # Pull-request builds never deploy: exit the whole script successfully.
+  if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
+    exit 0
+  fi
+  # The build is for a different branch than this call handles: do nothing.
+  if [ "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then
+    return
+  fi
 
-# remove old docs. mv new docs.
-rm -rf doc doc_cn
-mv ../doc/cn/html doc_cn
-mv ../doc/en/html doc
+  # checkout github page branch
+  git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+  
+  mkdir -p ${DIR}
+  # Remove the old docs (errors ignored), then move the freshly built ones in.
+  set +e
+  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  set -e
+  mv ../doc/cn/html ${DIR}/doc_cn
+  mv ../doc/en/html ${DIR}/doc
+  git add .  # stage the result; the change check after these calls uses git diff --cached
+}
+
+deploy_docs "master" "." 
+deploy_docs "develop" "./develop/"
+deploy_docs "release/0.10.0" "./release/0.10.0/"
 
 # Check is there anything changed.
 set +e
-git diff --exit-code >/dev/null
+git diff --cached --exit-code >/dev/null
 if [ $? -eq 0 ]; then
   echo "No changes to the output on this push; exiting."
   exit 0
@@ -57,7 +77,6 @@ if [ -n $SSL_KEY ]; then  # Only push updated docs for github.com/PaddlePaddle/P
   git config user.name "Travis CI"
   git config user.email "paddle-dev@baidu.com"
   git commit -m "Deploy to GitHub Pages: ${SHA}"
-
   # Set ssh private key
   openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
   chmod 600 deploy_key
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index e3650bf1c0c4692a50e9731fcd8b832865eaac62..06d55d3abc6097fa7d4b2b2ac9e29681e0fddfd5 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -12,64 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# This file is used to build paddle python binding package.
-# It will be invoked by Makefile that generated by COMAKE
 
 from setuptools import setup, Extension
 
-import numpy as np
-import api.paddle_ld_flags
-import platform
-import os
-
-system = platform.system().lower()
-
-is_osx = (system == 'darwin')
-is_win = (system == 'windows')
-is_lin = (system == 'linux')
-
-
-# The extra links will passed from COMAKE
-#   because generate paddle LDFLAGS is too complicated to do in setup.py
-#   it just read COMAKE generated LDFLAGS.
-extra_comps = []
-extra_links = []
-obj = api.paddle_ld_flags.PaddleLDFlag()
-extra_comps = obj.c_flag()
-ldflags = obj.ldflag_str()
-if ldflags is not None:
-  extra_links.extend(ldflags.split(" "))
-
-try:
-  with open('.py_paddle_extra_link_flags', 'r') as f:
-    for line in f:
-      extra_links += line.split()
-except:
-  pass
-
-if is_lin == True:
-    extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"]
-elif is_osx == True:
-    os.environ["ARCHFLAGS"] = "-arch x86_64"
-    extra_links = ["-Wl,-all_load"] + extra_links
-
-include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
-
 setup(name="py_paddle",
-  version="@PADDLE_VERSION@",
-  ext_modules=[
-    Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
-       ['Paddle_wrap.cxx'],
-       language = "c++",
-       include_dirs = include_dirs,
-       extra_link_args = extra_links,
-       extra_compile_args = extra_comps
-    )
-  ],
-  packages=['py_paddle'],
-  include_dirs = include_dirs,
-  install_requires = [
-    'numpy>=1.8.0',      # The numpy is required.
-    'protobuf>=2.4.1' # The paddle protobuf version
-  ],
+      version="${PADDLE_VERSION}",  # placeholder; presumably substituted when this .in template is configured
+      packages=['py_paddle'],
+      include_package_data=True,
+      package_data={'py_paddle':['*.py','_swig_paddle.so']},  # ship _swig_paddle.so as package data
+      install_requires = [
+        'nltk>=3.2.2',
+        'numpy>=1.8.0',      # numpy is required.
+        'protobuf==${PROTOBUF_VERSION}'    # exact protobuf pin taken from the PROTOBUF_VERSION placeholder
+      ],
+      url='http://www.paddlepaddle.org/',
+      license='Apache 2.0',
 )
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index c3207e63ce72b73a57c2e40c72c5259f0ae61bc9..9e9e948b8856d2712f8894b3d14db9c795d5f694 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -184,7 +184,6 @@ protected:
    * @param para
    */
   virtual void updateImpl(Parameter* para) {}
-  virtual void update(Parameter* para) {}
 };
 
 /**
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 13aa28ae5d9699d267858d48e46797c756487ddd..80664fa877b324af73e3e3effa11e46eac6294e2 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -208,7 +208,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
     return 0.0;  // In this case, there is no meaning to calculate cost
   }
 
-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
 }
 
 void Tester::testOnePassBatch(int passId) {
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 8465addaf9e03831e914be2c73901c3b1a9d537f..b68e29cd5ea223272151e7a8b52d998832f47103 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -90,16 +90,6 @@ DEFINE_string(model_list, "", "File that saves the model list when evaluation");
 
 namespace paddle {
 
-void Trainer::init(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-
-  init(config);
-}
-
 void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
                    bool testing,
                    const std::shared_ptr<GradientMachine>& gradientMachine,
@@ -320,7 +310,7 @@ real Trainer::checkGradient() {
   std::vector<Argument> outArgs;
 
   trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
   LOG(INFO) << "original cost=" << cost;
   trainerInternal_.getGradientMachine()->backward();
 
@@ -350,7 +340,7 @@ real Trainer::checkGradient() {
     parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
     parameter->setValueUpdated();
     trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost1 = Argument::sumCosts(outArgs);
+    real newCost1 = Argument::sum(outArgs);
 
     for (size_t i = 0; i < dim; ++i) {
       newp[i] = oldp[i] - step * d[i];
@@ -359,7 +349,7 @@ real Trainer::checkGradient() {
     parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
     parameter->setValueUpdated();
     trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost2 = Argument::sumCosts(outArgs);
+    real newCost2 = Argument::sum(outArgs);
 
     real trueDelta = 0.5 * (newCost1 - newCost2);
     real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
@@ -585,7 +575,7 @@ real Trainer::calcGradient(const DataBatch& dataBatch,
 
   trainerInternal_.getGradientMachine()->forwardBackward(
       inArgs, &outArgs, PASS_TRAIN);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
 
   offset = 0;
   for (auto& para : parameters) {
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index 7cbf18ace7a5fed053653c73e62d36c388b15123..fac589d1d711affcd008f90edf87d865c8362f69 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -30,10 +30,6 @@ limitations under the License. */
 #include "TrainerConfigHelper.h"
 #include "TrainerInternal.h"
 
-#ifdef PADDLE_METRIC_LEARNING
-#include "paddle/internals/metric_learning/MetricTrainer.h"
-#endif
-
 DECLARE_int32(num_passes);
 
 namespace paddle {
@@ -71,11 +67,6 @@ public:
       const std::shared_ptr<DataProvider>& dataProvider = nullptr,
       const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
 
-  /**
-   * Initialize Trainer from command line flags.
-   */
-  void init(int argc, char** argv);
-
   /**
    * Train until num_passes reached.
    * One pass means neural network train through all training data.
@@ -206,12 +197,8 @@ protected:
   // parameter util
   std::unique_ptr<ParameterUtil> paramUtil_;
 
-#ifdef PADDLE_METRIC_LEARNING
-  MetricTrainer trainerInternal_;
-#else
   // trainer Internal
   TrainerInternal trainerInternal_;
-#endif
 };
 
 }  // namespace paddle
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index f3b465b444167d4624a5e99c30e1257eda53ca2c..4c5d4a0913aaf3a9932b3d67806378ece4245304 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -134,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
   real cost = 0;
   {
     REGISTER_TIMER("sumCost");
-    cost = Argument::sumCosts(*outArgs);
+    cost = Argument::sum(*outArgs);
   }
 
   if (batchId % intconfig_->log_period == 0) {
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index e2fbd21e14afa7c89b82999b08bf91c1de182906..c5c1d484e5f85c774fd4b8f1d4a8d46abfa2f547 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/pserver/ParameterServer2.h"
-#include "paddle/utils/Common.h"
+#include <fenv.h>
+#include "paddle/pserver/ParameterServerController.h"
 #include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/StringUtil.h"
 
 #include "ParamUtil.h"
 #include "Trainer.h"
-#include "paddle/pserver/RDMANetwork.h"
 
 DEFINE_bool(start_pserver, false, "Whether to start pserver");
 DECLARE_int32(gpu_id);
@@ -38,54 +36,11 @@ int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
 
-  std::vector<std::unique_ptr<ParameterServer2>> pservers;
-  std::vector<std::string> devices;
-
+  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
   if (FLAGS_start_pserver) {
-    // round robin to loadbalance RDMA server ENGINE
-    int rdmaCpu = 0;
-    int onlineCpus = rdma::numCpus();
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    if (FLAGS_nics.empty()) {
-      pservers.resize(numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i));
-        }
-
-        CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                   << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << FLAGS_port + i;
-        pservers[i]->start();
-      }
-    } else {
-      str::split(FLAGS_nics, ',', &devices);
-      pservers.resize(devices.size() * numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        for (size_t j = 0; j < devices.size(); ++j) {
-          if (FLAGS_rdma_tcp == "rdma") {
-            pservers[i * devices.size() + j].reset(new ParameterServer2(
-                getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-            rdmaCpu = rdmaCpu % onlineCpus;
-          } else {
-            pservers[i * devices.size() + j].reset(
-                new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-          }
-
-          CHECK(pservers[i * devices.size() + j]->init())
-              << "Fail to initialize parameter server" << devices[j]
-              << FLAGS_port + i;
-          LOG(INFO) << "pserver started : " << devices[j] << ":"
-                    << FLAGS_port + i;
-          pservers[i * devices.size() + j]->start();
-        }
-      }
-    }
+    parameterServerPtr.reset(
+        paddle::ParameterServerController::createFromGflags());
+    parameterServerPtr->start();
   }
   Trainer trainer;
   auto config = TrainerConfigHelper::createFromFlags();
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 22e07bd0e98a4cd36e6ed5860bcff0d4ae7cb1d2..08b2d8a38e2d20a357752269bd3ee3f515116abd 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -1,11 +1,3 @@
-################# test_Prediction ######################
-add_unittest_without_exec(test_Prediction
-    test_Prediction.cpp)
-add_test(NAME test_Prediction
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Prediction --merger=${CMAKE_CURRENT_BINARY_DIR}/../paddle_merge_model
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
 ################# test_Compare ############################
 add_unittest_without_exec(test_Compare
     test_Compare.cpp)
@@ -25,14 +17,17 @@ add_test(NAME test_Trainer
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
 ############### test_TrainerOnePass ##########################
-add_unittest_without_exec(test_TrainerOnePass
-    test_TrainerOnePass.cpp)
-add_test(NAME test_TrainerOnePass
-  COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-        ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
-        ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
+if(WITH_PYTHON)
+  # only run test_TrainerOnePass when PYTHON is enabled, because train one pass
+  # is using PyDataProvider2.
+  add_unittest_without_exec(test_TrainerOnePass
+      test_TrainerOnePass.cpp)
+  add_test(NAME test_TrainerOnePass
+    COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
+          ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
+          ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+      WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+endif()
 ################ test_CompareTwoNets ######################
 add_unittest_without_exec(test_CompareTwoNets
     test_CompareTwoNets.cpp)
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index 23bfa164080a6ea392bb6ee15e7e2bec25257ce9..4aa64961d096ce94a4187fe94000b05de4080122 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -1059,14 +1059,14 @@ inline bool operator==(const value& x, const value& y) {
 }
 
 inline bool operator!=(const value& x, const value& y) { return !(x == y); }
-}
+}  // namespace picojson
 
 namespace std {
 template <>
 inline void swap(picojson::value& x, picojson::value& y) {
   x.swap(y);
 }
-}
+}  // namespace std
 
 inline std::istream& operator>>(std::istream& is, picojson::value& x) {
   picojson::set_last_error(std::string());
diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp
deleted file mode 100644
index 0c79404eee1c0902c5c8e8eefd139da3da584636..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-
-#include "paddle/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-
-DECLARE_string(config);
-DECLARE_string(config_args);
-DEFINE_string(merger,
-              "./paddle_merge_model",
-              "path to paddle_merge_model binary");
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile = "trainer/tests/sample_trainer_config.conf";
-static const string& mergedModelFile = "./test_model_file";
-static const string& modelDir = "./test_model_dir";
-
-void checkBuffer(real* vec1, real* vec2, size_t len) {
-  for (size_t i = 0; i < len; i++) {
-    EXPECT_EQ(vec1[i], vec2[i]) << "vec1:" << vec1[i] << " vec2:" << vec2[i];
-  }
-}
-
-void checkParameters(vector<ParameterPtr> A, vector<ParameterPtr> B) {
-  CHECK_EQ(B.size(), A.size()) << "parameter size not equal";
-  for (size_t i = 0; i < A.size(); i++) {
-    auto vec1 = A[i]->getBuf(PARAMETER_VALUE);
-    auto vec2 = B[i]->getBuf(PARAMETER_VALUE);
-    CHECK_EQ(vec1->useGpu_, vec2->useGpu_) << "use gpu not equal";
-    CHECK_EQ(vec1->getSize(), vec2->getSize()) << "size not equal";
-
-    if (vec1->useGpu_ == false) {
-      checkBuffer(vec1->getData(), vec2->getData(), vec1->getSize());
-    } else {
-      VectorPtr cpuVec1 = Vector::create(vec1->getSize(), false);
-      VectorPtr cpuVec2 = Vector::create(vec2->getSize(), false);
-      cpuVec1->copyFrom(*vec1, HPPL_STREAM_DEFAULT);
-      cpuVec2->copyFrom(*vec2, HPPL_STREAM_DEFAULT);
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      checkBuffer(cpuVec1->getData(), cpuVec2->getData(), cpuVec1->getSize());
-    }
-  }
-}
-
-TEST(GradientMachine, create) {
-#ifdef PADDLE_ONLY_CPU
-  FLAGS_use_gpu = false;
-#endif
-  mkDir(modelDir.c_str());
-  FLAGS_config = configFile;
-  FLAGS_config_args = "with_cost=False";
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-
-  // save model to directory
-  unique_ptr<GradientMachine> gradientMachine1(
-      GradientMachine::create(*config));
-  gradientMachine1->saveParameters(modelDir);
-  Trainer trainer;
-  trainer.init(config);
-  ParameterUtil* paramUtil = trainer.getParameterUtilPtr();
-  if (paramUtil != NULL) {
-    paramUtil->saveConfigWithPath(modelDir);
-  }
-
-  // create a different GradientMachine
-  unique_ptr<GradientMachine> gradientMachine2(
-      GradientMachine::create(*config));
-  gradientMachine2->randParameters();
-
-  // merge config and model to one file
-  string cmd = FLAGS_merger + " --model_dir=" + modelDir +
-               " --config_args=with_cost=False" + " --model_file=" +
-               mergedModelFile;
-  LOG(INFO) << cmd;
-  int ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  // create GradientMachine from the merged model
-  DataConfig dataConfig;
-  unique_ptr<GradientMachine> gradientMachine3(
-      GradientMachine::create(mergedModelFile, &dataConfig));
-  CHECK(gradientMachine3);
-  EXPECT_EQ(dataConfig.type(), "simple");
-  EXPECT_EQ(dataConfig.feat_dim(), 3);
-
-  // compare the parameters of GradientMachine and GradientMachine3
-  std::vector<ParameterPtr> paraMachine1 = gradientMachine1->getParameters();
-  std::vector<ParameterPtr> paraMachine3 = gradientMachine3->getParameters();
-  checkParameters(paraMachine1, paraMachine3);
-
-  // Test that the GradientMachine created from the merged model
-  // is same as the orginnal one.
-  vector<Argument> inArgs(1);
-  vector<Argument> outArgs;
-
-  int inputDim = 3;
-  int numSamples = 2;
-  CpuMatrix cpuInput(numSamples, inputDim);
-  for (int i = 0; i < numSamples; ++i) {
-    for (int j = 0; j < inputDim; ++j) {
-      cpuInput.getData()[i * inputDim + j] =
-          rand() / (real)RAND_MAX;  // NOLINT TODO(yuyang): use rand_r
-    }
-  }
-  MatrixPtr input = Matrix::create(numSamples,
-                                   inputDim,
-                                   /* trans */ false,
-                                   FLAGS_use_gpu);
-  input->copyFrom(cpuInput);
-  inArgs[0].value = input;
-  gradientMachine1->forward(inArgs, &outArgs, PASS_TEST);
-  EXPECT_EQ((size_t)1, outArgs.size());
-
-  vector<Argument> outArgs2;
-  gradientMachine2->forward(inArgs, &outArgs2, PASS_TEST);
-  CpuMatrix out1(outArgs[0].value->getHeight(), outArgs[0].value->getWidth());
-  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
-  out1.copyFrom(*outArgs[0].value);
-  out2.copyFrom(*outArgs2[0].value);
-  for (size_t i = 0; i < out1.getHeight() * out1.getWidth(); i++) {
-    EXPECT_NE(out1.getData()[i], out2.getData()[i]);
-  }
-
-  gradientMachine3->forward(inArgs, &outArgs2, PASS_TEST);
-  out2.copyFrom(*outArgs2[0].value);
-  checkBuffer(
-      out1.getData(), out2.getData(), out2.getHeight() * out2.getWidth());
-
-  cmd = " rm -rf " + modelDir + "/*";
-  LOG(INFO) << "cmd " << cmd;
-  ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  cmd = " rm -rf " + mergedModelFile;
-  LOG(INFO) << "cmd " << cmd;
-  ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  // clean up
-  rmDir(modelDir.c_str());
-  remove(mergedModelFile.c_str());
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
index 956b606a18cae1bb11322accfa174ae5ce1580de..f2cfd7409412de68f4183daebcb48e7a3ae37672 100644
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
@@ -1,2 +1 @@
 enable_virtualenv.c
-PythonUtil.cpp
diff --git a/paddle/utils/Any.h b/paddle/utils/Any.h
new file mode 100644
index 0000000000000000000000000000000000000000..99a0139accc4988f1e4cce45eeb688a6603c2c31
--- /dev/null
+++ b/paddle/utils/Any.h
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#if __cplusplus > 201402L
+#include <any>
+
+namespace paddle {
+// using std::any for C++ 17
+using std::any;
+using std::any_cast;
+using std::bad_any_cast;
+}  // namespace paddle
+
+#else
+#include <any.hpp>
+
+namespace paddle {
+// use linb::any for C++ 11
+using linb::any;
+using linb::any_cast;
+using linb::bad_any_cast;
+}  // namespace paddle
+#endif
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 10d906ee16656a808122b81d8b2fef55b8e7b7e9..af59951752d1799c95e293d3eae233e6aa26e5f3 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,11 +1,9 @@
 # The utilities for paddle
-
-configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp)
-
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
-create_resources(enable_virtualenv.py enable_virtualenv.c)
-set(UTIL_RES enable_virtualenv.c)
+create_resources(${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.py
+  ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c)
+set(UTIL_RES ${CMAKE_CURRENT_SOURCE_DIR}/enable_virtualenv.c)
 
 if(APPLE)
     file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..cebca5a2a3766110b83231eb0705e48800a7bda6
--- /dev/null
+++ b/paddle/utils/Compiler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+/**
+ * This header defines useful compiler-specific attributes. It is an
+ * abstraction layer over the supported compilers.
+ */
+#ifdef __GNUC__
+#define GCC_VERSION \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION 0  // non-GNU compiler: keep #if comparisons valid
+#endif
+
+/**
+ * __must_check macro. It requires the function's return value to be used;
+ * otherwise the compiler raises a warning. Paddle treats all compile
+ * warnings as errors.
+ */
+#if GCC_VERSION >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp
index 8eefdd2980e7f56a836df6fd2ff8c31b81a55555..edd33c454122d95078e0fde2a2e9d68903951ee8 100644
--- a/paddle/utils/CpuId.cpp
+++ b/paddle/utils/CpuId.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 /// for MSVC
 #define CPUID(info, x) __cpuidex(info, x, 0)
 
-#else
+#elif !defined(__ANDROID__)
 
 #include <cpuid.h>
 
@@ -31,6 +31,7 @@ limitations under the License. */
 namespace paddle {
 
 SIMDFlags::SIMDFlags() {
+#if !defined(__ANDROID__)
   unsigned int cpuInfo[4];
   // CPUID: https://en.wikipedia.org/wiki/CPUID
   // clang-format off
@@ -51,6 +52,9 @@ SIMDFlags::SIMDFlags() {
   CPUID(cpuInfo, 0x80000001);
   simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
   // clang-fotmat on
+#else
+  simd_flags_ = SIMD_NEON;
+#endif
 }
 
 SIMDFlags const* SIMDFlags::instance() {
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 0f3985cc7b2c018ede9bba9644d2d096561dccee..869be5be541dafd699a87a8e8893aadadf59b711 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 
 #include "Common.h"
+#include "Error.h"
 
 namespace paddle {
 
@@ -29,6 +30,7 @@ enum simd_t {
   SIMD_AVX    = 1 << 8,     ///< AVX
   SIMD_AVX2   = 1 << 9,     ///< AVX 2
   SIMD_AVX512 = 1 << 10,    ///< AVX 512
+  SIMD_NEON   = 1 << 11,    ///< NEON
 };
 // clang-format on
 
@@ -95,6 +97,40 @@ private:
 #define HAS_AVX     HAS_SIMD(SIMD_AVX)
 #define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
 #define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
+#define HAS_NEON    HAS_SIMD(SIMD_NEON)
 // clang-format on
 
+/**
+ * Invoke checkCPUFeature() before Paddle initialization to check whether
+ * the target machine supports the instruction sets Paddle was compiled with.
+ * If it does not, an Error describing the mismatch is returned.
+ */
+inline Error __must_check checkCPUFeature() {
+  Error err;
+#ifndef __AVX__
+  if (HAS_AVX) {
+    LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, "
+                 << "but these are available on your machine and could "
+                 << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON";
+  }
+#else
+  if (!HAS_AVX) {
+    err = Error(
+        "PaddlePaddle was compiled to use avx instructions, "
+        "but these aren't available on your machine, please "
+        "disable it via CMAKE .. -DWITH_AVX=OFF");
+  }
+#endif  // __AVX__
+#ifdef __SSE3__
+  if (!HAS_SSE3) {
+    err = Error(
+        "PaddlePaddle was compiled to use sse3 instructions, "
+        "which is the minimum requirement of PaddlePaddle. "
+        "But these aren't available on your current machine.");
+  }
+#endif  // __SSE3__
+
+  return err;
+}
+
 }  // namespace paddle
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/utils/DynamicLoader.cpp
similarity index 77%
rename from paddle/cuda/src/hl_dso_loader.cc
rename to paddle/utils/DynamicLoader.cpp
index c92909de534a875028d6d4784b02f08648c85a9a..5604a90038b06d2c1a4d9db70e4185cddfd25d3e 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/utils/DynamicLoader.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_dso_loader.h"
+#include "DynamicLoader.h"
 #include <gflags/gflags.h>
-#include "paddle/utils/Logging.h"
+#include "Logging.h"
 
 DEFINE_string(cudnn_dir,
               "",
@@ -25,13 +25,13 @@ DEFINE_string(cudnn_dir,
 DEFINE_string(cuda_dir,
               "",
               "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
-              "libcudart can not be specified by cuda_dir, since some "
-              "build-in function in cudart already ran before main entry). "
-              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
 
 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
+DEFINE_string(lapack_dir, "", "Specify path for loading liblapacke.so.");
+
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
   // directory separator
@@ -52,7 +52,7 @@ static inline std::string join(const std::string& part1,
 static inline void GetDsoHandleFromDefaultPath(std::string& dso_path,
                                                void** dso_handle,
                                                int dynload_flags) {
-  VLOG(3) << "Try to find cuda library: " << dso_path
+  VLOG(3) << "Try to find library: " << dso_path
           << " from default system path.";
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   *dso_handle = dlopen(dso_path.c_str(), dynload_flags);
@@ -104,20 +104,9 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
   CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
                                 << " (" << dlerror() << ") \n"
                                 << "Please specify its path correctly using "
-                                   "one of the following ways: \n"
-
-                                << "Method 1. set cuda and cudnn lib path at "
-                                   "runtime. "
-                                << "http://www.paddlepaddle.org/doc/ui/"
-                                   "cmd_argument/"
-                                   "argument_outline.html \n"
-                                << "For instance, issue command: paddle train "
-                                   "--use_gpu=1 "
-                                << "--cuda_dir=/usr/local/cuda/lib64 "
-                                   "--cudnn_dir=/usr/local/cudnn/lib "
-                                   "...\n"
-
-                                << "Method 2. set environment variable "
+                                   "following ways: \n"
+
+                                << "Method. set environment variable "
                                    "LD_LIBRARY_PATH on Linux or "
                                 << "DYLD_LIBRARY_PATH on Mac OS. \n"
                                 << "For instance, issue command: export "
@@ -126,9 +115,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
                                 << "Note: After Mac OS 10.11, using the "
                                    "DYLD_LIBRARY_PATH is impossible "
                                 << "unless System Integrity Protection (SIP) "
-                                   "is disabled. However, "
-                                   "method 1 "
-                                << "always work well.";
+                                   "is disabled.";
 }
 
 void GetCublasDsoHandle(void** dso_handle) {
@@ -147,14 +134,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
 #endif
 }
 
-void GetCudartDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
-#endif
-}
-
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
   GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
@@ -170,3 +149,11 @@ void GetWarpCTCDsoHandle(void** dso_handle) {
   GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle);
 #endif
 }
+
+void GetLapackDsoHandle(void** dso_handle) {
+#if defined(__APPLE__) || defined(__OSX__)
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.dylib", dso_handle);
+#else
+  GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapacke.so", dso_handle);
+#endif
+}
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/utils/DynamicLoader.h
similarity index 87%
rename from paddle/cuda/include/hl_dso_loader.h
rename to paddle/utils/DynamicLoader.h
index 20c13f21e61a92b0635b686f6f724ae2b44518cc..9b5ad21724afd7176f958619e7e10d12dc08fa49 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef HL_DSO_LOADER_H_
-#define HL_DSO_LOADER_H_
+#ifndef DYNAMIC_LOAD_H_
+#define DYNAMIC_LOAD_H_
 
 #include <dlfcn.h>
 #include <memory>
+#include <mutex>
 #include <string>
-#include "hl_base.h"
 
 /**
  * @brief    load the DSO of CUBLAS
@@ -37,27 +37,27 @@ void GetCublasDsoHandle(void** dso_handle);
 void GetCudnnDsoHandle(void** dso_handle);
 
 /**
- * @brief    load the DSO of CUDA Run Time
+ * @brief    load the DSO of CURAND
  *
  * @param    **dso_handle   dso handler
  *
  */
-void GetCudartDsoHandle(void** dso_handle);
+void GetCurandDsoHandle(void** dso_handle);
 
 /**
- * @brief    load the DSO of CURAND
+ * @brief    load the DSO of warp-ctc
  *
  * @param    **dso_handle   dso handler
  *
  */
-void GetCurandDsoHandle(void** dso_handle);
+void GetWarpCTCDsoHandle(void** dso_handle);
 
 /**
- * @brief    load the DSO of warp-ctc
+ * @brief    load the DSO of lapack
  *
  * @param    **dso_handle   dso handler
  *
  */
-void GetWarpCTCDsoHandle(void** dso_handle);
+void GetLapackDsoHandle(void** dso_handle);
 
-#endif  // HL_DSO_LOADER_H_
+#endif  // DYNAMIC_LOAD_H_
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
new file mode 100644
index 0000000000000000000000000000000000000000..cda1b5c37dada8d0c6c77fc2fb03bb614d5301b5
--- /dev/null
+++ b/paddle/utils/Error.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include "Compiler.h"
+
+namespace paddle {
+
+/**
+ * Error is Paddle's error type. It only contains a std::string error message.
+ *
+ *
+ * There are two styles to return error in Paddle.
+ *
+ * 1. Return Error
+ *    When a method returns an Error, it must be declared with `__must_check`.
+ *    See the example below.
+ * @code{cpp}
+ * Error __must_check foo();
+ *
+ * Error __must_check bar() {
+ *   // do something.
+ *   Error err = foo();  // invoke other method return status.
+ *   if (err) return err;
+ *   // do something else.
+ *   return Error();
+ * }
+ * @endcode{cpp}
+ *
+ * 2. Return by parameter.
+ *    It is another way to return an error, by using a pointer parameter.
+ *    Example as below.
+ *
+ * @code{cpp}
+ * Error bar();
+ *
+ * int foo(Error* error) {
+ *   // Do something.
+ *   Error err = bar();
+ *   if (err) {
+ *     *error = err;
+ *     return 0;
+ *   }
+ *   // Do something else.
+ *   if (someInternalErrorHappened) {
+ *     *error = Error("Some dimension is too large, %d", dimension);
+ *     return 0;
+ *   }
+ *   // End of method.
+ *   return someValue;
+ * }
+ *
+ * Error foobar() {
+ *   Error err;
+ *   // do something.
+ *   foo(&err);
+ *   return err;
+ * }
+ * @endcode{cpp}
+ *
+ *
+ * Currently Error has a helper method 'check', because Paddle has so far used
+ * log(FATAL) or CHECK to make the program exit on errors. Once all log(FATAL)
+ * and CHECK calls in Paddle are cleaned up, 'check' will be removed.
+ */
+class Error {
+public:
+  /**
+   * Construct a no-error value (no message is stored).
+   */
+  Error() {}
+
+  /**
+   * @brief Create an Error from a printf-style format string and arguments.
+   */
+  explicit Error(const char* fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    constexpr size_t kBufferSize = 1024;  // longer messages are truncated
+    char buffer[kBufferSize];
+    vsnprintf(buffer, kBufferSize, fmt, ap);
+    this->msg_.reset(new std::string(buffer));
+    va_end(ap);
+  }
+
+  /**
+   * @brief Return the error message, or nullptr if there is no error.
+   */
+  const char* msg() const {
+    if (msg_) {
+      return msg_->c_str();
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * @brief operator bool, returns true if an error is set (i.e. !isOK()).
+   */
+  operator bool() const { return !this->isOK(); }
+
+  /**
+   * @brief isOK tells whether this object carries no error.
+   * @return true if no error message is stored.
+   */
+  bool isOK() const { return msg_ == nullptr; }
+
+  /**
+   * @brief Check this error with glog's CHECK, streaming msg() on failure.
+   * @note It is a temporary method used while cleaning up Paddle code. It
+   *       will be removed later.
+   */
+  void check() const { CHECK(this->isOK()) << msg(); }
+
+private:
+  std::shared_ptr<std::string> msg_;  // null when there is no error
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 59d6cbdc513660b87cb013d8aa92c5c8f9289ecb..320f671ed97dbadc4fa1b4b52d5611cf9239e7dd 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -30,15 +30,17 @@ DEFINE_bool(parallel_nn,
 DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
 DEFINE_int32(gpu_id, 0, "Which gpu core to use");
 DEFINE_int32(port, 20134, "Listening port for pserver");
-DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
 DEFINE_int32(ports_num,
              1,
-             "The ports number for parameter send,"
-             " increment based on default port number");
+             "Number of ports for sending dense parameter,"
+             " following ports on parameter server will be visited"
+             " for sending dense parameter: [port, port+ports_num-1]");
 DEFINE_int32(ports_num_for_sparse,
              0,
-             "The ports number for parameter send,"
-             " increment based on default (port + ports_num)");
+             "Number of ports for sending sparse parameter,"
+             " following ports on parameter server will be visited"
+             " for sending sparse parameter:"
+             " [port+ports_num, port+ports_num+ports_num_for_sparse-1]");
 DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
 DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
 DEFINE_int32(trainer_id,
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index 3e72f8356d883b353127ccae80f2881320d20b2b..dc4faef8331ed47b9ce3e952389b6469cd9fda2e 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -19,7 +19,6 @@ limitations under the License. */
 DECLARE_bool(parallel_nn);
 DECLARE_int32(async_count);
 DECLARE_int32(port);
-DECLARE_int32(data_server_port);
 DECLARE_bool(use_gpu);
 DECLARE_int32(gpu_id);
 DECLARE_int32(trainer_count);
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
index 707346f2c76e59b50722f4f8805ebe56c3cf861b..0ec1c28dfbb2a7db9fa84c9eb2bc4dad806b78e9 100644
--- a/paddle/utils/GlobalConstants.h
+++ b/paddle/utils/GlobalConstants.h
@@ -23,11 +23,6 @@ enum PassType {
   PASS_TEST,    // Test pass
   PASS_GC,      // Gradient Check pass
   PASS_METRIC,  // pass for generate template output with no drop rate.
-  // pass for metric learning training with metric learning error, only used
-  // when we are doing KNN evaluation.
-  PASS_METRIC_TRAIN,
-  PASS_METRIC_TRAIN_WITH_NOERROR,  // Pass for metric learning training
-                                   // with no evaluation.
 };
 
 enum ParameterType {
diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp
index 5a1c6ecb2219f7983609c27f3215c7fc1e9e9ef2..ea96bad240ad81c4c29b7dab35b015549052e2bb 100644
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
@@ -18,6 +18,7 @@ limitations under the License. */
  */
 
 #include "Logging.h"
+#include <cstdlib>
 
 namespace paddle {
 
diff --git a/paddle/utils/PythonUtil.cpp.in b/paddle/utils/PythonUtil.cpp
similarity index 98%
rename from paddle/utils/PythonUtil.cpp.in
rename to paddle/utils/PythonUtil.cpp
index e0caaf4cd6cf429e57ee221a0b0957a905b89973..7faeff55c28b9065179ad27b3b604a9f411249e5 100644
--- a/paddle/utils/PythonUtil.cpp.in
+++ b/paddle/utils/PythonUtil.cpp
@@ -195,8 +195,6 @@ extern const char enable_virtualenv_py[];
 }
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
-  char PythonHome[] = "@PYTHON_INSTALL_DIR@"; // NOLINT
-  Py_SetPythonHome(PythonHome);
   Py_SetProgramName(argv[0]);
   Py_Initialize();
   PySys_SetArgv(argc, argv);
diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h
index 0b4f4c9113ae9d714b634b67931e51b408bbe777..95f071cb7de87d87f6988c136d7993c66fa9dde1 100644
--- a/paddle/utils/StringUtil.h
+++ b/paddle/utils/StringUtil.h
@@ -54,6 +54,25 @@ inline T toWithStatus(const std::string& s, bool* ok = nullptr) {
   return v;
 }
 
+/**
+ * Cast type T to string with status.
+ *
+ * @param [in] v input value of type T.
+ * @param [out] ok status, set to true if there is no error in casting. Pass
+ *              nullptr if the caller does not care about errors.
+ * @return result of casting, i.e. whatever was written to the stream; check
+ *              ok to detect a failed conversion.
+ */
+template <class T>
+inline std::string toWithStatus(const T v, bool* ok = nullptr) {
+  std::ostringstream sout;
+  sout << v;
+  if (ok) {
+    *ok = !sout.fail();
+  }
+  return sout.str();
+}
+
 /// Convert string to type T. It makes sure all the characters in s are used.
 /// Otherwise it will abort.
 ///
@@ -67,6 +86,18 @@ inline T to(const std::string& s) {
   return v;
 }
 
+/// Convert type T to string.
+///
+/// @tparam T type of input value
+/// @param v input value of type T
+template <class T>
+std::string to_string(T v) {
+  bool ok;
+  std::string s = toWithStatus<T>(v, &ok);
+  CHECK(ok) << "Cannot convert v(" << v << ") to type std::string";
+  return s;
+}
+
 }  // namespace str
 
 #undef DEFINE_STRING_CONVERSION
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 411a64aa8d0737a8d57e62fbd0788ffaacfbc9f7..b18b73e06a6c39c3bf9717280bc6323917c80efb 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -15,17 +15,23 @@ limitations under the License. */
 #include "Util.h"
 
 #include <dirent.h>
-#include <pmmintrin.h>
 #include <signal.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+
+#ifdef __SSE__
 #include <xmmintrin.h>
+#endif
+#ifdef __SSE3__
+#include <pmmintrin.h>
+#endif
 
 #include <fstream>
 #include <mutex>
 
 #include <gflags/gflags.h>
 
+#include "CpuId.h"
 #include "CustomStackTrace.h"
 #include "Logging.h"
 #include "StringUtil.h"
@@ -144,26 +150,30 @@ void runInitFunctions() {
 }
 
 void initMain(int argc, char** argv) {
-  initializeLogging(argc, argv);
   installLayerStackTracer();
   std::string line;
   for (int i = 0; i < argc; ++i) {
     line += argv[i];
     line += ' ';
   }
-  LOG(INFO) << "commandline: " << line;
 
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
+  initializeLogging(argc, argv);
+  LOG(INFO) << "commandline: " << line;
   CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
 
   installProfilerSwitch();
 
+#ifdef __SSE__
   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
+#endif
+#ifdef __SSE3__
   _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);
+#endif
 
   if (FLAGS_seed == 0) {
     unsigned int t = time(NULL);
@@ -185,6 +195,7 @@ void initMain(int argc, char** argv) {
   }
 
   version::printVersion();
+  checkCPUFeature().check();
   runInitFunctions();
 }
 
@@ -289,6 +300,7 @@ void mkDir(const char* filename) {
 void mkDirRecursively(const char* dir) {
   struct stat sb;
 
+  if (*dir == 0) return;  // empty string
   if (!stat(dir, &sb)) return;
 
   mkDirRecursively(path::dirname(dir).c_str());
diff --git a/paddle/utils/arch/linux/Locks.cpp b/paddle/utils/arch/linux/Locks.cpp
index 2a6f96e04d024ac3977bc154dbeeb69ce9ab3a5d..310c9a6542563891d4ba5888e58406ea28d6a2ce 100644
--- a/paddle/utils/arch/linux/Locks.cpp
+++ b/paddle/utils/arch/linux/Locks.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Locks.h"
 #include <semaphore.h>
 #include <unistd.h>
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 class SemaphorePrivate {
@@ -26,7 +27,10 @@ Semaphore::Semaphore(int initValue) : m(new SemaphorePrivate()) {
   sem_init(&m->sem, 0, initValue);
 }
 
-Semaphore::~Semaphore() { sem_destroy(&m->sem); }
+Semaphore::~Semaphore() {
+  sem_destroy(&m->sem);
+  delete m;
+}
 
 bool Semaphore::timeWait(struct timespec* ts) {
   return (0 == sem_timedwait(&m->sem, ts));
@@ -36,36 +40,101 @@ void Semaphore::wait() { sem_wait(&m->sem); }
 
 void Semaphore::post() { sem_post(&m->sem); }
 
+#ifdef PADDLE_USE_PTHREAD_SPINLOCK
+
 class SpinLockPrivate {
 public:
   inline SpinLockPrivate() { pthread_spin_init(&lock_, 0); }
   inline ~SpinLockPrivate() { pthread_spin_destroy(&lock_); }
+
+  inline void lock() { pthread_spin_lock(&lock_); }
+  inline void unlock() { pthread_spin_unlock(&lock_); }
+
   pthread_spinlock_t lock_;
   char padding_[64 - sizeof(pthread_spinlock_t)];
 };
 
-SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
+#else
 
-SpinLock::~SpinLock() { delete m; }
+#include <atomic>
+class SpinLockPrivate {
+public:
+  inline void lock() {
+    while (lock_.test_and_set(std::memory_order_acquire)) {
+    }
+  }
+  inline void unlock() { lock_.clear(std::memory_order_release); }
+
+  std::atomic_flag lock_ = ATOMIC_FLAG_INIT;
+  char padding_[64 - sizeof(lock_)];  // Padding to cache line size
+};
 
-void SpinLock::lock() { pthread_spin_lock(&m->lock_); }
+#endif
 
-void SpinLock::unlock() { pthread_spin_unlock(&m->lock_); }
+SpinLock::SpinLock() : m(new SpinLockPrivate()) {}
+SpinLock::~SpinLock() { delete m; }
+void SpinLock::lock() { m->lock(); }
+void SpinLock::unlock() { m->unlock(); }
+
+#ifdef PADDLE_USE_PTHREAD_BARRIER
 
 class ThreadBarrierPrivate {
 public:
   pthread_barrier_t barrier_;
+
+  inline explicit ThreadBarrierPrivate(int count) {
+    pthread_barrier_init(&barrier_, nullptr, count);
+  }
+
+  inline ~ThreadBarrierPrivate() { pthread_barrier_destroy(&barrier_); }
+
+  inline void wait() { pthread_barrier_wait(&barrier_); }
 };
 
-ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate()) {
-  pthread_barrier_init(&m->barrier_, nullptr, count);
-}
+#else
 
-ThreadBarrier::~ThreadBarrier() {
-  pthread_barrier_destroy(&m->barrier_);
-  delete m;
-}
+class ThreadBarrierPrivate {
+public:
+  pthread_mutex_t mutex_;
+  pthread_cond_t cond_;
+  int count_;                    // threads that have arrived in this cycle
+  int tripCount_;                // arrivals needed to release the barrier
+  unsigned int generation_ = 0;  // bumped each time the barrier trips
+
+  inline explicit ThreadBarrierPrivate(int cnt) : count_(0), tripCount_(cnt) {
+    CHECK_NE(cnt, 0);
+    CHECK_EQ(pthread_mutex_init(&mutex_, 0), 0);  // 0 is the only success code
+    CHECK_EQ(pthread_cond_init(&cond_, 0), 0);
+  }
+
+  inline ~ThreadBarrierPrivate() {
+    pthread_cond_destroy(&cond_);
+    pthread_mutex_destroy(&mutex_);
+  }
+
+  /**
+   * @brief wait; loops on generation_ to ignore spurious condvar wakeups
+   * @return true if the last wait
+   */
+  inline bool wait() {
+    pthread_mutex_lock(&mutex_);
+    const unsigned int gen = generation_;
+    const bool last = (++count_ >= tripCount_);
+    if (last) {
+      count_ = 0;
+      ++generation_;  // releases every waiter that captured the old value
+      pthread_cond_broadcast(&cond_);
+    }
+    while (!last && gen == generation_) pthread_cond_wait(&cond_, &mutex_);
+    pthread_mutex_unlock(&mutex_);
+    return last;
+  }
+};
+
+#endif
 
-void ThreadBarrier::wait() { pthread_barrier_wait(&m->barrier_); }
+ThreadBarrier::ThreadBarrier(int count) : m(new ThreadBarrierPrivate(count)) {}
+ThreadBarrier::~ThreadBarrier() { delete m; }
+void ThreadBarrier::wait() { m->wait(); }
 
 }  // namespace paddle
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 26fafbd1ab3f2967b765b8bcb973fb745c0e6422..aa923b355377752f9b297a125f5c43c364ba9b06 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_CustomStackTrace)
 add_simple_unittest(test_ThreadBarrier)
 add_simple_unittest(test_SpinLock)
 add_simple_unittest(test_SIMDFlags)
+add_simple_unittest(test_Error)
 
 add_executable(
     test_CustomStackTracePrint
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 378788bcecd579fff1c762702a8c27f54cee94bf..b5d9f93f1376048eabd726331006b0bb848bce11 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -19,6 +19,7 @@ limitations under the License. */
 
 #include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
 DEFINE_int32(test_thread_num, 10, "testing thread number");
@@ -69,11 +70,11 @@ TEST(CustomStackTrace, normalTrain) {
     while (countDown-- > 0) {
       start.wait();
       for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + std::to_string(i));
+        tracer.push("layer_" + paddle::str::to_string(i));
       }
       tracer.pop("");
       for (size_t i = 0; i < layerSize; ++i) {
-        tracer.pop("layer_" + std::to_string(layerSize - 1 - i));
+        tracer.pop("layer_" + paddle::str::to_string(layerSize - 1 - i));
       }
       finish.wait();
     }
@@ -89,7 +90,7 @@ TEST(CustomStackTrace, normalTest) {
     while (countDown-- > 0) {
       start.wait();
       for (size_t i = 0; i < layerSize; ++i) {
-        tracer.push("layer_" + std::to_string(i));
+        tracer.push("layer_" + paddle::str::to_string(i));
       }
       tracer.clear();  // in forward test, tracer will clear after forward.
       finish.wait();
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp
index 611b16aa7116d03ee51ba0095d043b78df1742ba..360c61c88a757da708b01d2bb54068b948b235cc 100644
--- a/paddle/utils/tests/test_CustomStackTracePrint.cpp
+++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp
@@ -13,13 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
 
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
 
   for (size_t i = 0; i < 1000; ++i) {
-    paddle::gLayerStackTrace.push("layer_" + std::to_string(i));
+    paddle::gLayerStackTrace.push("layer_" + paddle::str::to_string(i));
     if (i == 998) {
       throw "Unhandle exception";
     }
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdf326b17a1c8baa87e2a17fafae253565d1e699
--- /dev/null
+++ b/paddle/utils/tests/test_Error.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Error.h"
+
+#include <gtest/gtest.h>
+
+TEST(Error, testAll) {
+  paddle::Error error;
+  ASSERT_FALSE(error);
+  error = paddle::Error("I'm the error");
+  ASSERT_TRUE(error);
+  ASSERT_STREQ("I'm the error", error.msg());
+
+  error = paddle::Error("error2");
+  ASSERT_TRUE(error);
+  ASSERT_STREQ("error2", error.msg());
+
+  int i = 3;
+  auto error3 = paddle::Error("error%d", i);
+  ASSERT_TRUE(error3);
+  ASSERT_STREQ("error3", error3.msg());
+}
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index 8200a24ce7b7df75b48a89fbb7af15f304c5957f..185789c927be19385d6ddc7a1889b6cc56109d38 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -18,7 +18,8 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 
 TEST(SIMDFlags, gccTest) {
-#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
+#if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__)) && \
+    !defined(__arm__)
   // clang-format off
   CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
   CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
@@ -43,4 +44,5 @@ TEST(SIMDFlags, normalPrint) {
   LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
   LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
   LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
+  LOG(INFO) << "Has NEON:    " << std::boolalpha << HAS_NEON;
 }
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index e854b2b427e550ec491dacf931cc2d2cce7ba6c2..62d5b9e38b21ee82d1e78c3bde5aa5df7e4a33ee 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -4,7 +4,8 @@ set(proto_filenames
     ModelConfig.proto
     ParameterConfig.proto
     ParameterService.proto
-    TrainerConfig.proto)
+    TrainerConfig.proto
+    ParameterServerConfig.proto)
 
 set(PROTO_GEN)
 set(PROTO_GEN_PY)
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 3a9d339976fff91d79e7459ad5984cf78ea8990a..4f9b53d6f6553e55406dd000029a598a92fd2fb6 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -255,6 +255,13 @@ message PriorBoxConfig {
   repeated float variance = 4;
 }
 
+message PadConfig {
+  required ImageConfig image_conf = 1;
+  repeated uint32 pad_c = 2;
+  repeated uint32 pad_h = 3;
+  repeated uint32 pad_w = 4;
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -271,6 +278,7 @@ message LayerInputConfig {
   optional MaxOutConfig maxout_conf = 11;
   optional SppConfig spp_conf = 12;
   optional PriorBoxConfig priorbox_conf = 13;
+  optional PadConfig pad_conf = 14;
 }
 
 message LayerConfig {
@@ -419,20 +427,25 @@ message LayerConfig {
   // bias size
   optional uint32 bias_size = 48 [default = 0];
 
-  // this parameter can be used as a user-defined parameter when necessary, 
+  // this parameter can be used as a user-defined parameter when necessary,
   // without changing the proto file.
-  // e.g., when a new layer with a user-defined parameter is implemented, 
+  // e.g., when a new layer with a user-defined parameter is implemented,
   // it can be used to pass that parameter, without modifying the proto file.
   // string type is used for flexibility: different types can be converted
-  // to string and reinterpreted in the user's own layer implementation.  
+  // to string and reinterpreted in the user's own layer implementation.
   optional string user_arg = 49;
-  
+
   // to indicate rectangle image data
   optional uint64 height = 50;
   optional uint64 width = 51;
 
   // blank label used in ctc loss
   optional uint32 blank = 52 [default = 0];
+
+  // Stride parameter for seqlastins layer, AverageLayer and MaxLayer; it
+  // controls the scope of the pooling operation and can be set to a value > 0.
+  // Leave it unset or set it to -1 to disable stride pooling.
+  optional int32 seq_pool_stride = 53 [default = -1];
 }
 
 message EvaluatorConfig {
@@ -467,6 +480,10 @@ message EvaluatorConfig {
   // Used by ChunkEvaluator
   // chunk of these types are not counted
   repeated int32 excluded_chunk_types = 12;
+
+  // Used by ClassificationErrorEvaluator
+  // top # classification error
+  optional int32 top_k = 13 [default = 1];
 }
 
 message LinkConfig {
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
new file mode 100644
index 0000000000000000000000000000000000000000..404f9613792653dda72eeb98f022851adedbfbfd
--- /dev/null
+++ b/proto/ParameterServerConfig.proto
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+
+package paddle;
+
+
+/**
+ * Configuration structure for ParameterClient2.
+ */
+message ParameterClientConfig {
+  required int32 trainer_id = 1;
+}
+
+/**
+ * Configuration structure for ParameterServer2.
+ */
+message ParameterServerConfig {
+  // Number of ports used for sending dense parameters.
+  // The following ports on the parameter server are used
+  // for dense parameters: [port, port+ports_num-1]
+  required int32 ports_num = 1 [default = 1];
+  // Number of ports used for sending sparse parameters.
+  // The following ports on the parameter server are used
+  // for sparse parameters:
+  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
+  required int32 ports_num_for_sparse = 2 [default = 0];
+  // Network device names for pservers
+  required string nics = 3 [default = "xgbe0,xgbe1"];
+  required string rdma_tcp = 4 [default = "tcp"];
+  // Listening port for pserver
+  required int32 port = 5 [default = 20134];
+  // Number of gradient servers
+  required int32 num_gradient_servers = 6 [default = 1];
+  // Number of threads for sync op exec
+  required int32 pserver_num_threads = 7 [default = 1];
+  // Minimum value allowed for config_.async_lagged_grad_discard_ratio()
+  required double async_lagged_ratio_min = 8 [default = 1.0];
+  // Used as the default value when async_lagged_grad_discard_ratio
+  // is not set in trainer_config.conf
+  required double async_lagged_ratio_default = 9 [default = 1.5];
+}
\ No newline at end of file
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1cda4762eb2a55175d6c9faee98aaeaa1f763890..bfa19d5ecc84a08614852c4c93de5b5793c1be9c 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,11 +4,13 @@ set(OUTPUT_DIR
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
+file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)  # every .py under paddle/v2
 
 set(PY_FILES paddle/__init__.py
              ${TRAINER_PY_FILES}
              ${HELPERS_PY_FILES}
-             ${UTILS_PY_FILES})
+             ${UTILS_PY_FILES}
+             ${V2_PY_FILES})
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
@@ -22,6 +24,12 @@ add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
 add_subdirectory(paddle/trainer_config_helpers/tests)
+if (WITH_SWIG_PY)
+  # enable v2 API unittest only when paddle swig api is compiled
+  add_subdirectory(paddle/v2/tests)
+  add_subdirectory(paddle/v2/reader/tests)
+  add_subdirectory(paddle/v2/plot/tests)
+endif()
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index bd24c68b6fe88eab03c814f8cac70db3880316f4..a36f0ebfdcb9f90f54ba2d688f9f4bcee2939ef3 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -45,6 +45,23 @@ class CacheType(object):
 
 
 class InputType(object):
+    """
+    InputType is the base class for paddle input types.
+
+    ..  note::
+
+        this is a base class, and should never be used by user.
+
+    :param dim: dimension of input. If the input is an integer, it means the
+                value range. Otherwise, it means the size of layer.
+    :type dim: int
+    :param seq_type: sequence type of input. 0 means it is not a sequence. 1
+                     means it is a variable length sequence. 2 means it is a
+                     nested sequence.
+    :type seq_type: int
+    :param type: data type of input.
+    :type type: int
+    """
     __slots__ = ['dim', 'seq_type', 'type']
 
     def __init__(self, dim, seq_type, tp):
@@ -54,19 +71,63 @@ class InputType(object):
 
 
 def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Dense Vector. It means the input feature is dense float vector. For example,
+    if the input is an image with 28*28 pixels, the input of Paddle neural
+    network should be a dense vector with dimension 784.
+
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.Dense)
 
 
 def sparse_non_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Sparse binary vector. It means the input feature is a sparse vector and
+    every element in this vector is either zero or one.
+
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.SparseNonValue)
 
 
 def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Sparse vector. It means the input feature is a sparse vector. Most of the
+    elements in this vector are zero, others could be any float value.
+
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.SparseValue)
 
 
-def index_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    return InputType(dim, seq_type, DataType.Index)
+def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Data type of integer.
+
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :param value_range: range of this integer.
+    :type value_range: int
+    :return: An input type object
+    :rtype: InputType
+    """
+    return InputType(value_range, seq_type, DataType.Index)
 
 
 dense_vector = dense_slot
@@ -76,6 +137,14 @@ integer_value = index_slot
 
 
 def dense_vector_sequence(dim):
+    """
+    Data type of a sequence of dense vector.
+
+    :param dim: dimension of dense vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return dense_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -84,6 +153,15 @@ def dense_vector_sub_sequence(dim):
 
 
 def sparse_binary_vector_sequence(dim):
+    """
+    Data type of a sequence of sparse vectors, in which every element is
+    either zero or one.
+
+    :param dim: dimension of sparse vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -92,6 +170,15 @@ def sparse_binary_vector_sub_sequence(dim):
 
 
 def sparse_vector_sequence(dim):
+    """
+    Data type of a sequence of sparse vectors, in which most elements are
+    zero and the others can be any float value.
+
+    :param dim: dimension of sparse vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -99,8 +186,14 @@ def sparse_vector_sub_sequence(dim):
     return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def integer_value_sequence(dim):
-    return integer_value(dim, seq_type=SequenceType.SEQUENCE)
+def integer_value_sequence(value_range):
+    """
+    Data type of a sequence of integer.
+
+    :param value_range: range of each element.
+    :type value_range: int
+    """
+    return integer_value(value_range, seq_type=SequenceType.SEQUENCE)
 
 
 def integer_value_sub_sequence(dim):
@@ -177,7 +270,7 @@ class CheckWrapper(object):
             assert isinstance(each, collections.Sequence)
             for d in each:
                 assert isinstance(d, float)
-            assert len(each, input_type.dim)
+            assert len(each) == input_type.dim
         elif input_type.type == DataType.Index:
             assert isinstance(each, int)
             assert each < input_type.dim
@@ -211,7 +304,7 @@ class CheckInputTypeWrapper(object):
     def __call__(self, obj, filename):
         for items in self.generator(obj, filename):
             try:
-                # dict type is required for input_types when item is dict type 
+                # dict type is required for input_types when item is dict type
                 assert (isinstance(items, dict) and \
                         not isinstance(self.input_types, dict))==False
                 yield items
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 674b5ac58b6febd914cb36c75356d8aa70a908b1..57d30b088b873a94a11483aea536a9e4f6493129 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -138,14 +138,7 @@ def init_config_environment(
         g_root_submodel=None,
         g_submodel_map={},
         g_submodel_stack=[],
-        g_add_submodel_suffix=False,
-
-        # Whether current layer needs to pass the image height and width.
-        # Default value is true, but if it encounters recurrent_layer_group,
-        # it will be false. The reason is that image is converted to be sequence,
-        # image height will be sequence length, and image width will be feature
-        # length of each timestep.
-        g_pass_height_width=True, ):
+        g_add_submodel_suffix=False, ):
 
     for k, v in locals().iteritems():
         globals()[k] = copy.deepcopy(v)
@@ -493,6 +486,7 @@ class Input(Cfg):
             block_expand=None,
             maxout=None,
             spp=None,
+            pad=None,
             format=None,
             nnz=None,
             is_static=None,
@@ -685,25 +679,17 @@ class ContextProjection(Projection):
 
 
 @config_class
-class ConvProjection(Projection):
-    type = 'conv'
-
+class ConvBaseProjection(Projection):
     def __init__(self,
                  input_layer_name,
                  num_filters=None,
                  conv_conf=None,
                  **xargs):
-        super(ConvProjection, self).__init__(input_layer_name, **xargs)
+        super(ConvBaseProjection, self).__init__(input_layer_name, **xargs)
 
         if num_filters is not None:
             self.proj_conf.num_filters = num_filters
 
-        parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
-                   num_filters)
-        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
-                                     self.proj_conf.conv_conf.output_y * \
-                                     num_filters
-
     def calc_output_size(self, input_layer_config):
         return self.proj_conf.output_size
 
@@ -722,6 +708,48 @@ class ConvProjection(Projection):
         return None
 
 
+@config_class
+class ConvProjection(ConvBaseProjection):
+    type = 'conv'
+
+    def __init__(self,
+                 input_layer_name,
+                 num_filters=None,
+                 conv_conf=None,
+                 **xargs):
+        super(ConvProjection, self).__init__(input_layer_name, num_filters,
+                                             conv_conf, **xargs)
+
+        parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf,
+                   num_filters)
+        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
+                                     self.proj_conf.conv_conf.output_y * \
+                                     num_filters
+
+
+@config_class
+class ConvTransProjection(ConvBaseProjection):
+    type = 'convt'
+
+    def __init__(self,
+                 input_layer_name,
+                 num_filters=None,
+                 conv_conf=None,
+                 **xargs):
+        super(ConvTransProjection, self).__init__(input_layer_name, num_filters,
+                                                  conv_conf, **xargs)
+
+        parse_conv(
+            conv_conf,
+            self.input_layer_name,
+            self.proj_conf.conv_conf,
+            num_filters,
+            trans=True)
+        self.proj_conf.output_size = self.proj_conf.conv_conf.img_size_y * \
+                                     self.proj_conf.conv_conf.img_size * \
+                                     num_filters
+
+
 # Define a operator for mixed layer
 @config_class
 class Operator(Cfg):
@@ -788,6 +816,36 @@ class ConvOperator(Operator):
         return self.operator_conf.output_size
 
 
+@config_class
+class ConvTransOperator(Operator):
+    type = 'convt'
+
+    def __init__(self,
+                 input_layer_names,
+                 num_filters=None,
+                 conv_conf=None,
+                 **xargs):
+        super(ConvTransOperator, self).__init__(input_layer_names, **xargs)
+        # Validate arity before input_layer_names[0] is dereferenced below.
+        config_assert(len(input_layer_names) == 2, "Conv is binary operator")
+        if num_filters is not None:
+            self.operator_conf.num_filters = num_filters
+
+        parse_conv(
+            conv_conf,
+            MakeLayerNameInSubmodel(input_layer_names[0]),
+            self.operator_conf.conv_conf,
+            num_filters,
+            trans=True)
+        self.operator_conf.output_size = \
+            self.operator_conf.conv_conf.img_size * \
+            self.operator_conf.conv_conf.img_size_y * \
+            num_filters
+
+    def calc_output_size(self, input_sizes):
+        return self.operator_conf.output_size
+
+
 # please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Conv(Cfg):
@@ -829,7 +887,6 @@ class Pool(Cfg):
             channels,
             size_x,
             size_y=None,
-            img_width=None,
             start=None,
             stride=None,  # 1 by defalut in protobuf
             stride_y=None,
@@ -844,6 +901,12 @@ class SpatialPyramidPool(Cfg):
         self.add_keys(locals())
 
 
+@config_class
+class Pad(Cfg):
+    def __init__(self, channels, pad_c, pad_h, pad_w):
+        self.add_keys(locals())
+
+
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -887,11 +950,11 @@ class MaxOut(Cfg):
         self.add_keys(locals())
 
 
-def DataBase(async_load_data=False,
-             constant_slots=None,
-             data_ratio=1,
-             is_main_data=True,
-             usage_ratio=None):
+def create_data_config_proto(async_load_data=False,
+                             constant_slots=None,
+                             data_ratio=1,
+                             is_main_data=True,
+                             usage_ratio=None):
     # default: all sub dataproviders are treat as "main data".
     # see proto/DataConfig.proto for is_main_data
     data_config = DataConfig()
@@ -917,7 +980,7 @@ def SimpleData(files=None,
                context_len=None,
                buffer_capacity=None,
                **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = 'simple'
     data_config.files = files
     data_config.feat_dim = feat_dim
@@ -939,7 +1002,7 @@ def PyData(files=None,
            constant_slots=None,
            load_thread_num=None,
            **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = 'py'
     if load_data_module in g_py_module_name_list:
 
@@ -990,7 +1053,7 @@ def ProtoData(files=None,
               constant_slots=None,
               load_thread_num=None,
               **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     if type is None:
         data_config.type = 'proto'
     else:
@@ -1029,7 +1092,7 @@ def Data(type,
          buffer_capacity=None,
          **xargs):
 
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = type
     data_config.files = files
     data_config.feat_dim = feat_dim
@@ -1102,7 +1165,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
         'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
@@ -1127,10 +1190,10 @@ def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
     pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
                                          pool_conf.padding, pool_conf.stride,
-                                         False)
+                                         not ceil_mode)
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
-                                         pool_conf.stride_y, False)
+                                         pool_conf.stride_y, not ceil_mode)
 
 
 def parse_spp(spp, input_layer_name, spp_conf):
@@ -1150,9 +1213,11 @@ def parse_image(image, input_layer_name, image_conf):
 
 def parse_norm(norm, input_layer_name, norm_conf):
     norm_conf.norm_type = norm.norm_type
-    config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'],
-                  "norm-type %s is not in [rnorm, 'cmrnorm-projection']" %
-                  norm.norm_type)
+    config_assert(
+        norm.norm_type in
+        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
+        % norm.norm_type)
     norm_conf.channels = norm.channels
     norm_conf.size = norm.size
     norm_conf.scale = norm.scale
@@ -1247,6 +1312,7 @@ def Evaluator(
         dict_file=None,
         result_file=None,
         num_results=None,
+        top_k=None,
         delimited=None,
         excluded_chunk_types=None, ):
     evaluator = g_config.model_config.evaluators.add()
@@ -1274,6 +1340,8 @@ def Evaluator(
         evaluator.result_file = result_file
     if num_results is not None:
         evaluator.num_results = num_results
+    if top_k is not None:
+        evaluator.top_k = top_k
     if delimited is not None:
         evaluator.delimited = delimited
 
@@ -1362,12 +1430,6 @@ class LayerBase(object):
 
         g_current_submodel.layer_names.append(self.config.name)
 
-        if self.config.type != 'data' and g_pass_height_width:
-            height = self.get_input_layer(0).height
-            width = self.get_input_layer(0).width
-            if height and width:
-                self.set_layer_height_width(height, width)
-
     def get_input_layer(self, input_index):
         return g_layer_map[self.config.inputs[input_index].input_layer_name]
 
@@ -1763,8 +1825,17 @@ class ConvTransLayerBase(LayerBase):
         use_gpu = int(g_command_config_args.get("use_gpu", 0))
         parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
 
-        # cudnn_convt has not been implemented so use exconvt only
-        self.layer_type = "exconvt"
+        # Automatically select cudnn_convt for GPU and exconvt for CPU
+        # unless type=exconvt is set explicitly; the user can still
+        # specify exconvt or cudnn_convt manually.
+        if self.layer_type == "cudnn_convt":
+            config_assert(use_gpu, "cudnn_convt only support GPU")
+
+        if (use_gpu == 1 and self.layer_type != "exconvt" and
+            (parallel_nn == 0 or self.config.device > -1)):
+            self.layer_type = "cudnn_convt"
+        else:
+            self.layer_type = "exconvt"
         # need to specify layer in config
         self.config.type = self.layer_type
 
@@ -1781,10 +1852,9 @@ class ConvTransLayerBase(LayerBase):
                 trans=True)
             conv_conf = self.config.inputs[input_index].conv_conf
             psize = self.calc_parameter_size(conv_conf)
-            print("output size for %s is %d " % (name, conv_conf.output_x))
             self.create_input_parameter(input_index, psize)
-            self.set_layer_size(
-                (conv_conf.img_size**2) * self.config.num_filters)
+            self.set_cnn_layer(name, conv_conf.img_size_y, conv_conf.img_size,
+                               self.config.num_filters)
 
         psize = self.config.size
         if shared_biases:
@@ -1801,11 +1871,15 @@ class ConvTransLayer(ConvTransLayerBase):
     layer_type = 'exconvt'
 
 
+@config_layer('cudnn_convt')
+class CudnnConvTransLayer(ConvTransLayerBase):  # distinct name: keep ConvTransLayer bound to 'exconvt'
+    layer_type = 'cudnn_convt'  # GPU-only (asserted in ConvTransLayerBase)
+
+
 @config_layer('norm')
 class NormLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(NormLayer, self).__init__(
-            name, 'norm', 0, inputs=inputs, device=device)
+    def __init__(self, name, inputs, **xargs):
+        super(NormLayer, self).__init__(name, 'norm', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             norm_conf = self.config.inputs[input_index].norm_conf
@@ -1813,27 +1887,29 @@ class NormLayer(LayerBase):
                        norm_conf)
             self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                                norm_conf.channels, False)
+            if norm_conf.norm_type == "cross-channel-norm":
+                self.create_input_parameter(0, norm_conf.channels,
+                                            [norm_conf.channels, 1])
 
 
 @config_layer('pool')
 class PoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
-        super(PoolLayer, self).__init__(
-            name, 'pool', 0, inputs=inputs, device=device)
+    def __init__(self, name, inputs, ceil_mode=True, **xargs):
+        super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf)
+                       pool_conf, ceil_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
 
 @config_layer('spp')
 class SpatialPyramidPoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(SpatialPyramidPoolLayer, self).__init__(
-            name, 'spp', 0, inputs=inputs, device=device)
+            name, 'spp', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             spp_conf = self.config.inputs[input_index].spp_conf
@@ -1842,6 +1918,25 @@ class SpatialPyramidPoolLayer(LayerBase):
             self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
 
 
+@config_layer('pad')
+class PadLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(PadLayer, self).__init__(name, 'pad', 0, inputs=inputs, **xargs)
+        # Copy the channel/height/width padding of input 0 into its pad_conf.
+        pad = self.inputs[0].pad
+        pad_conf = self.config.inputs[0].pad_conf
+        for axis in ('pad_c', 'pad_h', 'pad_w'):
+            getattr(pad_conf, axis).extend(getattr(pad, axis))
+        # After parse_image, every output dimension is the input dimension
+        # plus the two padding entries given for that axis.
+        parse_image(pad, self.get_input_layer(0).name, pad_conf.image_conf)
+        depth = pad.channels + pad.pad_c[0] + pad.pad_c[1]
+        rows = pad_conf.image_conf.img_size_y + pad.pad_h[0] + pad.pad_h[1]
+        cols = pad_conf.image_conf.img_size + pad.pad_w[0] + pad.pad_w[1]
+        self.set_cnn_layer(name, rows, cols, depth)
+        self.config.size = depth * rows * cols
+
+
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
     layer_type = 'batch_norm'
@@ -1851,7 +1946,6 @@ class BatchNormLayer(LayerBase):
                  inputs,
                  active_type="linear",
                  bias=True,
-                 device=None,
                  use_global_stats=True,
                  moving_average_fraction=0.9,
                  batch_norm_type=None,
@@ -1893,7 +1987,6 @@ class BatchNormLayer(LayerBase):
             0,
             active_type=active_type,
             inputs=inputs,
-            device=device,
             **xargs)
 
         if use_global_stats is not None:
@@ -1905,8 +1998,8 @@ class BatchNormLayer(LayerBase):
         image_conf = self.config.inputs[0].image_conf
         parse_image(self.inputs[0].image, input_layer.name, image_conf)
 
-        # Only pass the width and height of input to batch_norm layer 
-        # when either of it is non-zero. 
+        # Only pass the width and height of input to batch_norm layer
+        # when either of it is non-zero.
         if input_layer.width != 0 or input_layer.height != 0:
             self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
                                image_conf.channels, False)
@@ -1927,9 +2020,9 @@ class BatchNormLayer(LayerBase):
 
 @config_layer('trans')
 class TransLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(TransLayer, self).__init__(
-            name, 'trans', 0, inputs=inputs, device=device)
+            name, 'trans', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
             'TransLayer must have one and only one input')
@@ -1938,19 +2031,31 @@ class TransLayer(LayerBase):
 
 @config_layer('resize')
 class ResizeLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None):
+    def __init__(self, name, size, inputs, **xargs):
         super(ResizeLayer, self).__init__(
-            name, 'resize', size=size, inputs=inputs, device=device)
+            name, 'resize', size=size, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 1,
             'ResizeLayer must have one and only one input')
 
 
+@config_layer('rotate')
+class RotateLayer(LayerBase):
+    # Takes exactly one input; extra layer attributes are forwarded via **xargs.
+    def __init__(self, name, inputs, height, width, device=None, **xargs):
+        super(RotateLayer, self).__init__(
+            name, 'rotate', 0, inputs=inputs, device=device, **xargs)
+        config_assert(len(self.inputs) == 1,
+                      'RotateLayer must have one and only one input')
+        self.set_layer_height_width(height, width)
+        self.set_layer_size(self.get_input_layer(0).size)  # same size as input
+
+
 @config_layer('blockexpand')
 class BlockExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(BlockExpandLayer, self).__init__(
-            name, 'blockexpand', 0, inputs=inputs, device=device)
+            name, 'blockexpand', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             parse_block_expand(
@@ -2001,6 +2106,7 @@ define_cost('MultiBinaryLabelCrossEntropy', 'multi_binary_label_cross_entropy')
 define_cost('SoftBinaryClassCrossEntropy', 'soft_binary_class_cross_entropy')
 define_cost('HuberTwoClass', 'huber')
 define_cost('SumCost', 'sum_cost')
+define_cost('SmoothL1Cost', 'smooth_l1')
 
 
 @config_layer('hsigmoid')
@@ -2186,7 +2292,10 @@ def Link(
 
 # memory for recurrent layer group.
 # *name* and *size* are actual layer's name and size.
-# will return name of the memory,
+# If *name* is None, *memory_name* must be provided, and SetMemoryInput()
+# must be called later to specify the layer which this memory remembers.
+#
+# return the name of the memory,
 # use this name if you assign the memory as other layer's input
 #
 # boot frame of memory is zeroed by default,
@@ -2198,15 +2307,18 @@ def Link(
 # can only be initailized by a *boot_layer* which is a sequence.
 #
 @config_func
-def Memory(
-        name,
-        size,
-        is_sequence=False,
-        boot_layer=None,
-        boot_bias=False,
-        boot_bias_active_type="",
-        boot_with_const_id=None, ):
-    agent_name = name + "+delay1"
+def Memory(name,
+           size,
+           is_sequence=False,
+           boot_layer=None,
+           boot_bias=False,
+           boot_bias_active_type="",
+           boot_with_const_id=None,
+           memory_name=None):
+    if not memory_name:
+        config_assert(name is not None, "name needs cannot be None")
+        memory_name = name + "+delay1"
+    agent_name = memory_name
     if is_sequence:
         agent_layer = SequenceAgentLayer(agent_name, size)
     else:
@@ -2214,7 +2326,8 @@ def Memory(
     config_assert(g_current_submodel.is_recurrent_layer_group,
                   'Memory should be used in recurrent layer group only')
     memory = g_current_submodel.memories.add()
-    memory.layer_name = MakeLayerNameInSubmodel(name)
+    if name is not None:
+        memory.layer_name = MakeLayerNameInSubmodel(name)
     memory.link_name = MakeLayerNameInSubmodel(agent_name)
     memory.is_sequence = is_sequence
     options = sum((boot_layer is not None, bool(boot_bias),
@@ -2238,6 +2351,17 @@ def Memory(
     return agent_name
 
 
+@config_func
+def SetMemoryInput(memory_name, layer_name):
+    # Point the memory whose link name matches *memory_name* at *layer_name*.
+    link, target = map(MakeLayerNameInSubmodel, (memory_name, layer_name))
+    hits = [m for m in g_current_submodel.memories if m.link_name == link]
+    if hits:
+        hits[0].layer_name = target
+    else:
+        logger.fatal("Nonexistent memory name: " + link)
+
+
 # Generator for recurrent layer group, to use it:
 #  1. define a id layer as output of layer group
 #  2. define a memory of this id layer, and assign a boot id(begin of sequence)
@@ -2265,14 +2389,9 @@ def Generator(
 
 @config_layer('expand')
 class ExpandLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, trans_type='non-seq', bias=False, **xargs):
         super(ExpandLayer, self).__init__(
-            name, 'expand', 0, inputs=inputs, device=device)
+            name, 'expand', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 2, 'ExpandLayer takes 2 and only 2 inputs')
         self.config.trans_type = trans_type
@@ -2303,11 +2422,10 @@ class MaxLayer(LayerBase):
                  inputs,
                  trans_type='non-seq',
                  active_type='linear',
-                 device=None,
                  bias=False,
-                 output_max_index=None):
-        super(MaxLayer, self).__init__(
-            name, 'max', 0, inputs=inputs, device=device)
+                 output_max_index=None,
+                 **xargs):
+        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
         self.config.trans_type = trans_type
         self.config.active_type = active_type
@@ -2354,59 +2472,57 @@ class SequenceLastInstanceLayer(LayerBase):
                  inputs,
                  active_type='linear',
                  trans_type='non-seq',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 stride=-1,
+                 **xargs):
         super(SequenceLastInstanceLayer, self).__init__(
             name,
             'seqlastins',
             0,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
+        if trans_type == 'seq':
+            config_assert(stride == -1, 'subseq does not support stride window')
         self.config.trans_type = trans_type
-        for input_index in xrange(len(self.inputs)):
-            input_layer = self.get_input_layer(input_index)
-            self.set_layer_size(input_layer.size)
+        self.config.seq_pool_stride = stride
+        self.set_layer_size(self.get_input_layer(0).size)
         self.create_bias_parameter(bias, self.config.size)
 
 
 @config_layer('seqfirstins')
 class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
-    def __init__(
-            self,
-            name,
-            inputs,
-            active_type='linear',
-            trans_type='non-seq',
-            device=None,
-            bias=False, ):
+    def __init__(self,
+                 name,
+                 inputs,
+                 active_type='linear',
+                 trans_type='non-seq',
+                 bias=False,
+                 stride=-1,
+                 **xargs):
         super(SequenceFirstInstanceLayer, self).__init__(
             name,
             inputs=inputs,
             active_type=active_type,
-            device=device,
-            bias=bias)
-        self.config.trans_type = trans_type
+            trans_type=trans_type,
+            bias=bias,
+            stride=stride,
+            **xargs)
         self.config.select_first = True
 
 
 @config_layer('seqconcat')
 class SequenceConcatLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 active_type='linear',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
         super(SequenceConcatLayer, self).__init__(
             name,
             'seqconcat',
             0,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
         for input_index in xrange(len(self.inputs)):
@@ -2422,15 +2538,15 @@ class SequenceReshapeLayer(LayerBase):
                  size,
                  inputs,
                  active_type='linear',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(SequenceReshapeLayer, self).__init__(
             name,
             'seqreshape',
             size,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
         self.set_layer_size(size)
@@ -2439,19 +2555,9 @@ class SequenceReshapeLayer(LayerBase):
 
 @config_layer('subseq')
 class SubSequenceLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 active_type='linear',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
         super(SubSequenceLayer, self).__init__(
-            name,
-            'subseq',
-            0,
-            inputs=inputs,
-            device=device,
-            active_type=active_type)
+            name, 'subseq', 0, inputs=inputs, active_type=active_type, **xargs)
         config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
         input_layer0 = self.get_input_layer(0)
         size = input_layer0.size
@@ -2608,15 +2714,10 @@ class AverageLayer(LayerBase):
                  average_strategy='average',
                  trans_type='non-seq',
                  active_type='linear',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(AverageLayer, self).__init__(
-            name,
-            'average',
-            0,
-            inputs=inputs,
-            device=device,
-            active_type=active_type)
+            name, 'average', 0, inputs=inputs, active_type=active_type, **xargs)
         self.config.average_strategy = average_strategy
         self.config.trans_type = trans_type
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
@@ -2628,7 +2729,7 @@ class AverageLayer(LayerBase):
 
 @config_layer('cos')
 class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=5, device=None):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
         super(CosSimLayer, self).__init__(
             name, 'cos', 1, inputs=inputs, device=device)
         config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
@@ -2640,9 +2741,9 @@ class CosSimLayer(LayerBase):
 
 @config_layer('tensor')
 class TensorLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None, bias=True, **xargs):
+    def __init__(self, name, size, inputs, bias=True, **xargs):
         super(TensorLayer, self).__init__(
-            name, 'tensor', size, inputs=inputs, device=device, **xargs)
+            name, 'tensor', size, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 2, 'TensorLayer must have 2 inputs')
         config_assert(size > 0, 'size must be positive')
         config_assert(inputs[1].parameter_name == None,
@@ -2993,7 +3094,7 @@ class CRFLayer(LayerBase):
         super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
         config_assert(2 <= len(self.inputs) <= 3,
                       'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
         self.config.coeff = coeff
 
 
@@ -3015,7 +3116,7 @@ class CRFDecodingLayer(LayerBase):
         config_assert(
             len(self.inputs) <= 2,
             'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
 
 
 @config_layer('ctc')
@@ -3050,8 +3151,6 @@ class WarpCTCLayer(LayerBase):
 @config_layer('recurrent_layer_group')
 class RecurrentLayerGroup(LayerBase):
     def __init__(self, name, device=None):
-        global g_pass_height_width
-        g_pass_height_width = False
         super(RecurrentLayerGroup, self).__init__(
             name, 'recurrent_layer_group', 0, inputs=[], device=device)
 
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
old mode 100644
new mode 100755
index edca279dcadef42243cb3fc00366cec90cbc69bf..ef92107a1093d2ec2b2a41677e964fdaa60ac829
--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
@@ -15,10 +15,13 @@
 # recurrent_units.py
 # Version 2.0
 #
-# Some recurrent units can be used in recurrent layer group, 
+# Some recurrent units can be used in recurrent layer group,
 #   to use these units, import this module in your config_file:
-#     import trainer.recurrent_units 
-# 
+#     import trainer.recurrent_units
+#
+# The modules in this file are DEPRECATED.
+# If you would like to use lstm/gru
+# please use the functions defined in paddle.trainer_config_helpers.
 
 from paddle.trainer.config_parser import *
 
@@ -26,7 +29,7 @@ from paddle.trainer.config_parser import *
 # long short term memory, can be used in recurrent machine
 # *inputs* must be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two LstmRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 def LstmRecurrentUnit(name,
@@ -194,7 +197,7 @@ def LstmRecurrentLayerGroup(name,
 # gated recurrent unit, can be used in recurrent machine
 # *inputs* should be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two GatedRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index bf0208834600fef3bcf1b0496da8f5f77aea44c5..7ae9e5cb3050fa6f70fa84785a1ddbdc68c70235 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -196,7 +196,7 @@ class ExtraLayerAttribute(object):
                       <https://www.cs.toronto.edu/~hinton/absps/
                       JMLRdropout.pdf>`_.
     :type drop_rate: float
-    :param device: device ID of layer. device=-1, use CPU. device>0, use GPU.
+    :param device: device ID of layer. device=-1, use CPU. device>=0, use GPU.
                    The details allocation in parallel_nn please refer to `here
                    <http://www.paddlepaddle.org/doc/ui/cmd_argument/
                    use_case.html#case-2-specify-layers-in-different-devices>`_.
@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object):
                  drop_rate=None,
                  device=None):
         self.attr = dict()
-        if isinstance(error_clipping_threshold, float):
-            assert error_clipping_threshold > 0
-            self.attr["error_clipping_threshold"] = error_clipping_threshold
-
-        if isinstance(drop_rate, float):
-            assert drop_rate > 0
+        if error_clipping_threshold is not None:
+            error_clipping_threshold = float(error_clipping_threshold)
+            if error_clipping_threshold < 0:
+                raise ValueError("Error clipping must > 0")
+            self.attr['error_clipping_threshold'] = error_clipping_threshold
+        if drop_rate is not None:
+            drop_rate = float(drop_rate)
+            if drop_rate < 0:
+                raise ValueError("Dropout rate must > 0")
             self.attr["drop_rate"] = drop_rate
 
         if isinstance(device, int):
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index 622b4fc25ccff397cd3115db316870f328466fba..ab9a2562dcccb394c0b24741ceeb10061e40cb9a 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -58,8 +58,8 @@ def define_py_data_source(file_list,
     :param obj: python object name. May be a function name if using
                 PyDataProviderWrapper.
     :type obj: basestring
-    :param args: The best practice is using dict to pass arguments into 
-                 DataProvider, and use :code:`@init_hook_wrapper` to 
+    :param args: The best practice is using dict to pass arguments into
+                 DataProvider, and use :code:`@init_hook_wrapper` to
                  receive arguments.
     :type args: string or picklable object
     :param async: Load Data asynchronously or not.
@@ -98,7 +98,7 @@ def define_py_data_sources(train_list,
     The annotation is almost the same as define_py_data_sources2, except that
     it can specific train_async and data_cls.
 
-    :param data_cls: 
+    :param data_cls:
     :param train_list: Train list name.
     :type train_list: basestring
     :param test_list: Test list name.
@@ -111,8 +111,8 @@ def define_py_data_sources(train_list,
                 a tuple or list to this argument.
     :type obj: basestring or tuple or list
     :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive 
-                 arguments. If train and test is different, then pass a tuple 
+                 DataProvider, and use :code:`@init_hook_wrapper` to receive
+                 arguments. If train and test is different, then pass a tuple
                  or list to this argument.
     :type args: string or picklable object or list or tuple.
     :param train_async: Is training data load asynchronously or not.
@@ -163,12 +163,12 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
 
     ..  code-block:: python
 
-        define_py_data_sources2(train_list="train.list", 
-                                test_list="test.list", 
+        define_py_data_sources2(train_list="train.list",
+                                test_list="test.list",
                                 module="data_provider"
                                 # if train/test use different configurations,
                                 # obj=["process_train", "process_test"]
-                                obj="process", 
+                                obj="process",
                                 args={"dictionary": dict_name})
 
     The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
@@ -185,8 +185,8 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
                 a tuple or list to this argument.
     :type obj: basestring or tuple or list
     :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive 
-                 arguments. If train and test is different, then pass a tuple 
+                 DataProvider, and use :code:`@init_hook_wrapper` to receive
+                 arguments. If train and test is different, then pass a tuple
                  or list to this argument.
     :type args: string or picklable object or list or tuple.
     :return: None
@@ -195,13 +195,13 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
 
     def py_data2(files, load_data_module, load_data_object, load_data_args,
                  **kwargs):
-        data = DataBase()
+        data = create_data_config_proto()
         data.type = 'py2'
         data.files = files
         data.load_data_module = load_data_module
         data.load_data_object = load_data_object
         data.load_data_args = load_data_args
-        data.async_load_data = True
+        data.async_load_data = False
         return data
 
     define_py_data_sources(
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
index ad3efcbf369411b9c42b2a32ed05b04f86bf7de6..69d860d9dab9c1d90e4d6a6940d66fcb551f6eb6 100644
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -52,6 +52,10 @@ def wrap_param_default(param_names=None,
                     kwargs[name] = default_factory(func)
             return func(*args, **kwargs)
 
+        if hasattr(func, 'argspec'):
+            __wrapper__.argspec = func.argspec
+        else:
+            __wrapper__.argspec = inspect.getargspec(func)
         return __wrapper__
 
     return __impl__
@@ -93,13 +97,13 @@ def reset_hook():
 register_parse_config_hook(reset_hook)
 
 
-def wrap_name_default(name_prefix=None):
+def wrap_name_default(name_prefix=None, name_param="name"):
     """
     Decorator to set "name" arguments default to "{name_prefix}_{invoke_count}".
 
     ..  code:: python
 
-        @default_name("some_name")
+        @wrap_name_default("some_name")
         def func(name=None):
             print name      # name will never be None. If name is not set,
                             # name will be "some_name_%d"
@@ -111,7 +115,7 @@ def wrap_name_default(name_prefix=None):
     """
     factory = DefaultNameFactory(name_prefix)
     _name_factories.append(factory)
-    return wrap_param_default(["name"], factory)
+    return wrap_param_default([name_param], factory)
 
 
 def wrap_param_attr_default(param_names=None, default_factory=None):
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index bd247ea9af9d8dfb2d476cdc62638bd65c11add5..567521ee9dbadb7a2502cfb9972ef0940e1e410a 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -71,6 +71,7 @@ def evaluator_base(
         result_file=None,
         num_results=None,
         delimited=None,
+        top_k=None,
         excluded_chunk_types=None, ):
     """
     Evaluator will evaluate the network status while training/testing.
@@ -104,12 +105,15 @@ def evaluator_base(
     :param weight: An input layer which is a weight for each sample.
                    Each evaluator may calculate differently to use this weight.
     :type weight: LayerOutput.
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     """
     # inputs type assertions.
     assert classification_threshold is None or isinstance(
         classification_threshold, float)
     assert positive_label is None or isinstance(positive_label, int)
     assert num_results is None or isinstance(num_results, int)
+    assert top_k is None or isinstance(top_k, int)
 
     if not isinstance(input, list):
         input = [input]
@@ -130,6 +134,8 @@ def evaluator_base(
         dict_file=dict_file,
         result_file=result_file,
         delimited=delimited,
+        num_results=num_results,
+        top_k=top_k,
         excluded_chunk_types=excluded_chunk_types, )
 
 
@@ -139,6 +145,7 @@ def classification_error_evaluator(input,
                                    label,
                                    name=None,
                                    weight=None,
+                                   top_k=None,
                                    threshold=None):
     """
     Classification Error Evaluator. It will print error rate for classification.
@@ -167,6 +174,8 @@ def classification_error_evaluator(input,
                   then means not set weight. The larger weight it is, the more
                   important this sample is.
     :type weight: LayerOutput
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     :param threshold: The classification threshold.
     :type threshold: float
     :return: None.
@@ -178,6 +187,7 @@ def classification_error_evaluator(input,
         input=input,
         label=label,
         weight=weight,
+        top_k=top_k,
         classification_threshold=threshold, )
 
 
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
index 2d9e36f2b0d379d907634208a45c69efa9dbba3d..544b443825393c9a31c0375724d4ca63dac5c5eb 100644
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ b/python/paddle/trainer_config_helpers/layer_math.py
@@ -39,6 +39,7 @@ register_unary_math_op('abs', act.AbsActivation())
 register_unary_math_op('sigmoid', act.SigmoidActivation())
 register_unary_math_op('tanh', act.TanhActivation())
 register_unary_math_op('square', act.SquareActivation())
+register_unary_math_op('relu', act.ReluActivation())
 
 
 def add(layeroutput, other):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
old mode 100644
new mode 100755
index 9b6e5774bc82dc05e14a2565fa9cce98764adf04..2af7c9c9c4487b6123ffe91dd1f920aad31df763
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -14,10 +14,11 @@
 
 import functools
 import collections
+import inspect
 
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
-    ReluActivation, IdentityActivation, SoftmaxActivation
+    ReluActivation, IdentityActivation, SoftmaxActivation, BaseActivation
 from .evaluators import *
 from .poolings import MaxPooling, AvgPooling, BasePoolingType
 from .attrs import *
@@ -37,6 +38,7 @@ __all__ = [
     "dotmul_projection",
     "dotmul_operator",
     "repeat_layer",
+    "seq_reshape_layer",
     "table_projection",
     "mixed_layer",
     "data_layer",
@@ -50,6 +52,7 @@ __all__ = [
     "cos_sim",
     "hsigmoid",
     "conv_projection",
+    "mse_cost",
     "regression_cost",
     'classification_cost',
     "LayerOutput",
@@ -59,6 +62,7 @@ __all__ = [
     'img_cmrnorm_layer',
     'addto_layer',
     'concat_layer',
+    'seq_concat_layer',
     'lstm_step_layer',
     'recurrent_group',
     'memory',
@@ -70,6 +74,7 @@ __all__ = [
     'interpolation_layer',
     'bilinear_interp_layer',
     'trans_layer',
+    'rotate_layer',
     'sum_to_one_norm_layer',
     'get_output_layer',
     'LayerType',
@@ -79,6 +84,7 @@ __all__ = [
     'GeneratedInput',
     'SubsequenceInput',
     'gru_step_layer',
+    'gru_step_naive_layer',
     'recurrent_layer',
     'BaseGeneratedInput',
     'conv_operator',
@@ -107,7 +113,12 @@ __all__ = [
     'out_prod_layer',
     'print_layer',
     'priorbox_layer',
+    'cross_channel_norm_layer',
     'spp_layer',
+    'pad_layer',
+    'eos_layer',
+    'smooth_l1_cost',
+    'layer_support',
 ]
 
 
@@ -122,6 +133,7 @@ class LayerType(object):
     GRUMEMORY = "gated_recurrent"
     SEQUENCE_LAST_INSTANCE = "seqlastins"
     SEQUENCE_FIRST_INSTANCE = "seqfirstins"
+    SEQUENCE_RESHAPE = "seqreshape"
     POOLING_MAX = "max"
     POOLING_AVG = 'average'
     FC_LAYER = "fc"
@@ -142,6 +154,7 @@ class LayerType(object):
 
     CONCAT_LAYER = 'concat'
     CONCAT_PROJ_LAYER = 'concat2'
+    SEQUENCE_CONCAT_LAYER = 'seqconcat'
 
     LSTM_STEP_LAYER = 'lstm_step'
     GRU_STEP_LAYER = 'gru_step'
@@ -153,6 +166,7 @@ class LayerType(object):
     POWER_LAYER = 'power'
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
+    ROTATE_LAYER = 'rotate'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -170,6 +184,7 @@ class LayerType(object):
     BLOCK_EXPAND = "blockexpand"
     MAXOUT = "maxout"
     SPP_LAYER = "spp"
+    PAD_LAYER = "pad"
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -188,6 +203,7 @@ class LayerType(object):
     SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
     MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
     SUM_COST = "sum_cost"
+    SMOOTH_L1 = "smooth_l1"
 
     @staticmethod
     def is_layer_type(type_name):
@@ -276,6 +292,14 @@ class LayerOutput(object):
         """
         assert False, "this method should not be invoked"
 
+    def set_input(self, input):
+        """
+        Set the layer that this memory remembers. Only valid on a memory layer.
+        """
+        assert isinstance(input, LayerOutput)
+        assert self.layer_type == LayerType.MEMORY
+        SetMemoryInput(self.name, input.name)
+
 
 ERROR_CLIPPING = 'error_clipping_threshold'
 DROPOUT = 'drop_rate'
@@ -306,6 +330,11 @@ def layer_support(*attrs):
                     val.check(method.__name__)
             return method(*args, **kwargs)
 
+        if hasattr(method, 'argspec'):
+            wrapper.argspec = method.argspec
+        else:
+            wrapper.argspec = inspect.getargspec(method)
+
         return wrapper
 
     return decorator
@@ -541,7 +570,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
     DotMulOperator takes two inputs and performs element-wise multiplication:
 
     .. math::
-       out.row[i] += scale * (x.row[i] .* y.row[i])
+       out.row[i] += scale * (a.row[i] .* b.row[i])
 
     where :math:`.*` means element-wise multiplication, and
     scale is a config scalar, its default value is one.
@@ -550,7 +579,7 @@ def dotmul_operator(a=None, b=None, scale=1, **kwargs):
 
     .. code-block:: python
 
-       op = dotmul_operator(x=layer1, y=layer2, scale=0.5)
+       op = dotmul_operator(a=layer1, b=layer2, scale=0.5)
 
     :param a: Input layer1
     :type a: LayerOutput
@@ -687,8 +716,9 @@ class MixedLayerType(LayerOutput):
         assert len(self.inputs) == 0
         return self
 
-    def __exit__(self, *args, **kwargs):
-        del args, kwargs  # unused parameter to suppress warning
+    def __exit__(self, exc_type, exc_value, tb):
+        if exc_value is not None:
+            raise exc_value
         assert len(self.inputs) != 0
         ml = MixedLayer(
             name=self.name,
@@ -700,6 +730,7 @@ class MixedLayerType(LayerOutput):
         # update the size which might be computed inside MixedLayer
         # according to the operator's output size
         self.size = ml.config.size
+        self.finalized = True
 
 
 @wrap_name_default("mixed")
@@ -778,17 +809,16 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
 
     ..  code-block:: python
 
-        data = data_layer(name="input",
-                          size=1000)
+        data = data_layer(name="input", size=1000)
 
     :param name: Name of this data layer.
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
     :param height: Height of this data layer, used for image
-    :type size: int|None
+    :type height: int|None
     :param width: Width of this data layer, used for image
-    :type size: int|None
+    :type width: int|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -982,6 +1012,46 @@ def priorbox_layer(input,
         size=size)
 
 
+@wrap_name_default("cross_channel_norm")
+@wrap_param_attr_default()
+def cross_channel_norm_layer(input, name=None, param_attr=None):
+    """
+    Normalize a conv layer's output across the channels of each sample and
+    scale it by a group of trainable factors, one per channel (needed by SSD).
+    wrap_param_attr_default supplies param_attr when the caller leaves it None.
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer. Its num_filters must be set.
+    :type input: LayerOutput
+    :param param_attr: The Parameter Attribute. None means the default one.
+    :type param_attr: ParameterAttribute
+    :return: LayerOutput
+    """
+    assert input.num_filters is not None
+    Layer(
+        name=name,
+        type=LayerType.NORM_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                norm=Norm(
+                    norm_type="cross-channel-norm",
+                    channels=input.num_filters,
+                    size=input.size,
+                    scale=0,
+                    pow=0,
+                    blocked=0),
+                **param_attr.attr)
+        ])
+    return LayerOutput(
+        name,
+        LayerType.NORM_LAYER,
+        parents=[input],
+        num_filters=input.num_filters,
+        size=input.size)
+
+
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)
 @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
@@ -1275,15 +1345,29 @@ def grumemory(input,
 def last_seq(input,
              name=None,
              agg_level=AggregateLevel.EACH_TIMESTEP,
+             stride=-1,
              layer_attr=None):
     """
     Get Last Timestamp Activation of a sequence.
 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and returns the last value of each window as the output, so a long sequence
+    will be shortened. Note that for a sequence with sub-sequences, the default
+    value of stride is -1.
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = last_seq(input=layer)
+
     :param agg_level: Aggregated level
     :param name: Layer name.
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
+    :param stride: The window size. The default is -1.
+    :type stride: int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -1295,11 +1379,15 @@ def last_seq(input,
                        " series information at all. Maybe you want to use"
                        " first_seq instead.")
 
+    if agg_level == AggregateLevel.EACH_SEQUENCE:
+        assert stride == -1
+
     Layer(
         name=name,
         type=LayerType.SEQUENCE_LAST_INSTANCE,
         inputs=[input.name],
         trans_type=agg_level,
+        stride=stride,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -1313,15 +1401,29 @@ def last_seq(input,
 def first_seq(input,
               name=None,
               agg_level=AggregateLevel.EACH_TIMESTEP,
+              stride=-1,
               layer_attr=None):
     """
     Get First Timestamp Activation of a sequence.
 
+    If stride > 0, this layer slides a window whose size is determined by stride,
+    and returns the first value of each window as the output, so a long sequence
+    will be shortened. Note that for a sequence with sub-sequences, the default
+    value of stride is -1.
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = first_seq(input=layer)
+
     :param agg_level: aggregation level
     :param name: Layer name.
     :type name: basestring
     :param input: Input layer name.
     :type input: LayerOutput
+    :param stride: The window size. The default is -1.
+    :type stride: int
     :param layer_attr: extra layer attributes.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -1334,11 +1436,15 @@ def first_seq(input,
                        ' time series information at all. Maybe you want to use'
                        ' last_seq instead.')
 
+    if agg_level == AggregateLevel.EACH_SEQUENCE:
+        assert stride == -1
+
     Layer(
         name=name,
         type=LayerType.SEQUENCE_FIRST_INSTANCE,
         inputs=[input.name],
         trans_type=agg_level,
+        stride=stride,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -1417,7 +1523,7 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       expand = repeat_layer(layer, 4)
+       expand = repeat_layer(input=layer, num_repeats=4)
 
     :param input: Input layer
     :type input: LayerOutput
@@ -1444,6 +1550,61 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
         parents=[input])
 
 
+@wrap_name_default("seqreshape")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_reshape_layer(input,
+                      reshape_size,
+                      act=None,
+                      name=None,
+                      layer_attr=None,
+                      bias_attr=None):
+    """
+    A layer for reshaping the sequence. Assume the input sequence has T instances,
+    the dimension of each instance is M, and the input reshape_size is N, then the
+    output sequence has T*M/N instances, the dimension of each instance is N.
+    Note that T*M/N must be an integer.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       reshape = seq_reshape_layer(input=layer, reshape_size=4)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param reshape_size: the size of reshaped sequence.
+    :type reshape_size: int
+    :param name: Layer name.
+    :type name: basestring
+    :param act: Activation type; forwarded to the layer as its active_type.
+    :type act: BaseActivation
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    Layer(
+        inputs=[input.name],
+        name=name,
+        size=reshape_size,
+        type=LayerType.SEQUENCE_RESHAPE,
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        size=reshape_size,
+        layer_type=LayerType.SEQUENCE_RESHAPE,
+        activation=act,
+        parents=[input])
+
+
 @wrap_name_default()
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
@@ -1640,7 +1801,7 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 @layer_support()
 def trans_layer(input, name=None, layer_attr=None):
     """
-    A layer for transposition.
+    A layer for transposing a minibatch matrix.
 
     .. math::
        y = x^\mathrm{T}
@@ -1673,7 +1834,53 @@ def trans_layer(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
+def rotate_layer(input, height, width, name=None, layer_attr=None):
+    """
+    A layer for rotating 90 degrees (clock-wise) for each feature channel,
+    usually used when the input sample is some image or feature map.
+
+    .. math::
+       y(j,i,:) = x(M-i-1,j,:)
+
+    where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       rot = rotate_layer(input=layer, height=100, width=100)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param height: The height of the sample matrix.
+    :type height: int
+    :param width: The width of the sample matrix.
+    :type width: int
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    l = Layer(
+        name=name,
+        height=height,
+        width=width,
+        type=LayerType.ROTATE_LAYER,
+        inputs=[input.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.ROTATE_LAYER,
+        parents=[input],
+        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support()
+def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     """
     Cosine Similarity Layer. The cosine similarity equation is here.
 
@@ -1688,6 +1895,12 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
     Note that the above computation is for one sample. Multiple samples are
     processed in one batch.
 
+    The example usage is:
+
+    .. code-block:: python
+
+       cos = cos_sim(a=layer1, b=layer2, size=3)
+
     :param name: layer name
     :type name: basestring
     :param a: input layer a
@@ -1730,7 +1943,7 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
 @layer_support()
 def hsigmoid(input,
              label,
-             num_classes,
+             num_classes=None,
              name=None,
              bias_attr=None,
              param_attr=None,
@@ -1746,8 +1959,7 @@ def hsigmoid(input,
     ..  code-block:: python
 
         cost = hsigmoid(input=[layer1, layer2],
-                        label=data_layer,
-                        num_classes=3)
+                        label=data_layer)
 
     :param input: Input layers. It could be a LayerOutput or list/tuple of
                  LayerOutput.
@@ -1755,12 +1967,14 @@ def hsigmoid(input,
     :param label: Label layer.
     :type label: LayerOutput
     :param num_classes: number of classes.
-    :type num_classes: int
+    :type num_classes: int|None
     :param name: layer name
     :type name: basestring
     :param bias_attr: Bias attribute. None means default bias.
                       False means no bias.
     :type bias_attr: ParameterAttribute|False
+    :param param_attr: Parameter Attribute. None means default parameter.
+    :type param_attr: ParameterAttribute|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
@@ -1780,6 +1994,11 @@ def hsigmoid(input,
     assert isinstance(label, LayerOutput)
     assert label.layer_type == LayerType.DATA
 
+    if num_classes is None:
+        num_classes = label.size
+    if num_classes is None or num_classes <= 2:
+        raise ValueError("hsigmoid label size must larger than 2.")
+
     ipts_for_layer = []
     parents = []
     for each_input, each_param_attr in zip(input, param_attr):
@@ -1824,14 +2043,14 @@ def img_conv_layer(input,
                    trans=False,
                    layer_type=None):
     """
-    Convolution layer for image. Paddle can support both square and non-square 
+    Convolution layer for image. Paddle can support both square and non-square
     input currently.
 
     The details of convolution layer, please refer UFLDL's `convolution
     <http://ufldl.stanford.edu/tutorial/supervised/
     FeatureExtractionUsingConvolution/>`_ .
 
-    Convolution Transpose (deconv) layer for image. Paddle can support both square 
+    Convolution Transpose (deconv) layer for image. Paddle can support both square
     and non-square input currently.
 
     The details of convolution transpose layer,
@@ -1849,6 +2068,16 @@ def img_conv_layer(input,
     pieces. First 256/4 = 64 channels will process by first 32 filters. The
     rest channels will be processed by rest group of filters.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                              num_channels=8,
+                              num_filters=16, stride=1,
+                              bias_attr=False,
+                              act=ReluActivation())
+
     :param name: Layer name.
     :type name: basestring
     :param input: Layer Input.
@@ -1890,8 +2119,9 @@ def img_conv_layer(input,
     :param trans: true if it is a convTransLayer, false if it is a convLayer
     :type trans: bool
     :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt", otherwise layer_type 
-                       has to be either "exconv" or "cudnn_conv"
+                       layer_type has to be "exconvt" or "cudnn_convt",
+                       otherwise layer_type has to be either "exconv" or
+                       "cudnn_conv"
     :type layer_type: String
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -1931,7 +2161,7 @@ def img_conv_layer(input,
 
     if layer_type:
         if trans:
-            assert layer_type in ["exconvt"]
+            assert layer_type in ["exconvt", "cudnn_convt"]
         else:
             assert layer_type in ["exconv", "cudnn_conv"]
         lt = layer_type
@@ -1979,7 +2209,8 @@ def img_pool_layer(input,
                    layer_attr=None,
                    pool_size_y=None,
                    stride_y=None,
-                   padding_y=None):
+                   padding_y=None,
+                   ceil_mode=True):
     """
     Image pooling Layer.
 
@@ -1987,6 +2218,34 @@ def img_pool_layer(input,
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
+    - ceil_mode=True:
+
+    ..  math::
+
+        w = 1 + int(ceil((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(ceil((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+
+    - ceil_mode=False:
+
+    ..  math::
+
+        w = 1 + int(floor((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(floor((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        maxpool = img_pool_layer(input=conv,
+                                 pool_size=3,
+                                 pool_size_y=5,
+                                 num_channels=8,
+                                 stride=1,
+                                 stride_y=2,
+                                 padding=1,
+                                 padding_y=2,
+                                 pool_type=MaxPooling())
+
     :param padding: pooling padding width.
     :type padding: int
     :param padding_y: pooling padding height. It's equal to padding by default.
@@ -2010,6 +2269,10 @@ def img_pool_layer(input,
     :type stride_y: int|None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Whether to use ceil mode to calculate output height and
+                      width. Default is True. If set to False, floor is used.
+
+    :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2023,8 +2286,9 @@ def img_pool_layer(input,
         pool_type.name = 'avg'
 
     type_name = pool_type.name + '-projection' \
-      if (isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
-      else pool_type.name
+        if (
+        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+        else pool_type.name
 
     pool_size_y = pool_size if pool_size_y is None else pool_size_y
     stride_y = stride if stride_y is None else stride_y
@@ -2047,6 +2311,7 @@ def img_pool_layer(input,
                     stride_y=stride_y,
                     padding_y=padding_y))
         ],
+        ceil_mode=ceil_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2069,6 +2334,15 @@ def spp_layer(input,
     The details please refer to
     `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        spp = spp_layer(input=data,
+                        pyramid_height=2,
+                        num_channels=16,
+                        pool_type=MaxPooling())
+
     :param name: layer name.
     :type name: basestring
     :param input: layer's input.
@@ -2157,6 +2431,12 @@ def img_cmrnorm_layer(input,
     The details please refer to
     `Alex's paper <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        norm = img_cmrnorm_layer(input=net, size=5)
+
     :param name: layer name.
     :type name: None|basestring
     :param input: layer's input.
@@ -2212,6 +2492,12 @@ def batch_norm_layer(input,
     The details of batch normalization please refer to this
     `paper <http://arxiv.org/abs/1502.03167>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        norm = batch_norm_layer(input=net, act=ReluActivation())
+
     :param name: layer name.
     :type name: basestring
     :param input: batch normalization input. Better be linear activation.
@@ -2501,8 +2787,67 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
         size=sz)
 
 
+@wrap_name_default("seqconcat")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
+                     bias_attr=None):
+    """
+    Concat sequence a with sequence b.
+
+    Inputs:
+      - a = [a1, a2, ..., an]
+      - b = [b1, b2, ..., bn]
+      - a and b should have the same length, and a.size must equal b.size.
+
+    Output: [a1, b1, a2, b2, ..., an, bn]
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        concat = seq_concat_layer(a=layer1, b=layer2)
+
+    :param name: Layer name.
+    :type name: basestring
+    :param a: The first input sequence layer.
+    :type a: LayerOutput
+    :param b: The second input sequence layer.
+    :type b: LayerOutput
+    :param act: Activation type; forwarded to the layer as its active_type.
+    :type act: BaseActivation
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
+    assert a.size == b.size
+    Layer(
+        name=name,
+        type=LayerType.SEQUENCE_CONCAT_LAYER,
+        inputs=[a.name, b.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+
+    return LayerOutput(
+        name,
+        layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
+        parents=[a, b],
+        activation=act,
+        size=a.size)
+
+
+@wrap_name_default("memory", "memory_name")
 def memory(name,
            size,
+           memory_name=None,
            is_seq=False,
            boot_layer=None,
            boot_bias=None,
@@ -2524,14 +2869,32 @@ def memory(name,
     If boot_layer is not null, the memory is just the boot_layer's output.
     Set :code:`is_seq` is true boot layer is sequence.
 
-
     The same name layer in recurrent group will set memory on each time
     step.
 
-    :param name: memory's name.
+    .. code-block:: python
+
+       mem = memory(size=256, name='state')
+       state = fc_layer(input=mem, size=256, name='state')
+
+    If you do not want to specify the name, you can equivalently use set_input()
+    to specify the layer needs to be remembered as the following:
+
+    .. code-block:: python
+
+       mem = memory(size=256)
+       state = fc_layer(input=mem, size=256)
+       mem.set_input(state)
+
+    :param name: the name of the layer which this memory remembers.
+                 If name is None, user should call set_input() to specify the
+                 name of the layer which this memory remembers.
     :type name: basestring
     :param size: size of memory.
     :type size: int
+    :param memory_name: the name of the memory.
+                        It is ignored when name is provided.
+    :type memory_name: basestring
     :param is_seq: is sequence for boot_layer
     :type is_seq: bool
     :param boot_layer: boot layer of memory.
@@ -2553,13 +2916,21 @@ def memory(name,
         boot_bias = ParamAttr.to_bias(boot_bias)
 
     assert boot_layer is None or isinstance(boot_layer, LayerOutput)
+    if name is not None:
+        memory_name = None
 
-    agent_name = Memory(name, size, is_seq, boot_layer.name
-                        if boot_layer is not None else None, boot_bias,
-                        boot_bias_active_type.name, boot_with_const_id)
+    memory_name = Memory(
+        name,
+        size,
+        is_sequence=is_seq,
+        boot_layer=boot_layer.name if boot_layer is not None else None,
+        boot_bias=boot_bias,
+        boot_bias_active_type=boot_bias_active_type.name,
+        boot_with_const_id=boot_with_const_id,
+        memory_name=memory_name)
 
     lout = LayerOutput(
-        name=agent_name,
+        name=memory_name,
         size=size,
         layer_type=LayerType.MEMORY,
         parents=[boot_layer] if boot_layer is not None else None)
@@ -2661,6 +3032,7 @@ def lstm_step_layer(input,
 
 
 @wrap_bias_attr_default()
+@wrap_param_attr_default()
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('gru_step')
@@ -2672,6 +3044,7 @@ def gru_step_layer(input,
                    name=None,
                    gate_act=None,
                    bias_attr=None,
+                   param_attr=None,
                    layer_attr=None):
     """
 
@@ -2683,6 +3056,8 @@ def gru_step_layer(input,
     :param name:
     :param gate_act:
     :param bias_attr:
+    :param param_attr: the parameter_attribute for transforming the output_mem
+                       from previous step.
     :param layer_attr:
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2693,7 +3068,12 @@ def gru_step_layer(input,
     Layer(
         name=name,
         type=LayerType.GRU_STEP_LAYER,
-        inputs=[input.name, output_mem.name],
+        # The parameter here is for transforming the output_mem. The input has
+        # already been transformed outside this module so it does not need
+        # parameter associated with it.
+        # The parameter here is instead grouped with input is due to
+        # backward model compatibility.
+        inputs=[Input(input.name, **param_attr.attr), output_mem.name],
         bias=ParamAttr.to_bias(bias_attr),
         size=size,
         active_type=act.name,
@@ -2707,6 +3087,78 @@ def gru_step_layer(input,
         activation=act)
 
 
+@wrap_bias_attr_default()
+@wrap_param_attr_default()
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(act=TanhActivation())
+@wrap_name_default('gru_step_naive')
+@layer_support(ERROR_CLIPPING, DROPOUT)
+def gru_step_naive_layer(input,
+                         output_mem,
+                         size=None,
+                         name=None,
+                         act=None,
+                         gate_act=None,
+                         bias_attr=None,
+                         param_attr=None,
+                         layer_attr=None):
+    """
+    GRU step layer assembled from mixed_layer blocks rather than a dedicated
+    layer type. It supports ERROR_CLIPPING and DROPOUT.
+
+    :param input: Input layer; its size must be a multiple of 3.
+    :param output_mem: The output of this step layer at the previous step.
+    :param size: Output size. Defaults to input.size / 3 when None.
+    :param name: Layer name; also the prefix of the internal layers' names.
+    :param act: Activation of the output candidate.
+    :param gate_act: Activation of the update gate and the reset gate.
+    :param bias_attr: Bias attribute of both gates and the output candidate.
+    :param param_attr: Parameter attribute of every full_matrix_projection.
+    :param layer_attr: Extra attribute of both gates and the output candidate.
+    :return: The mixed layer (named `name`) holding this step's output.
+    """
+    if input.size % 3 != 0:
+        raise ValueError("GruStep input size must be divided by 3")
+    if size is None:
+        size = input.size / 3  # one slice each: update, reset, candidate
+
+    def __gate__(gate_name, offset):
+        with mixed_layer(
+                name=name + "_" + gate_name,
+                size=size,
+                layer_attr=layer_attr,
+                bias_attr=bias_attr,
+                act=gate_act) as gate:
+            gate += identity_projection(input=input, offset=offset)
+            gate += full_matrix_projection(
+                input=output_mem, param_attr=param_attr)
+        return gate
+
+    update_gate = __gate__("update", 0)  # reads input[0:size]
+    reset_gate = __gate__("reset", size)  # reads input[size:2*size]
+
+    with mixed_layer(
+            name=name + "_reset_output", bias_attr=False) as reset_output:
+        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
+
+    with mixed_layer(
+            name=name + "_output_candidate",
+            size=size,
+            layer_attr=layer_attr,
+            bias_attr=bias_attr,
+            act=act) as output_candidate:
+        output_candidate += identity_projection(input=input, offset=2 * size)
+        output_candidate += full_matrix_projection(
+            input=reset_output, param_attr=param_attr)
+
+    with mixed_layer(name=name) as output:  # (1 - u) * mem + u * candidate
+        output += identity_projection(output_mem)
+        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
+        output += dotmul_operator(a=output_candidate, b=update_gate)
+
+    return output
+
+
 @wrap_name_default()
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
@@ -2948,8 +3400,8 @@ def recurrent_group(step,
 
     assert (targetInlink == None or targetInlink_in_inlinks())
     targetInlinkName = None if targetInlink == None \
-                            else targetInlink.name if isinstance(targetInlink, LayerOutput) \
-                                                   else targetInlink.input.name
+        else targetInlink.name if isinstance(targetInlink, LayerOutput) \
+        else targetInlink.input.name
 
     contains_sub_seq = [False]
 
@@ -3184,9 +3636,15 @@ def beam_search(step,
                 simple_rnn += last_time_step_output
             return simple_rnn
 
+        generated_word_embedding = GeneratedInput(
+                               size=target_dictionary_dim,
+                               embedding_name="target_language_embedding",
+                               embedding_size=word_vector_dim)
+
         beam_gen = beam_search(name="decoder",
                                step=rnn_step,
-                               input=[StaticInput(encoder_last)],
+                               input=[StaticInput(encoder_last),
+                                      generated_word_embedding],
                                bos_id=0,
                                eos_id=1,
                                beam_size=5)
@@ -3205,7 +3663,8 @@ def beam_search(step,
                  You can refer to the first parameter of recurrent_group, or
                  demo/seqToseq/seqToseq_net.py for more details.
     :type step: callable
-    :param input: Input data for the recurrent unit
+    :param input: Input data for the recurrent unit, which should include the
+                  previously generated words as a GeneratedInput object.
     :type input: list
     :param bos_id: Index of the start symbol in the dictionary. The start symbol
                    is a special token for NLP task, which indicates the
@@ -3298,7 +3757,7 @@ def __cost_input__(input, label, weight=None):
     ipts = [Input(input.name), Input(label.name)]
     parents = [input, label]
     if weight is not None:
-        assert weight.layer_type == LayerType.DATA
+        assert weight.size == 1
         ipts.append(Input(weight.name))
         parents.append(weight)
     return ipts, parents
@@ -3306,11 +3765,13 @@ def __cost_input__(input, label, weight=None):
 
 @wrap_name_default()
 @layer_support()
-def regression_cost(input, label, weight=None, name=None, layer_attr=None):
+def mse_cost(input, label, weight=None, name=None, layer_attr=None):
     """
-    Regression Layer.
+    mean squared error cost:
+
+    ..  math::
 
-    TODO(yuyang18): Complete this method.
+        \\frac{1}{N}\\sum_{i=1}^N(t_i-y_i)^2
 
     :param name: layer name.
     :type name: basestring
@@ -3336,12 +3797,16 @@ def regression_cost(input, label, weight=None, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
+regression_cost = mse_cost
+
+
 @wrap_name_default("cost")
 @layer_support()
 def classification_cost(input,
                         label,
                         weight=None,
                         name=None,
+                        top_k=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -3356,6 +3821,8 @@ def classification_cost(input,
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3383,7 +3850,7 @@ def classification_cost(input,
         assert isinstance(e.for_classification, bool)
         assert e.for_classification
 
-        e(name=e.__name__, input=input, label=label, weight=weight)
+        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
 
     if not isinstance(evaluator, collections.Sequence):
         evaluator = [evaluator]
@@ -3403,7 +3870,8 @@ def conv_operator(img,
                   padding=0,
                   filter_size_y=None,
                   stride_y=None,
-                  padding_y=None):
+                  padding_y=None,
+                  trans=False):
     """
     Different from img_conv_layer, conv_op is an Operator, which can be used
     in mixed_layer. And conv_op takes two inputs to perform convolution.
@@ -3459,7 +3927,9 @@ def conv_operator(img,
     if filter.size is not None:
         filter.size = filter_size * filter_size_y * num_filters * num_channels
 
-    op = ConvOperator(
+    opCls = ConvTransOperator if trans else ConvOperator
+
+    op = opCls(
         input_layer_names=[img.name, filter.name],
         num_filters=num_filters,
         conv_conf=Conv(
@@ -3471,6 +3941,7 @@ def conv_operator(img,
             padding_y=padding_y,
             stride_y=stride_y,
             groups=1))
+
     op.origin = [img, filter]
     return op
 
@@ -3486,11 +3957,9 @@ def conv_projection(input,
                     stride_y=None,
                     padding_y=None,
                     groups=1,
-                    param_attr=None):
+                    param_attr=None,
+                    trans=False):
     """
-    ConvProjection with a layer as input.
-    It performs element-wise multiplication with weight.
-
     Different from img_conv_layer and conv_op, conv_projection is an Projection,
     which can be used in mixed_layer and conat_layer. It use cudnn to implement
     conv and only support GPU mode.
@@ -3499,7 +3968,7 @@ def conv_projection(input,
 
     .. code-block:: python
 
-       proj = conv_projection(img=input1,
+       proj = conv_projection(input=input1,
                               filter_size=3,
                               num_filters=64,
                               num_channels=64)
@@ -3528,6 +3997,8 @@ def conv_projection(input,
     :type groups: int
     :param param_attr: Convolution param attribute. None means default attribute
     :type param_attr: ParameterAttribute
+    :param trans: whether it is convTrans or conv
+    :type trans: boolean
     :return: A DotMulProjection Object.
     :rtype: DotMulProjection
     """
@@ -3564,7 +4035,9 @@ def conv_projection(input,
         param_attr.attr["initial_strategy"] = 0
         param_attr.attr["initial_smart"] = False
 
-    proj = ConvProjection(
+    projCls = ConvTransProjection if trans else ConvProjection
+
+    proj = projCls(
         input_layer_name=input.name,
         num_filters=num_filters,
         conv_conf=Conv(
@@ -3582,6 +4055,110 @@ def conv_projection(input,
     return proj
 
 
+@wrap_name_default("pad")
+@layer_support()
+def pad_layer(input,
+              pad_c=None,
+              pad_h=None,
+              pad_w=None,
+              name=None,
+              layer_attr=None):
+    """
+    This operation pads zeros to the input data according to pad_c, pad_h
+    and pad_w, which specify the padding sizes in the channel, height and
+    width dimensions respectively. The input data shape is NCHW.
+
+    For example, pad_c=[2,3] means padding 2 zeros before the
+    input data and 3 zeros after the input data in channel dimension.
+    pad_h means padding zeros in height dimension. pad_w means padding zeros
+    in width dimension.
+
+    For example,
+
+    .. code-block:: python
+
+       input(2,2,2,3)  = [
+                           [ [[1,2,3], [3,4,5]],
+                             [[2,3,5], [1,6,7]] ],
+                           [ [[4,3,1], [1,8,7]],
+                             [[3,8,9], [2,3,5]] ]
+                         ]
+
+       pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
+
+       output(2,4,2,3) = [
+                           [ [[0,0,0], [0,0,0]],
+                             [[1,2,3], [3,4,5]],
+                             [[2,3,5], [1,6,7]],
+                             [[0,0,0], [0,0,0]] ],
+                           [ [[0,0,0], [0,0,0]],
+                             [[4,3,1], [1,8,7]],
+                             [[3,8,9], [2,3,5]],
+                             [[0,0,0], [0,0,0]] ]
+                         ]
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       pad = pad_layer(input=ipt,
+                       pad_c=[4,4],
+                       pad_h=[0,0],
+                       pad_w=[2,2])
+
+    :param input: layer's input.
+    :type input: LayerOutput
+    :param pad_c: padding size in channel dimension.
+    :type pad_c: list|None
+    :param pad_h: padding size in height dimension.
+    :type pad_h: list|None
+    :param pad_w: padding size in width dimension.
+    :type pad_w: list|None
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param name: layer name.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if pad_c is not None:
+        assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
+    else:
+        pad_c = [0, 0]
+
+    if pad_h is not None:
+        assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
+    else:
+        pad_h = [0, 0]
+
+    if pad_w is not None:
+        assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
+    else:
+        pad_w = [0, 0]
+
+    assert input.num_filters is not None
+    in_ch = input.num_filters
+    out_ch = in_ch + pad_c[0] + pad_c[1]
+
+    l = Layer(
+        name=name,
+        type=LayerType.PAD_LAYER,
+        inputs=Input(
+            input.name,
+            pad=Pad(
+                channels=in_ch,
+                pad_c=pad_c,
+                pad_h=pad_h,
+                pad_w=pad_w, )),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        layer_type=LayerType.PAD_LAYER,
+        parents=[input],
+        num_filters=out_ch,
+        size=l.config.size)
+
+
 @wrap_name_default()
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
@@ -3605,13 +4182,13 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       conv_shift = conv_shift_layer(input=[layer1, layer2])
+       conv_shift = conv_shift_layer(a=layer1, b=layer2)
 
     :param name: layer name
     :type name: basestring
     :param a: Input layer a.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: input layer b.
     :type b: LayerOutput
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3703,8 +4280,8 @@ def tensor_layer(a,
 @wrap_act_default()
 @layer_support()
 def selective_fc_layer(input,
-                       select,
                        size,
+                       select=None,
                        act=None,
                        name=None,
                        pass_generation=False,
@@ -3731,6 +4308,7 @@ def selective_fc_layer(input,
     :type input: LayerOutput|list|tuple
     :param select: The select layer. The output of select layer should be a
                    sparse binary matrix, and treat as the mask of selective fc.
+                   If it is None, this layer acts exactly like fc_layer.
     :type select: LayerOutput
     :param size: The layer dimension.
     :type size: int
@@ -3959,7 +4537,7 @@ def block_expand_layer(input,
 
     .. code-block:: python
 
-       block_expand = block_expand_layer(input,
+       block_expand = block_expand_layer(input=layer,
                                          num_channels=128,
                                          stride_x=1,
                                          stride_y=1,
@@ -4013,13 +4591,7 @@ def block_expand_layer(input,
 
 @wrap_name_default()
 @layer_support()
-def maxout_layer(input,
-                 groups,
-                 num_channels=None,
-                 size_x=None,
-                 size_y=None,
-                 name=None,
-                 layer_attr=None):
+def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     """
     A layer to do max out on conv layer output.
       - Input: output of a conv layer.
@@ -4049,12 +4621,6 @@ def maxout_layer(input,
     :type num_channels: int|None
     :param groups: The group number of input layer.
     :type groups: int
-    :param size_x: conv output width. If None will be set
-                   automatically from previous output.
-    :type size_x: int|None
-    :param size_y: conv output height. If None will be set
-                   automatically from previous output.
-    :type size_y: int|None
     :param name: The name of this layer, which can not specify.
     :type name: None|basestring.
     :param layer_attr: Extra Layer attribute.
@@ -4175,7 +4741,7 @@ def warp_ctc_layer(input,
         - You can set 'blank' to any value ranged in [0, num_classes], which
           should be consistent as that used in your labels.
         - As a native 'softmax' activation is interated to the warp-ctc library,
-         'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected instead in the 'input' layer.
 
     The simple usage:
 
@@ -4308,6 +4874,13 @@ def crf_decoding_layer(input,
     this layer will also calculate error. output.value[i] is 1 for incorrect
     decoding or 0 for correct decoding.
 
+    The simple usage:
+
+    .. code-block:: python
+
+      crf_decoding = crf_decoding_layer(input=input,
+                                        size=label_dim)
+
     :param input: The first input layer.
     :type input: LayerOutput
     :param size: size of this layer.
@@ -4346,12 +4919,16 @@ def crf_decoding_layer(input,
     return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=1)
 
 
+@wrap_act_default(act=SigmoidActivation())
 @wrap_bias_attr_default(has_bias=True)
+@wrap_param_attr_default()
 @wrap_name_default()
 @layer_support()
 def nce_layer(input,
               label,
-              num_classes,
+              num_classes=None,
+              act=None,
+              param_attr=None,
               weight=None,
               num_neg_samples=10,
               neg_distribution=None,
@@ -4367,7 +4944,8 @@ def nce_layer(input,
 
     .. code-block:: python
 
-       cost = nce_layer(input=layer1, label=layer2, weight=layer3,
+       cost = nce_layer(input=[layer1, layer2], label=layer2,
+                        param_attr=[attr1, attr2], weight=layer3,
                         num_classes=3, neg_distribution=[0.1,0.3,0.6])
 
     :param name: layer name
@@ -4380,6 +4958,10 @@ def nce_layer(input,
     :type weight: LayerOutput
     :param num_classes: number of classes.
     :type num_classes: int
+    :param act: Activation, default is Sigmoid.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute, or a list with one per input.
+    :type param_attr: ParameterAttribute|list
     :param num_neg_samples: number of negative samples. Default is 10.
     :type num_neg_samples: int
     :param neg_distribution: The distribution for generating the random negative labels.
@@ -4395,19 +4977,32 @@ def nce_layer(input,
     """
     if isinstance(input, LayerOutput):
         input = [input]
+        assert not isinstance(param_attr, collections.Sequence)
+        param_attr = [param_attr]
+    else:
+        if isinstance(param_attr, collections.Sequence):
+            assert len(input) == len(param_attr)
+        else:
+            param_attr = [copy.deepcopy(param_attr) for _ in range(len(input))]
+
     assert isinstance(input, collections.Sequence)
+
     assert isinstance(label, LayerOutput)
     assert label.layer_type == LayerType.DATA
+    if num_classes is None:
+        num_classes = label.size
     if neg_distribution is not None:
         assert isinstance(neg_distribution, collections.Sequence)
         assert len(neg_distribution) == num_classes
-        assert sum(neg_distribution) == 1
+        assert abs(sum(neg_distribution) - 1.0) < 1e-5
+    if not isinstance(act, BaseActivation):
+        raise TypeError()
 
     ipts_for_layer = []
     parents = []
-    for each_input in input:
+    for each_input, attr in zip(input, param_attr):
         assert isinstance(each_input, LayerOutput)
-        ipts_for_layer.append(each_input.name)
+        ipts_for_layer.append(Input(each_input.name, **attr.attr))
         parents.append(each_input)
     ipts_for_layer.append(label.name)
     parents.append(label)
@@ -4423,12 +5018,17 @@ def nce_layer(input,
         type=LayerType.NCE_LAYER,
         num_classes=num_classes,
         neg_sampling_dist=neg_distribution,
+        active_type=act.name,
         num_neg_samples=num_neg_samples,
         inputs=ipts_for_layer,
         bias=ParamAttr.to_bias(bias_attr),
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
-        name, LayerType.NCE_LAYER, parents=parents, size=l.config.size)
+        name,
+        LayerType.NCE_LAYER,
+        parents=parents,
+        size=l.config.size,
+        activation=act)
 
 
 """
@@ -4573,7 +5173,12 @@ def lambda_cost(input,
 
 @wrap_name_default()
 @layer_support()
-def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
+def cross_entropy(input,
+                  label,
+                  name=None,
+                  coeff=1.0,
+                  weight=None,
+                  layer_attr=None):
     """
     A loss layer for multi class entropy.
 
@@ -4588,22 +5193,27 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
     :type input: LayerOutput.
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring.
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The cost is multiplied with coeff.
+                  The coefficient affects the gradient in the backward.
     :type coeff: float.
+    :param weight: The cost of each sample is multiplied with each weight.
+                   The weight should be a layer with size=1. Note that gradient
+                   will not be calculated for weight.
+    :type weight: LayerOutput
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute
     :return: LayerOutput object.
     :rtype: LayerOutput.
     """
 
+    ipts, parents = __cost_input__(input, label, weight)
     Layer(
         name=name,
         type=LayerType.CROSS_ENTROPY,
-        inputs=[input.name, label.name],
+        inputs=ipts,
         coeff=coeff,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
-        name, LayerType.CROSS_ENTROPY, parents=[input, label], size=1)
+    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
 
 
 @wrap_name_default()
@@ -4616,6 +5226,7 @@ def cross_entropy_with_selfnorm(input,
                                 layer_attr=None):
     """
     A loss layer for multi class entropy with selfnorm.
+    Input should be a vector of positive numbers, without normalization.
 
     .. code-block:: python
 
@@ -4736,8 +5347,6 @@ def multi_binary_label_cross_entropy(input,
     :type input: LayerOutput
     :param label: The input label.
     :type input: LayerOutput
-    :param type: The type of cost.
-    :type type: basestring
     :param name: The name of this layers. It is not necessary.
     :type name: None|basestring
     :param coeff: The coefficient affects the gradient in the backward.
@@ -4766,3 +5375,52 @@ def multi_binary_label_cross_entropy(input,
         LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
         parents=[input, label],
         size=1)
+
+
+@wrap_name_default()
+@layer_support()
+def smooth_l1_cost(input, label, name=None, layer_attr=None):
+    """
+    This is an L1 loss, but smoother. It requires that the
+    sizes of input and label are equal. The formula is as follows,
+
+    .. math::
+
+        L = \\sum_{i} smooth_{L1}(input_i - label_i)
+
+    in which
+
+    .. math::
+
+        smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if}  \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \\end{cases}
+
+    More details can be found by referring to `Fast R-CNN
+    <https://arxiv.org/pdf/1504.08083v2.pdf>`_
+
+    .. code-block:: python
+
+       cost = smooth_l1_cost(input=input_layer,
+                             label=label_layer)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param label: The input label.
+    :type label: LayerOutput
+    :param name: The name of this layer. It is not necessary.
+    :type name: None|basestring
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert isinstance(label, LayerOutput)
+    assert input.size == label.size
+
+    Layer(
+        name=name,
+        type=LayerType.SMOOTH_L1,
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.SMOOTH_L1, parents=[input, label], size=1)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
old mode 100644
new mode 100755
index 375bea34e8aa0ac2ea222531f313a627414495b0..fb533a47e0b0585be6f0e019086993f8b3aa7f38
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -737,12 +737,12 @@ def lstmemory_group(input,
                     lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    lstm_group is a recurrent layer group version Long Short Term Memory. It
+    lstm_group is a recurrent layer group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
     layers.py for the maths) does. A promising benefit is that LSTM memory
-    cell states, or hidden states in every time step are accessible to for the
+    cell states, or hidden states in every time step are accessible to the
     user. This is especially useful in attention model. If you do not need to
-    access to the internal states of the lstm, but merely use its outputs,
+    access the internal states of the lstm, but merely use its outputs,
     it is recommended to use the lstmemory, which is relatively faster than
     lstmemory_group.
 
@@ -822,9 +822,11 @@ def gru_unit(input,
              size=None,
              name=None,
              gru_bias_attr=None,
+             gru_param_attr=None,
              act=None,
              gate_act=None,
-             gru_layer_attr=None):
+             gru_layer_attr=None,
+             naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
     step. This function itself is not a recurrent layer, so that it can not be
@@ -856,12 +858,18 @@ def gru_unit(input,
 
     out_mem = memory(name=name, size=size)
 
-    gru_out = gru_step_layer(
+    if naive:
+        __step__ = gru_step_naive_layer
+    else:
+        __step__ = gru_step_layer
+
+    gru_out = __step__(
         name=name,
         input=input,
         output_mem=out_mem,
         size=size,
         bias_attr=gru_bias_attr,
+        param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         layer_attr=gru_layer_attr)
@@ -874,15 +882,17 @@ def gru_group(input,
               name=None,
               reverse=False,
               gru_bias_attr=None,
+              gru_param_attr=None,
               act=None,
               gate_act=None,
-              gru_layer_attr=None):
+              gru_layer_attr=None,
+              naive=False):
     """
-    gru_group is a recurrent layer group version Gated Recurrent Unit. It
+    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
-    benefit is that gru hidden sates are accessible to for the user. This is
-    especially useful in attention model. If you do not need to access to
-    any internal state, but merely use the outputs of a GRU, it is recommanded
+    benefit is that gru hidden states are accessible to the user. This is
+    especially useful in attention model. If you do not need to access
+    any internal state, but merely use the outputs of a GRU, it is recommended
     to use the grumemory, which is relatively faster.
 
     Please see grumemory in layers.py for more detail about the maths.
@@ -922,9 +932,11 @@ def gru_group(input,
             name=name,
             size=size,
             gru_bias_attr=gru_bias_attr,
+            gru_param_attr=gru_param_attr,
             act=act,
             gate_act=gate_act,
-            gru_layer_attr=gru_layer_attr)
+            gru_layer_attr=gru_layer_attr,
+            naive=naive)
 
     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -942,9 +954,11 @@ def simple_gru(input,
                mixed_bias_param_attr=None,
                mixed_layer_attr=None,
                gru_bias_attr=None,
+               gru_param_attr=None,
                act=None,
                gate_act=None,
-               gru_layer_attr=None):
+               gru_layer_attr=None,
+               naive=False):
     """
     You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
     simple_gru in network.py. The reason why there are so many interfaces is
@@ -952,22 +966,22 @@ def simple_gru(input,
     use one complete layer to implement rnn (including simple rnn, gru and lstm)
     with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But,
     the multiplication operation :math:`W x_t` is not computed in these layers.
-    See details in their interfaces in layers.py. 
+    See details in their interfaces in layers.py.
     The other implementation is to use an recurrent group which can ensemble a
     series of layers to compute rnn step by step. This way is flexible for
     attenion mechanism or other complex connections.
 
     - gru_step_layer: only compute rnn by one step. It needs an memory as input
       and can be used in recurrent group.
-    - gru_unit: a wrapper of gru_step_layer with memory. 
+    - gru_unit: a wrapper of gru_step_layer with memory.
     - gru_group: a GRU cell implemented by a combination of multiple layers in
       recurrent group.
-      But :math:`W x_t` is not done in group.  
+      But :math:`W x_t` is not done in group.
     - gru_memory: a GRU cell implemented by one layer, which does same calculation
-      with gru_group and is faster than gru_group. 
-    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and 
+      with gru_group and is faster than gru_group.
+    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and
       gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
-      formula in grumemory. 
+      formula in grumemory.
 
     The computational speed is that, grumemory is relatively better than
     gru_group, and gru_group is relatively better than simple_gru.
@@ -1010,9 +1024,11 @@ def simple_gru(input,
         input=m,
         reverse=reverse,
         gru_bias_attr=gru_bias_attr,
+        gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr)
+        gru_layer_attr=gru_layer_attr,
+        naive=naive)
 
 
 @wrap_name_default('simple_gru2')
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 403aafabe9143472dd2f0857ecd25f7acf515b6c..6c860fd49702ebc93612114011361efb885c62ec 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -9,17 +9,9 @@ add_test(NAME test_reset_hook
         ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-if (PROTOBUF_3)
-  add_paddle_exe(protobuf_equal
-    ProtobufEqualMain.cpp)
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-    ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-  )
-else()
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-  )
-endif()
+add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
+add_test(NAME test_layerHelpers
+  COMMAND
+  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
+)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 3f1d99701afe5425553feb129c7619b5e3e689fa..c5dc8e1aab08d38936d8636c219571d0cf6f4906 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -3,7 +3,8 @@ export configs=(test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
+test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
+test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
+test_seq_concat_reshape test_pad test_smooth_l1)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index ee5961af75ebb33af52f9add645f793015288f4e..8a318879630cd491573afcaf798dda2ca75e335d 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -11,6 +11,9 @@ for conf in ${configs[*]}
 do
     echo "Generating " $conf
     $1 -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    if [ ! -f "$protostr/$conf.protostr" ]; then 
+        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
+    fi
     cat ${conf}.py |$1 test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
@@ -18,5 +21,8 @@ for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
     $1 -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    if [ ! -f "$protostr/$conf.protostr" ]; then 
+        cp $protostr/$conf.protostr.unittest $protostr/$conf.protostr
+    fi
     cat ${conf}.py |$1 test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
index 3a1a0132b64bb928e857905f99c0be2e81ccbda2..3c6dbc95e54898ca1e44c3dc010c9fb73a3bee30 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
@@ -14,4 +14,7 @@ for op in seq_op:
     for al in agg_level:
         opts.append(op(input=din, agg_level=al))
 
+for op in seq_op:
+    opts.append(op(input=din, agg_level=AggregateLevel.EACH_TIMESTEP, stride=5))
+
 outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index 3331c10d6497f58eb135208bd7abe48aacfb10ae..24c901c8ee3ab1c90fc14fbff761db06345a6313 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -7,8 +7,9 @@ x = layer_math.exp(x)
 x = layer_math.log(x)
 x = layer_math.abs(x)
 x = layer_math.sigmoid(x)
+x = layer_math.tanh(x)
 x = layer_math.square(x)
-x = layer_math.square(x)
+x = layer_math.relu(x)
 y = 1 + x
 y = y + 1
 y = x + y
diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py
index aa4521dcd5db3f845871cfaaedb02a86bcbddc38..dc8975cb311582a621eb4a5a166ddc34348fe3e9 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py
@@ -34,11 +34,31 @@ flt = data_layer(name='filter', size=3 * 3 * 1 * 64)
 with mixed_layer() as m7:
     m7 += conv_operator(
         img=img, filter=flt, num_filters=64, num_channels=1, filter_size=3)
+    m7 += conv_projection(img, filter_size=3, num_filters=64, num_channels=1)
 
+with mixed_layer() as m8:
+    m8 += conv_operator(
+        img=img,
+        filter=flt,
+        num_filters=64,
+        num_channels=1,
+        filter_size=3,
+        stride=2,
+        padding=1,
+        trans=True)
+    m8 += conv_projection(
+        img,
+        filter_size=3,
+        num_filters=64,
+        num_channels=1,
+        stride=2,
+        padding=1,
+        trans=True)
 end = mixed_layer(
     input=[
         full_matrix_projection(input=m5),
-        trans_full_matrix_projection(input=m6), full_matrix_projection(input=m7)
+        trans_full_matrix_projection(input=m6),
+        full_matrix_projection(input=m7), full_matrix_projection(input=m8)
     ],
     size=100,
     layer_attr=ExtraAttr(
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
index 6934fd0da62f90f9bbddef9a98798bf168f7fa8e..2818389b16cca75f5030b75fc4de8c89c06c5e02 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -33,6 +33,8 @@ layers {
   bias_parameter_name: "___conv_0__.wbias"
   num_filters: 64
   shared_biases: true
+  height: 256
+  width: 256
 }
 layers {
   name: "__batch_norm_0__"
@@ -58,6 +60,8 @@ layers {
   }
   bias_parameter_name: "___batch_norm_0__.wbias"
   moving_average_fraction: 0.9
+  height: 256
+  width: 256
 }
 layers {
   name: "__crmnorm_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
index 7b2911f8e367ebf9d0797e815a7532c714ef698e..12b2255f3a41119792d0f993ce2e03ce9ee3e994 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr
@@ -15,6 +15,7 @@ layers {
   }
   select_first: true
   trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__first_seq_1__"
@@ -26,6 +27,7 @@ layers {
   }
   select_first: true
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__last_seq_0__"
@@ -36,6 +38,7 @@ layers {
     input_layer_name: "data"
   }
   trans_type: "seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__last_seq_1__"
@@ -46,12 +49,38 @@ layers {
     input_layer_name: "data"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
+}
+layers {
+  name: "__first_seq_2__"
+  type: "seqlastins"
+  size: 30
+  active_type: "linear"
+  inputs {
+    input_layer_name: "data"
+  }
+  select_first: true
+  trans_type: "non-seq"
+  seq_pool_stride: 5
+}
+layers {
+  name: "__last_seq_2__"
+  type: "seqlastins"
+  size: 30
+  active_type: "linear"
+  inputs {
+    input_layer_name: "data"
+  }
+  trans_type: "non-seq"
+  seq_pool_stride: 5
 }
 input_layer_names: "data"
 output_layer_names: "__first_seq_0__"
 output_layer_names: "__first_seq_1__"
 output_layer_names: "__last_seq_0__"
 output_layer_names: "__last_seq_1__"
+output_layer_names: "__first_seq_2__"
+output_layer_names: "__last_seq_2__"
 sub_models {
   name: "root"
   layer_names: "data"
@@ -59,11 +88,15 @@ sub_models {
   layer_names: "__first_seq_1__"
   layer_names: "__last_seq_0__"
   layer_names: "__last_seq_1__"
+  layer_names: "__first_seq_2__"
+  layer_names: "__last_seq_2__"
   input_layer_names: "data"
   output_layer_names: "__first_seq_0__"
   output_layer_names: "__first_seq_1__"
   output_layer_names: "__last_seq_0__"
   output_layer_names: "__last_seq_1__"
+  output_layer_names: "__first_seq_2__"
+  output_layer_names: "__last_seq_2__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
index da8da1b541f37a09654202f68232b99e4dac9f61..9b8a2ad9687d313e6c5017c2d7331eddf539af92 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
@@ -65,13 +65,28 @@ layers {
     }
   }
 }
+layers {
+  name: "__tanh_0__"
+  type: "mixed"
+  size: 100
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__sigmoid_0__"
+    proj_conf {
+      type: "identity"
+      name: "___tanh_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
 layers {
   name: "__square_0__"
   type: "mixed"
   size: 100
   active_type: "square"
   inputs {
-    input_layer_name: "__sigmoid_0__"
+    input_layer_name: "__tanh_0__"
     proj_conf {
       type: "identity"
       name: "___square_0__.w0"
@@ -81,15 +96,15 @@ layers {
   }
 }
 layers {
-  name: "__square_1__"
+  name: "__relu_0__"
   type: "mixed"
   size: 100
-  active_type: "square"
+  active_type: "relu"
   inputs {
     input_layer_name: "__square_0__"
     proj_conf {
       type: "identity"
-      name: "___square_1__.w0"
+      name: "___relu_0__.w0"
       input_size: 100
       output_size: 100
     }
@@ -101,7 +116,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: 1.0
   intercept: 1
@@ -123,7 +138,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
     proj_conf {
       type: "identity"
       name: "___mixed_0__.w0"
@@ -147,7 +162,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: -1.0
   intercept: 0.0
@@ -339,8 +354,9 @@ sub_models {
   layer_names: "__log_0__"
   layer_names: "__abs_0__"
   layer_names: "__sigmoid_0__"
+  layer_names: "__tanh_0__"
   layer_names: "__square_0__"
-  layer_names: "__square_1__"
+  layer_names: "__relu_0__"
   layer_names: "__slope_intercept_layer_0__"
   layer_names: "__slope_intercept_layer_1__"
   layer_names: "__mixed_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
index 2943ab130bd7d6f3b78ea611f1c35850ccaf5e92..d8bd7b9dfb71a392d0dc53872a0d72f47530530f 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
@@ -154,13 +154,40 @@ layers {
   inputs {
     input_layer_name: "img"
   }
+  inputs {
+    input_layer_name: "img"
+    input_parameter_name: "___mixed_6__.w1"
+    proj_conf {
+      type: "conv"
+      name: "___mixed_6__.w1"
+      input_size: 1024
+      output_size: 57600
+      conv_conf {
+        filter_size: 3
+        channels: 1
+        stride: 1
+        padding: 0
+        groups: 1
+        filter_channels: 1
+        output_x: 30
+        img_size: 32
+        caffe_mode: true
+        filter_size_y: 3
+        padding_y: 0
+        stride_y: 1
+        output_y: 30
+        img_size_y: 32
+      }
+      num_filters: 64
+    }
+  }
   inputs {
     input_layer_name: "filter"
   }
   operator_confs {
     type: "conv"
     input_indices: 0
-    input_indices: 1
+    input_indices: 2
     input_sizes: 1024
     input_sizes: 576
     output_size: 57600
@@ -186,39 +213,114 @@ layers {
 layers {
   name: "__mixed_7__"
   type: "mixed"
+  size: 254016
+  active_type: ""
+  inputs {
+    input_layer_name: "img"
+  }
+  inputs {
+    input_layer_name: "img"
+    input_parameter_name: "___mixed_7__.w1"
+    proj_conf {
+      type: "convt"
+      name: "___mixed_7__.w1"
+      input_size: 1024
+      output_size: 254016
+      conv_conf {
+        filter_size: 3
+        channels: 1
+        stride: 2
+        padding: 1
+        groups: 1
+        filter_channels: 64
+        output_x: 32
+        img_size: 63
+        caffe_mode: true
+        filter_size_y: 3
+        padding_y: 1
+        stride_y: 2
+        output_y: 32
+        img_size_y: 63
+      }
+      num_filters: 64
+    }
+  }
+  inputs {
+    input_layer_name: "filter"
+  }
+  operator_confs {
+    type: "convt"
+    input_indices: 0
+    input_indices: 2
+    input_sizes: 1024
+    input_sizes: 576
+    output_size: 254016
+    conv_conf {
+      filter_size: 3
+      channels: 1
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 64
+      output_x: 32
+      img_size: 63
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 32
+      img_size_y: 63
+    }
+    num_filters: 64
+  }
+}
+layers {
+  name: "__mixed_8__"
+  type: "mixed"
   size: 100
   active_type: ""
   inputs {
     input_layer_name: "__mixed_4__"
-    input_parameter_name: "___mixed_7__.w0"
+    input_parameter_name: "___mixed_8__.w0"
     proj_conf {
       type: "fc"
-      name: "___mixed_7__.w0"
+      name: "___mixed_8__.w0"
       input_size: 300
       output_size: 100
     }
   }
   inputs {
     input_layer_name: "__mixed_5__"
-    input_parameter_name: "___mixed_7__.w1"
+    input_parameter_name: "___mixed_8__.w1"
     proj_conf {
       type: "trans_fc"
-      name: "___mixed_7__.w1"
+      name: "___mixed_8__.w1"
       input_size: 100
       output_size: 100
     }
   }
   inputs {
     input_layer_name: "__mixed_6__"
-    input_parameter_name: "___mixed_7__.w2"
+    input_parameter_name: "___mixed_8__.w2"
     proj_conf {
       type: "fc"
-      name: "___mixed_7__.w2"
+      name: "___mixed_8__.w2"
       input_size: 57600
       output_size: 100
     }
   }
+  inputs {
+    input_layer_name: "__mixed_7__"
+    input_parameter_name: "___mixed_8__.w3"
+    proj_conf {
+      type: "fc"
+      name: "___mixed_8__.w3"
+      input_size: 254016
+      output_size: 100
+    }
+  }
   drop_rate: 0.5
+  error_clipping_threshold: 40.0
 }
 parameters {
   name: "___embedding_0__.w0"
@@ -281,7 +383,23 @@ parameters {
   initial_smart: true
 }
 parameters {
-  name: "___mixed_7__.w0"
+  name: "___mixed_6__.w1"
+  size: 576
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___mixed_7__.w1"
+  size: 576
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___mixed_8__.w0"
   size: 30000
   initial_mean: 0.0
   initial_std: 0.057735026919
@@ -291,7 +409,7 @@ parameters {
   initial_smart: true
 }
 parameters {
-  name: "___mixed_7__.w1"
+  name: "___mixed_8__.w1"
   size: 10000
   initial_mean: 0.0
   initial_std: 0.1
@@ -301,7 +419,7 @@ parameters {
   initial_smart: true
 }
 parameters {
-  name: "___mixed_7__.w2"
+  name: "___mixed_8__.w2"
   size: 5760000
   initial_mean: 0.0
   initial_std: 0.00416666666667
@@ -310,10 +428,20 @@ parameters {
   initial_strategy: 0
   initial_smart: true
 }
+parameters {
+  name: "___mixed_8__.w3"
+  size: 25401600
+  initial_mean: 0.0
+  initial_std: 0.00198412698413
+  dims: 254016
+  dims: 100
+  initial_strategy: 0
+  initial_smart: true
+}
 input_layer_names: "test"
 input_layer_names: "img"
 input_layer_names: "filter"
-output_layer_names: "__mixed_7__"
+output_layer_names: "__mixed_8__"
 sub_models {
   name: "root"
   layer_names: "test"
@@ -328,10 +456,11 @@ sub_models {
   layer_names: "filter"
   layer_names: "__mixed_6__"
   layer_names: "__mixed_7__"
+  layer_names: "__mixed_8__"
   input_layer_names: "test"
   input_layer_names: "img"
   input_layer_names: "filter"
-  output_layer_names: "__mixed_7__"
+  output_layer_names: "__mixed_8__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..64530146a1458933d4ba0edffc1b1b7e60a21187
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
@@ -0,0 +1,297 @@
+type: "recurrent_nn"
+layers {
+  name: "data_a"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "data_b"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0___transform"
+  type: "mixed"
+  size: 600
+  active_type: ""
+  inputs {
+    input_layer_name: "data_a"
+    input_parameter_name: "mixed_param"
+    proj_conf {
+      type: "fc"
+      name: "___simple_gru_0___transform.w0"
+      input_size: 100
+      output_size: 600
+    }
+  }
+}
+layers {
+  name: "__simple_gru_0___recurrent_group"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+  type: "scatter_agent"
+  size: 600
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+  type: "gru_step"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+    input_parameter_name: "gru_param"
+  }
+  inputs {
+    input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  }
+  bias_parameter_name: "gru_bias"
+  active_gate_type: "sigmoid"
+}
+layers {
+  name: "__simple_gru_0__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1___transform"
+  type: "mixed"
+  size: 600
+  active_type: ""
+  inputs {
+    input_layer_name: "data_b"
+    input_parameter_name: "mixed_param"
+    proj_conf {
+      type: "fc"
+      name: "___simple_gru_1___transform.w0"
+      input_size: 100
+      output_size: 600
+    }
+  }
+}
+layers {
+  name: "__simple_gru_1___recurrent_group"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+  type: "scatter_agent"
+  size: 600
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+  type: "gru_step"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+    input_parameter_name: "gru_param"
+  }
+  inputs {
+    input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  }
+  bias_parameter_name: "gru_bias"
+  active_gate_type: "sigmoid"
+}
+layers {
+  name: "__simple_gru_1__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__last_seq_0__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__simple_gru_0__"
+  }
+  trans_type: "non-seq"
+  seq_pool_stride: -1
+}
+layers {
+  name: "__last_seq_1__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__simple_gru_1__"
+  }
+  trans_type: "non-seq"
+  seq_pool_stride: -1
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__last_seq_0__"
+    input_parameter_name: "softmax_param"
+  }
+  inputs {
+    input_layer_name: "__last_seq_1__"
+    input_parameter_name: "softmax_param"
+  }
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__cost_0__"
+  type: "multi-class-cross-entropy"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+parameters {
+  name: "mixed_param"
+  size: 60000
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 600
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "gru_param"
+  size: 120000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 600
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "gru_bias"
+  size: 600
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 600
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "softmax_param"
+  size: 2000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data_a"
+input_layer_names: "data_b"
+input_layer_names: "label"
+output_layer_names: "__cost_0__"
+evaluators {
+  name: "classification_error_evaluator"
+  type: "classification_error"
+  input_layers: "__fc_layer_0__"
+  input_layers: "label"
+}
+sub_models {
+  name: "root"
+  layer_names: "data_a"
+  layer_names: "data_b"
+  layer_names: "__simple_gru_0___transform"
+  layer_names: "__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__"
+  layer_names: "__simple_gru_1___transform"
+  layer_names: "__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__"
+  layer_names: "__last_seq_0__"
+  layer_names: "__last_seq_1__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "label"
+  layer_names: "__cost_0__"
+  input_layer_names: "data_a"
+  input_layer_names: "data_b"
+  input_layer_names: "label"
+  output_layer_names: "__cost_0__"
+  evaluator_names: "classification_error_evaluator"
+  is_recurrent_layer_group: false
+}
+sub_models {
+  name: "__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+    link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "__simple_gru_0___transform"
+    link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+    link_name: "__simple_gru_0__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
+sub_models {
+  name: "__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+    link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "__simple_gru_1___transform"
+    link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+    link_name: "__simple_gru_1__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
index 0a83499b724806666a241489467207f3c7151a3a..79fa4c74f081aebadd258e06333de9eafe6a5ee3 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
@@ -210,6 +210,7 @@ layers {
     input_layer_name: "__lstm_group_0__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__last_seq_1__"
@@ -220,6 +221,7 @@ layers {
     input_layer_name: "__lstm_group_1__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__fc_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
index dacb40185f863025528c2d4eeb8b325425953a93..68fa881b4f1408b8cd20f2417062ce035c0fda54 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr
@@ -143,6 +143,7 @@ layers {
     input_layer_name: "__recurrent_layer_0__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__first_seq_0__"
@@ -154,6 +155,7 @@ layers {
   }
   select_first: true
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__last_seq_1__"
@@ -164,6 +166,7 @@ layers {
     input_layer_name: "__lstmemory_0__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__first_seq_1__"
@@ -175,6 +178,7 @@ layers {
   }
   select_first: true
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__last_seq_2__"
@@ -185,6 +189,7 @@ layers {
     input_layer_name: "__gru_0__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__first_seq_2__"
@@ -196,6 +201,7 @@ layers {
   }
   select_first: true
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 parameters {
   name: "___fc_layer_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
index 9fae596f281d44dc24c45cb3c750233266e95948..fd5224ca55cd1f642ca2f927f867a7cbf8a47cf6 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bilinear_interp.protostr
@@ -90,8 +90,6 @@ layers {
     input_layer_name: "__pool_0__"
     input_parameter_name: "___fc_layer_0__.w0"
   }
-  height: 32
-  width: 32
 }
 parameters {
   name: "___conv_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 10e59e21bc7a48bc53fb535f86f053c91f57c1df..05847344be60b4de42a7dd709914fd3da524d1ae 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -215,6 +215,22 @@ layers {
   }
   coeff: 1.0
 }
+layers {
+  name: "__nce_layer_0__"
+  type: "nce"
+  size: 1
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___nce_layer_0__.w0"
+  }
+  inputs {
+    input_layer_name: "labels"
+  }
+  bias_parameter_name: "___nce_layer_0__.wbias"
+  num_classes: 5000
+  num_neg_samples: 10
+}
 parameters {
   name: "___fc_layer_0__.w0"
   size: 800
@@ -239,12 +255,32 @@ parameters {
   name: "___crf_layer_0__.w0"
   size: 24
   initial_mean: 0.0
-  initial_std: 0.5
-  dims: 4
+  initial_std: 0.408248290464
   dims: 6
+  dims: 4
   initial_strategy: 0
   initial_smart: true
 }
+parameters {
+  name: "___nce_layer_0__.w0"
+  size: 20000
+  initial_mean: 0.0
+  initial_std: 0.0141421356237
+  dims: 5000
+  dims: 4
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___nce_layer_0__.wbias"
+  size: 5000
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 5000
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "input"
 input_layer_names: "labels"
 input_layer_names: "crf_label"
@@ -267,6 +303,7 @@ output_layer_names: "__cross_entropy_with_selfnorm_0__"
 output_layer_names: "__huber_cost_0__"
 output_layer_names: "__multi_binary_label_cross_entropy_0__"
 output_layer_names: "__sum_cost_0__"
+output_layer_names: "__nce_layer_0__"
 sub_models {
   name: "root"
   layer_names: "input"
@@ -292,6 +329,7 @@ sub_models {
   layer_names: "__huber_cost_0__"
   layer_names: "__multi_binary_label_cross_entropy_0__"
   layer_names: "__sum_cost_0__"
+  layer_names: "__nce_layer_0__"
   input_layer_names: "input"
   input_layer_names: "labels"
   input_layer_names: "crf_label"
@@ -314,6 +352,7 @@ sub_models {
   output_layer_names: "__huber_cost_0__"
   output_layer_names: "__multi_binary_label_cross_entropy_0__"
   output_layer_names: "__sum_cost_0__"
+  output_layer_names: "__nce_layer_0__"
   is_recurrent_layer_group: false
 }
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index 811b38ae4a51e8faedb59fea2b81a8be3cceeae6..b7d74f85ab4ca3f434dfa45516dfee7227b6ceee 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -45,7 +45,7 @@ layers {
   coeff: 1.0
 }
 layers {
-  name: "__regression_cost_0__"
+  name: "__mse_cost_0__"
   type: "square_error"
   size: 1
   active_type: ""
@@ -60,6 +60,31 @@ layers {
   }
   coeff: 1.0
 }
+layers {
+  name: "multi_class_label"
+  type: "data"
+  size: 500
+  active_type: ""
+}
+layers {
+  name: "__nce_layer_0__"
+  type: "nce"
+  size: 1
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___nce_layer_0__.w0"
+  }
+  inputs {
+    input_layer_name: "multi_class_label"
+  }
+  inputs {
+    input_layer_name: "weight"
+  }
+  bias_parameter_name: "___nce_layer_0__.wbias"
+  num_classes: 500
+  num_neg_samples: 10
+}
 parameters {
   name: "___fc_layer_0__.w0"
   size: 3000
@@ -80,11 +105,32 @@ parameters {
   initial_strategy: 0
   initial_smart: false
 }
+parameters {
+  name: "___nce_layer_0__.w0"
+  size: 5000
+  initial_mean: 0.0
+  initial_std: 0.04472135955
+  dims: 500
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___nce_layer_0__.wbias"
+  size: 500
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 500
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "input"
 input_layer_names: "label"
 input_layer_names: "weight"
+input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
-output_layer_names: "__regression_cost_0__"
+output_layer_names: "__mse_cost_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -99,12 +145,15 @@ sub_models {
   layer_names: "weight"
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
-  layer_names: "__regression_cost_0__"
+  layer_names: "__mse_cost_0__"
+  layer_names: "multi_class_label"
+  layer_names: "__nce_layer_0__"
   input_layer_names: "input"
   input_layer_names: "label"
   input_layer_names: "weight"
+  input_layer_names: "multi_class_label"
   output_layer_names: "__cost_0__"
-  output_layer_names: "__regression_cost_0__"
+  output_layer_names: "__mse_cost_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
index c763a95f9d1aefa022f38e0beef6d1c86ebb360d..03f4f3a31d6c222d949f64341bb8ac4c2a56fc5a 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr
@@ -153,8 +153,6 @@ layers {
       img_size_y: 0
     }
   }
-  height: 24
-  width: 24
 }
 layers {
   name: "__fc_layer_0__"
@@ -165,8 +163,6 @@ layers {
     input_layer_name: "__block_expand_layer_0__"
     input_parameter_name: "___fc_layer_0__.w0"
   }
-  height: 24
-  width: 24
 }
 parameters {
   name: "___conv_0__.w0"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
index b30bbb2a4e24d74ebe1d6c8eda8be5aa09217f6d..c1bfdf1b19c61d096c25af061c6fbb3bbfc50265 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
@@ -79,7 +79,7 @@ layers {
   inputs {
     input_layer_name: "b"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__cos_sim_1__"
@@ -92,7 +92,7 @@ layers {
   inputs {
     input_layer_name: "c"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__sum_to_one_norm_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..15c6ab4dc8e61dedc10acaa49db7d8ae136d4952
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_pad.protostr
@@ -0,0 +1,120 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2016
+  active_type: ""
+  height: 48
+  width: 42
+}
+layers {
+  name: "__conv_0__"
+  type: "exconv"
+  size: 32256
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___conv_0__.w0"
+    conv_conf {
+      filter_size: 3
+      channels: 1
+      stride: 1
+      padding: 1
+      groups: 1
+      filter_channels: 1
+      output_x: 42
+      img_size: 42
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 1
+      output_y: 48
+      img_size_y: 48
+    }
+  }
+  bias_parameter_name: "___conv_0__.wbias"
+  num_filters: 16
+  shared_biases: true
+  height: 48
+  width: 42
+}
+layers {
+  name: "__pool_0__"
+  type: "pool"
+  size: 8064
+  active_type: ""
+  inputs {
+    input_layer_name: "__conv_0__"
+    pool_conf {
+      pool_type: "max-projection"
+      channels: 16
+      size_x: 2
+      stride: 2
+      output_x: 21
+      img_size: 42
+      padding: 0
+      size_y: 2
+      stride_y: 2
+      output_y: 24
+      img_size_y: 48
+      padding_y: 0
+    }
+  }
+  height: 24
+  width: 21
+}
+layers {
+  name: "__pad_0__"
+  type: "pad"
+  size: 14175
+  active_type: ""
+  inputs {
+    input_layer_name: "__pool_0__"
+    pad_conf {
+      image_conf {
+        channels: 16
+        img_size: 21
+        img_size_y: 24
+      }
+      pad_c: 2
+      pad_c: 3
+      pad_h: 1
+      pad_h: 2
+      pad_w: 3
+      pad_w: 1
+    }
+  }
+  height: 27
+  width: 25
+}
+parameters {
+  name: "___conv_0__.w0"
+  size: 144
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___conv_0__.wbias"
+  size: 16
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 16
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__pad_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__conv_0__"
+  layer_names: "__pool_0__"
+  layer_names: "__pad_0__"
+  input_layer_names: "data"
+  output_layer_names: "__pad_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
index 41d2e2f2671f5c05425f9bd2e91d8adc33129761..77b447aa9db2a6c323fd3c322e7e9ca1ed19a6dd 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -96,6 +96,7 @@ layers {
     input_layer_name: "rnn_forward"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__recurrent_group_1__"
@@ -145,6 +146,7 @@ layers {
   }
   select_first: true
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__recurrent_group_2__"
@@ -193,6 +195,7 @@ layers {
     input_layer_name: "rnn_subseq_forward"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__lstm_group_0___recurrent_group"
@@ -282,6 +285,7 @@ layers {
     input_layer_name: "__lstm_group_0__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 layers {
   name: "__gru_group_0___recurrent_group"
@@ -330,6 +334,56 @@ layers {
     input_layer_name: "__gru_group_0__"
   }
   trans_type: "non-seq"
+  seq_pool_stride: -1
+}
+layers {
+  name: "__recurrent_group_3__"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "seq_input@__recurrent_group_3__"
+  type: "scatter_agent"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__memory_6__@__recurrent_group_3__"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__fc_layer_0__@__recurrent_group_3__"
+  type: "fc"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "seq_input@__recurrent_group_3__"
+    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w0"
+  }
+  inputs {
+    input_layer_name: "__memory_6__@__recurrent_group_3__"
+    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w1"
+  }
+  bias_parameter_name: "___fc_layer_0__@__recurrent_group_3__.wbias"
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__last_seq_4__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  trans_type: "non-seq"
+  seq_pool_stride: -1
 }
 parameters {
   name: "___mixed_0__.w0"
@@ -465,11 +519,11 @@ parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
   size: 30000
   initial_mean: 0.0
-  initial_std: 0.01
+  initial_std: 0.1
   dims: 100
   dims: 300
   initial_strategy: 0
-  initial_smart: false
+  initial_smart: true
 }
 parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
@@ -481,6 +535,36 @@ parameters {
   initial_strategy: 0
   initial_smart: false
 }
+parameters {
+  name: "___fc_layer_0__@__recurrent_group_3__.w0"
+  size: 20000
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 200
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__@__recurrent_group_3__.w1"
+  size: 40000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 200
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__@__recurrent_group_3__.wbias"
+  size: 200
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 200
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "seq_input"
 input_layer_names: "sub_seq_input"
 output_layer_names: "__last_seq_0__"
@@ -488,6 +572,7 @@ output_layer_names: "__first_seq_0__"
 output_layer_names: "__last_seq_1__"
 output_layer_names: "__last_seq_2__"
 output_layer_names: "__last_seq_3__"
+output_layer_names: "__last_seq_4__"
 sub_models {
   name: "root"
   layer_names: "seq_input"
@@ -510,6 +595,9 @@ sub_models {
   layer_names: "__gru_group_0___recurrent_group"
   layer_names: "__gru_group_0__"
   layer_names: "__last_seq_3__"
+  layer_names: "__recurrent_group_3__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__last_seq_4__"
   input_layer_names: "seq_input"
   input_layer_names: "sub_seq_input"
   output_layer_names: "__last_seq_0__"
@@ -517,6 +605,7 @@ sub_models {
   output_layer_names: "__last_seq_1__"
   output_layer_names: "__last_seq_2__"
   output_layer_names: "__last_seq_3__"
+  output_layer_names: "__last_seq_4__"
   is_recurrent_layer_group: false
 }
 sub_models {
@@ -647,4 +736,28 @@ sub_models {
   }
   target_inlinkid: -1
 }
+sub_models {
+  name: "__recurrent_group_3__"
+  layer_names: "seq_input@__recurrent_group_3__"
+  layer_names: "__memory_6__@__recurrent_group_3__"
+  layer_names: "__fc_layer_0__@__recurrent_group_3__"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__fc_layer_0__@__recurrent_group_3__"
+    link_name: "__memory_6__@__recurrent_group_3__"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "seq_input"
+    link_name: "seq_input@__recurrent_group_3__"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__fc_layer_0__@__recurrent_group_3__"
+    link_name: "__fc_layer_0__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
 
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..91284b4fb32fcfdbf6b9e7384ffe080574b78821
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data1"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "data2"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__seqconcat_0__"
+  type: "seqconcat"
+  size: 30
+  active_type: ""
+  inputs {
+    input_layer_name: "data1"
+  }
+  inputs {
+    input_layer_name: "data2"
+  }
+}
+layers {
+  name: "__seqreshape_0__"
+  type: "seqreshape"
+  size: 5
+  active_type: "linear"
+  inputs {
+    input_layer_name: "data1"
+  }
+}
+input_layer_names: "data1"
+input_layer_names: "data2"
+output_layer_names: "__seqconcat_0__"
+output_layer_names: "__seqreshape_0__"
+sub_models {
+  name: "root"
+  layer_names: "data1"
+  layer_names: "data2"
+  layer_names: "__seqconcat_0__"
+  layer_names: "__seqreshape_0__"
+  input_layer_names: "data1"
+  input_layer_names: "data2"
+  output_layer_names: "__seqconcat_0__"
+  output_layer_names: "__seqreshape_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..4aa041ea2e173a6cc2ab21e3c9ea703601929cde
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_smooth_l1.protostr
@@ -0,0 +1,40 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__smooth_l1_cost_0__"
+  type: "smooth_l1"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+input_layer_names: "input"
+input_layer_names: "label"
+output_layer_names: "__smooth_l1_cost_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "label"
+  layer_names: "__smooth_l1_cost_0__"
+  input_layer_names: "input"
+  input_layer_names: "label"
+  output_layer_names: "__smooth_l1_cost_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
index 1cfb92255aa92fa3fbc16a816851a5c2f81c2b56..569b0b945a762e8b596e197adc06df64e33311af 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
@@ -19,7 +19,7 @@ model_config {
 data_config {
   type: "py2"
   files: "train.list"
-  async_load_data: true
+  async_load_data: false
   for_test: false
   load_data_module: "a"
   load_data_object: "c"
@@ -58,7 +58,7 @@ opt_config {
 test_data_config {
   type: "py2"
   files: "test.list"
-  async_load_data: true
+  async_load_data: false
   for_test: true
   load_data_module: "b"
   load_data_object: "d"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index a37eb6439e6d2803a417883f0aed2a5d56d059b9..c8a3b190b19148ddb701020f5be55c4c29a17079 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -2,16 +2,18 @@
 cd `dirname $0`
 
 set -e
+PYTHON_EXEC=$1
+COMPARE_PROTO_UTIL=$2
 
 protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh $1
+./generate_protostr.sh ${PYTHON_EXEC}
 
 . ./file_list.sh
 
-if [ -z $1 ]; then
+if [ -z "${COMPARE_PROTO_UTIL}" ]; then
   for file in $files
   do
       base_protostr=$protostr/$file
@@ -22,20 +24,20 @@ if [ -z $1 ]; then
 else
   for file in ${configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
 
   for file in ${whole_configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19bb9685aa24c4d66e4f0bbbcb004507413dbe8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
@@ -0,0 +1,40 @@
+from paddle.trainer_config_helpers import *
+
+settings(learning_rate=1e-4, batch_size=1000)
+
+data_1 = data_layer(name='data_a', size=100)
+data_2 = data_layer(name='data_b', size=100)
+
+mixed_param = ParamAttr(name='mixed_param')
+
+gru_param = ParamAttr(name='gru_param')
+gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)
+
+gru1 = simple_gru(
+    input=data_1,
+    size=200,
+    mixed_param_attr=mixed_param,
+    mixed_bias_param_attr=False,
+    gru_bias_attr=gru_bias,
+    gru_param_attr=gru_param)
+
+gru2 = simple_gru(
+    input=data_2,
+    size=200,
+    mixed_param_attr=mixed_param,
+    mixed_bias_param_attr=False,
+    gru_bias_attr=gru_bias,
+    gru_param_attr=gru_param)
+
+softmax_param = ParamAttr(name='softmax_param')
+
+predict = fc_layer(
+    input=[last_seq(input=gru1), last_seq(input=gru2)],
+    size=10,
+    param_attr=[softmax_param, softmax_param],
+    bias_attr=False,
+    act=SoftmaxActivation())
+outputs(
+    classification_cost(
+        input=predict, label=data_layer(
+            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
index 18ff6b48c495b7a9d61595916ade1a54b1fa6a10..d2a3b702a1d7b650947b344e4719098f68d4dd73 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
@@ -40,4 +40,6 @@ outputs(
             name='huber_label', size=1)),
     multi_binary_label_cross_entropy(
         input=probs, label=xe_label),
-    sum_cost(input=hidden))
+    sum_cost(input=hidden),
+    nce_layer(
+        input=hidden, label=labels))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index d30f70a55c5b1834074966dfb3f378e01447c8ab..c369062930e2b067ceab0dc3b25ba6c1eabe2450 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -10,5 +10,10 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
 outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
-    regression_cost(
-        input=fc, label=lbl, weight=wt))
+    mse_cost(
+        input=fc, label=lbl, weight=wt),
+    nce_layer(
+        input=fc,
+        label=data_layer(
+            name='multi_class_label', size=500),
+        weight=wt))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
new file mode 100644
index 0000000000000000000000000000000000000000..491e8c8caab38eb7c24e5461107ab5a9d63b12ef
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -0,0 +1,20 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2016, height=48, width=42)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=1,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+pool = img_pool_layer(input=conv, pool_size=2, stride=2, pool_type=MaxPooling())
+
+pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+
+outputs(pad)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
index 60b4849d69d497109ef5af3257e212df233a2d0b..91010759e4847f087eb4e05ad98ae794a2129365 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
@@ -16,6 +16,16 @@ def generate_rnn_simple(name):
     return rnn_simple
 
 
+def generate_rnn_simple_no_name():
+    def rnn_simple(s):
+        m = memory(name=None, size=200)
+        fc = fc_layer(input=[s, m], size=200)
+        m.set_input(fc)
+        return fc
+
+    return rnn_simple
+
+
 with mixed_layer() as lstm_param:  # test lstm unit, rnn group
     lstm_param += full_matrix_projection(input=seq, size=100 * 4)
 
@@ -33,4 +43,6 @@ outputs(
     last_seq(input=lstmemory_group(
         input=lstm_param, size=100)),
     last_seq(input=gru_group(
-        input=gru_param, size=100)))
+        input=gru_param, size=100)),
+    last_seq(input=recurrent_group(
+        step=generate_rnn_simple_no_name(), input=seq)), )
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c161ba805fb301e8feb8702ad61a8341df40e3f
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
@@ -0,0 +1,12 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din1 = data_layer(name='data1', size=30)
+din2 = data_layer(name='data2', size=30)
+
+opts = []
+opts.append(seq_concat_layer(a=din1, b=din2))
+opts.append(seq_reshape_layer(input=din1, reshape_size=5))
+
+outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
new file mode 100644
index 0000000000000000000000000000000000000000..66629662dd9166766daaf707409b720f56ef1405
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_smooth_l1.py
@@ -0,0 +1,7 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+lbl = data_layer(name='label', size=300)
+smooth_l1 = smooth_l1_cost(input=data, label=lbl)
+
+outputs(smooth_l1)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
index ae275735aa2b852b3b226a4a0e5b2d4d000ba199..e6cd35ee761d1acd0b5c1943554c7ea1de6a13f5 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -39,6 +39,7 @@ z1 = mixed_layer(
 assert z1.size > 0
 
 y2 = fc_layer(input=y, size=15)
+z2 = rotate_layer(input=y2, height=5, width=3)
 
 cos1 = cos_sim(a=x1, b=y1)
 cos3 = cos_sim(a=x1, b=y2, size=3)
@@ -46,7 +47,7 @@ cos3 = cos_sim(a=x1, b=y2, size=3)
 linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
 
 out = fc_layer(
-    input=[cos1, cos3, linear_comb, z, z1],
+    input=[cos1, cos3, linear_comb, z, z1, z2],
     size=num_classes,
     act=SoftmaxActivation())
 
diff --git a/python/paddle/utils/dump_config.py b/python/paddle/utils/dump_config.py
index 73bf349c46726163d664c374aa47598871b90106..d27af7f76246a4c9db9a43c17715506d82031b9c 100644
--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
@@ -20,6 +20,7 @@ __all__ = []
 
 if __name__ == '__main__':
     whole_conf = False
+    binary = False
     if len(sys.argv) == 2:
         conf = parse_config(sys.argv[1], '')
     elif len(sys.argv) == 3:
@@ -28,6 +29,8 @@ if __name__ == '__main__':
         conf = parse_config(sys.argv[1], sys.argv[2])
         if sys.argv[3] == '--whole':
             whole_conf = True
+        elif sys.argv[3] == '--binary':
+            binary = True
     else:
         raise RuntimeError()
 
@@ -36,4 +39,7 @@ if __name__ == '__main__':
     if whole_conf:
         print conf
     else:
-        print conf.model_config
+        if binary:
+            sys.stdout.write(conf.model_config.SerializeToString())
+        else:
+            print conf.model_config
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..851fe7060fd52120603ebabb4069d67471aa05d0
--- /dev/null
+++ b/python/paddle/v2/__init__.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import optimizer
+import layer
+import activation
+import parameters
+import trainer
+import event
+import data_type
+import topology
+import data_feeder
+import networks
+import evaluator
+from . import dataset
+from . import reader
+from . import plot
+import attr
+import pooling
+import inference
+import networks
+import py_paddle.swig_paddle as api
+import minibatch
+import plot
+import image
+
+__all__ = [
+    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
+    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
+    'topology', 'networks', 'infer', 'plot', 'evaluator', 'image'
+]
+
+
+def init(**kwargs):
+    """Initialize Paddle with flags from PADDLE_INIT_* env vars and kwargs.
+
+    Each PADDLE_INIT_<NAME> variable becomes flag <name>; kwargs override it.
+    Every flag is passed to api.initPaddle as '--<name>=<value>'.
+    """
+    prefix = "PADDLE_INIT_"
+    args_dict = {}
+    for ek, ev in os.environ.iteritems():
+        if ek.startswith(prefix):
+            # Strip only the leading prefix; keep any later occurrence.
+            args_dict[ek[len(prefix):].lower()] = str(ev)
+    args_dict.update(kwargs)  # explicit kwargs take precedence over ENV
+    api.initPaddle(*['--%s=%s' % (k, str(v)) for k, v in args_dict.items()])
+
+
+infer = inference.infer
+batch = minibatch.batch
diff --git a/python/paddle/v2/activation.py b/python/paddle/v2/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..21261a178203b633ca6cf59a5fc89edc24a868b9
--- /dev/null
+++ b/python/paddle/v2/activation.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.activations
+import copy
+
+__all__ = []
+
+suffix = 'Activation'
+for act in paddle.trainer_config_helpers.activations.__all__:
+    new_name = act[:-len(suffix)]
+    globals()[new_name] = copy.copy(
+        getattr(paddle.trainer_config_helpers.activations, act))
+    globals()[new_name].__name__ = new_name
+    __all__.append(new_name)
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
new file mode 100644
index 0000000000000000000000000000000000000000..32f78614e7f8abe7cffdc7a50a9fa77f1fc1a780
--- /dev/null
+++ b/python/paddle/v2/attr.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.attrs
+
+__all__ = [
+    "Param",
+    "Extra",
+]
+
+Param = paddle.trainer_config_helpers.attrs.ParameterAttribute
+Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute
+
+for each in paddle.trainer_config_helpers.attrs.__all__:
+    globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
+    __all__.append(each)
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..acda778e0aee1a8339ad6bd0d719868151d4fabe
--- /dev/null
+++ b/python/paddle/v2/config_base.py
@@ -0,0 +1,219 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import re
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+import paddle.trainer_config_helpers as conf_helps
+from topology import Topology
+
+
+class LayerType(type):
+    """Metaclass that fills __doc__ from the v1 helper named METHOD_NAME."""
+
+    def __new__(cls, name, bases, attrs):
+        method_name = attrs.get('METHOD_NAME', None)
+        if method_name is not None:
+            method = getattr(conf_helps, method_name)
+            if method.__doc__ is not None:
+                v1_doc = method.__doc__
+                mapper = attrs.get("__map_docstr__", None)
+                if mapper is not None:
+                    v1_doc = mapper(v1_doc)  # class-specific pre-processing
+                attrs['__doc__'] = LayerType.__map_docstr__(
+                    v1_doc, method_name=method_name, name=name)
+        return super(LayerType, cls).__new__(cls, name, bases, attrs)
+
+    @staticmethod
+    def __map_docstr__(doc, name, method_name):
+        """Rewrite v1 names found in doc to their paddle.v2 spelling."""
+        assert isinstance(doc, basestring)
+
+        # replace LayerOutput to paddle.v2.config_base.Layer
+        doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
+
+        doc = doc.replace('ParameterAttribute',
+                          'paddle.v2.attr.ParameterAttribute')
+
+        # paddle.v2.attr exposes this class as Extra; keep the next char intact
+        doc = doc.replace('ExtraLayerAttribute', 'paddle.v2.attr.Extra')
+
+        # xxx_layer to xxx
+        doc = re.sub(r"(?P<name>[a-z]+)_layer", r"\g<name>", doc)
+
+        # XxxxActivation to paddle.v2.activation.Xxxx (module is lower case)
+        doc = re.sub(r"(?P<name>[A-Z][a-zA-Z]+)Activation",
+                     r"paddle.v2.activation.\g<name>", doc)
+
+        # TODO(yuyang18): Add more rules if needed.
+        return doc
+
+
+class Layer(object):
+    __metaclass__ = LayerType
+
+    def __init__(self, name=None, parent_layers=None):
+        assert isinstance(parent_layers, dict)
+        self.name = name
+        self.__context__ = {}
+        self.__parent_layers__ = parent_layers
+        # extra parents are parsed before this layer but are not passed to it
+        self.__extra_parent__ = []
+        # (layer, parent_names) pairs added by append_child; used for evaluator.
+        self.__children_layers__ = []
+
+    def extra_parent(self):
+        return self.__extra_parent__
+
+    def append_extra_parent(self, parent):
+        self.__extra_parent__.append(parent)
+
+    def append_child(self, layer, parent_names):
+        self.__children_layers__.append((layer, parent_names))
+
+    def to_proto(self, context):
+        """
+        Parse this layer into its v1 result, caching the result in context.
+        """
+        self.__context__ = context
+
+        # STEP: short cut if this layer has been parsed before.
+        if self.context_name() in context:
+            if self.use_context_name():
+                return context[self.context_name()]
+            else:
+                return context[self.name]
+
+        # STEP: parse extra_parent that is not used by this layer but must
+        # be parsed before this layer.
+        for p in self.__extra_parent__:
+            p.to_proto(context=context)
+
+        # STEP: parse parents used by this layer, collect the results and
+        # pass them as kwargs to this layer's to_proto_impl method.
+        kwargs = dict()
+        for layer_name in self.__parent_layers__:
+            if not isinstance(self.__parent_layers__[layer_name],
+                              collections.Sequence):
+                v1_layer = self.__parent_layers__[layer_name].to_proto(
+                    context=context)
+            else:
+                v1_layer = map(lambda x: x.to_proto(context=context),
+                               self.__parent_layers__[layer_name])
+            kwargs[layer_name] = v1_layer
+
+        # STEP: parse myself and add myself into context.
+        ret_val = self.to_proto_impl(**kwargs)
+        if self.context_name() is not None \
+                and self.context_name() not in context:
+            context[self.context_name()] = ret_val
+
+        # STEP: parse children that should be parsed after this layer.
+        for layer, pnames in self.__children_layers__:
+            drop = False
+
+            # child will only be parsed if all parents are in context.
+            for pname in pnames:
+                if pname not in context:
+                    drop = True
+                    break
+            if drop:
+                continue
+            layer.to_proto(context=context)
+
+        # STEP: return v1 layer result
+        if self.context_name() is None:
+            return ret_val
+        elif self.use_context_name():
+            return context[self.context_name()]
+        else:
+            return context[self.name]
+
+    def to_proto_impl(self, **kwargs):
+        raise NotImplementedError()
+
+    def context_name(self):
+        """
+        Context name is the context key that stores the `to_proto_impl` result.
+        If multiple layers share the same context_name, the `to_proto_impl` of
+        them will be invoked only once.
+        """
+        return self.name
+
+    def use_context_name(self):
+        return False
+
+    def calculate_size(self):
+        """
+        Lazily calculate the size of the layer; should be called when
+        to_proto_impl of this layer is called.
+        :return: the `size` attribute of this layer's entry in the context.
+        """
+        return self.__context__[self.context_name()].size
+
+    def attr(self):
+        topo = Topology(self)
+        return topo.get_layer_proto(self.name)
+
+
+def __convert_to_v2__(method_name,
+                      parent_names,
+                      is_default_name=True,
+                      attach_parent=False):
+    """Build a v2 Layer class that wraps conf_helps.<method_name>.
+
+    parent_names: keyword arguments that hold parent layers.
+    attach_parent: register each instance as a child of all its parents.
+    """
+    wrapper = None
+    if is_default_name:
+        wrapper = wrap_name_default(name_prefix=method_name)
+
+    class V2LayerImpl(Layer):
+        METHOD_NAME = method_name
+
+        def __init__(self, **kwargs):
+            parent_layers = dict(
+                (p, kwargs[p]) for p in parent_names if p in kwargs)
+
+            if attach_parent:
+                # A parent argument is one layer or a sequence of layers;
+                # flatten first so context_name() is only called on layers.
+                parents = []
+                for value in parent_layers.values():
+                    if isinstance(value, collections.Sequence):
+                        parents.extend(value)
+                    else:
+                        parents.append(value)
+                pnames = [x.context_name() for x in parents]
+                for layer in parents:
+                    layer.append_child(self, pnames)
+
+            other_kwargs = dict(
+                (k, v) for k, v in kwargs.items() if k not in parent_names)
+
+            name = kwargs.get('name', None)
+            super(V2LayerImpl, self).__init__(name, parent_layers)
+            self.__other_kwargs__ = other_kwargs
+
+        if wrapper is not None:
+            __init__ = wrapper(__init__)
+
+        def to_proto_impl(self, **kwargs):
+            # parsed parents first, then the remaining constructor kwargs
+            args = dict(kwargs)
+            args.update(self.__other_kwargs__)
+            return getattr(conf_helps, method_name)(**args)
+
+    return V2LayerImpl
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..2698251b9e15046eb14f71c3f5b0546ecbb4a5dd
--- /dev/null
+++ b/python/paddle/v2/data_feeder.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from py_paddle import DataProviderConverter
+import collections
+import paddle.trainer.PyDataProvider2 as pydp2
+
+__all__ = ['DataFeeder']
+
+
+def default_feeding_map(data_types):
+    """
+    Map each data name (first item of an entry) to the entry's position.
+    """
+    return dict((entry[0], pos) for pos, entry in enumerate(data_types))
+
+
+class DataFeeder(DataProviderConverter):
+    """
+    DataFeeder converts the data returned by paddle.reader into the Arguments
+    data structure defined in the API. A paddle.reader usually returns a list
+    of mini-batch data entries; each entry in the list is one sample, and each
+    sample is a list or a tuple holding one or more features. DataFeeder
+    converts these mini-batch entries into Arguments so that they can be fed
+    to the C++ interface.
+
+    Basic usage, with the feature order given as a sequence of data names:
+
+    ..  code-block:: python
+
+        feeding = ['image', 'label']
+        data_types = enumerate_data_types_of_data_layers(topology)
+        feeder = DataFeeder(data_types=data_types, feeding=feeding)
+
+        minibatch_data = [([1.0, 2.0, 3.0, ...], 5)]
+
+        arg = feeder(minibatch_data)
+
+
+    If the mini-batch data and the data layers are not a one-to-one mapping,
+    pass a dictionary as the feeding parameter to describe the mapping from
+    data name to position inside a sample.
+
+
+    ..  code-block:: python
+
+        data_types = [('image', paddle.data_type.dense_vector(784)),
+                      ('label', paddle.data_type.integer_value(10))]
+        feeding = {'image':0, 'label':1}
+        feeder = DataFeeder(data_types=data_types, feeding=feeding)
+        minibatch_data = [
+                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
+                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
+                         ]
+        # or minibatch_data = [
+        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
+        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
+        #                     ]
+        arg = feeder.convert(minibatch_data)
+
+    ..  note::
+
+        This module is for internal use only. Users should use the `reader`
+        interface.
+
+
+
+    :param data_types: A list to specify data name and type. Each item is
+                       a tuple of (data_name, data_type).
+
+    :type data_types: list
+    :param feeding: A dictionary or a sequence to specify the position of each
+                    data in the input data. When None, each data name maps to
+                    its index in data_types.
+    :type feeding: dict|collections.Sequence|None
+    :raises TypeError: if feeding has another type, or if a data_type is not
+                       an InputType.
+    """
+
+    def __init__(self, data_types, feeding=None):
+        self.input_names = []
+        if feeding is None:
+            feeding = default_feeding_map(data_types)
+        elif isinstance(feeding, collections.Sequence):
+            # A sequence lists the data names in sample order.
+            feeding = dict((name, pos) for pos, name in enumerate(feeding))
+        elif not isinstance(feeding, dict):
+            raise TypeError("Feeding should be dict or sequence or None.")
+        self.feeding = feeding
+
+        # Collect names and input types in the order of data_types.
+        input_types = []
+        for entry in data_types:
+            self.input_names.append(entry[0])
+            data_type = entry[1]
+            if not isinstance(data_type, pydp2.InputType):
+                raise TypeError("second item in each data_type should be an "
+                                "InputType")
+            input_types.append(data_type)
+        DataProviderConverter.__init__(self, input_types)
+
+    def __len__(self):
+        """Number of input names, one per entry of data_types."""
+        return len(self.input_names)
+
+    def convert(self, dat, argument=None):
+        """
+        Reorder every sample to data-layer order, then convert the batch.
+
+        :param dat: A list of mini-batch data. Each sample is a list or tuple
+                    of one feature or multiple features.
+        :type dat: list
+        :param argument: An Arguments object contains this mini-batch data with
+                         one or multiple features. The Arguments definition is
+                         in the API.
+        :type argument: py_paddle.swig_paddle.Arguments
+        """
+
+        # For each input name, in order, take the sample item whose position
+        # is given by self.feeding.
+        reordered = [[sample[self.feeding[name]] for name in self.input_names]
+                     for sample in dat]
+        return DataProviderConverter.convert(self, reordered, argument)
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..d582f76ddf01ed3430a1d075624bbb8e0bf3f2a9
--- /dev/null
+++ b/python/paddle/v2/data_type.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.PyDataProvider2 as pydp2
+
+import_list = [  # pydp2 names not starting with '_', with 'value'/'vector'
+    nm for nm in dir(pydp2)
+    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm)
+]
+import_list.extend(['InputType'])
+# Re-export each selected name as an attribute of this module.
+for nm in import_list:
+    globals()[nm] = getattr(pydp2, nm)
+
+__all__ = import_list
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ff6295c34e853d8f69b9e78719af23a56d1fbb
--- /dev/null
+++ b/python/paddle/v2/dataset/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dataset package.
+"""
+
+import mnist
+import imikolov
+import imdb
+import cifar
+import movielens
+import conll05
+import uci_housing
+import sentiment
+import wmt14
+
+__all__ = [  # one entry per dataset submodule imported above
+    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment',
+    'uci_housing', 'wmt14'
+]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..81af0a8e66a44a3476206147684d81bcac1be372
--- /dev/null
+++ b/python/paddle/v2/dataset/cifar.py
@@ -0,0 +1,124 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset.
+
+This module will download dataset from
+https://www.cs.toronto.edu/~kriz/cifar.html and parse train/test set into
+paddle reader creators.
+
+The CIFAR-10 dataset consists of 60000 32x32 colour images in 10 classes,
+with 6000 images per class. There are 50000 training images and 10000 test
+images.
+
+The CIFAR-100 dataset is just like the CIFAR-10, except it has 100 classes
+containing 600 images each. There are 500 training images and 100 testing
+images per class.
+
+"""
+
+import cPickle
+import itertools
+import numpy
+from common import download
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10']
+# Download locations and expected MD5 checksums of the two CIFAR archives.
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    """Create a reader over the pickled batches named *sub_name* in a tar."""
+
+    def reader():
+        with tarfile.open(filename, mode='r') as archive:
+            # Walk the members in archive order, keeping the matching ones.
+            for member in archive:
+                if sub_name not in member.name:
+                    continue
+                batch = cPickle.load(archive.extractfile(member.name))
+                data = batch['data']
+                # Labels live under 'labels' or, failing that, 'fine_labels'.
+                labels = batch.get('labels', batch.get('fine_labels', None))
+                assert labels is not None
+                for pixels, label in itertools.izip(data, labels):
+                    # Divide by 255 and cast to float32.
+                    yield (pixels / 255.0).astype(numpy.float32), int(label)
+
+    return reader
+
+
+def train100():
+    """
+    Create the reader for the CIFAR-100 training set.
+
+    Each sample the reader yields is a pair: image pixels in [0, 1] and a
+    label in [0, 99].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    archive = download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
+    return reader_creator(archive, 'train')
+
+
+def test100():
+    """
+    CIFAR-100 test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels in
+    [0, 1] and label in [0, 99].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
+
+
+def train10():
+    """
+    Create the reader for the CIFAR-10 training set.
+
+    Each sample the reader yields is a pair: image pixels in [0, 1] and a
+    label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    archive = download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    return reader_creator(archive, 'data_batch')
+
+
+def test10():
+    """
+    Create the reader for the CIFAR-10 test set.
+
+    Each sample the reader yields is a pair: image pixels in [0, 1] and a
+    label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    archive = download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    return reader_creator(archive, 'test_batch')
+
+
+def fetch():  # download both CIFAR archives unless already cached
+    for url, md5 in ((CIFAR10_URL, CIFAR10_MD5), (CIFAR100_URL, CIFAR100_MD5)):
+        download(url, 'cifar', md5)
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..2eb018b8d60e9a8bd0091836ab56c35b05786fca
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,76 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import shutil
+import sys
+import importlib
+import paddle.v2.dataset
+
+__all__ = ['DATA_HOME', 'download', 'md5file']
+# Root directory of the on-disk dataset cache.
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+# Create the cache directory at import time if it does not exist yet.
+if not os.path.exists(DATA_HOME):
+    os.makedirs(DATA_HOME)
+
+
+def md5file(fname):
+    """Return the hex MD5 digest of file `fname`, read in 4096-byte chunks."""
+    hash_md5 = hashlib.md5()
+    with open(fname, "rb") as f:  # closed even if reading raises
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum):
+    """Fetch url into DATA_HOME/module_name unless cached; return the path."""
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    filename = os.path.join(dirname, url.split('/')[-1])
+    if not (os.path.exists(filename) and md5file(filename) == md5sum):
+        print "Cache file %s not found, downloading %s" % (filename, url)
+        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+        # The payload is raw bytes, so the file is opened in binary mode.
+        with open(filename, 'wb') as f:
+            if total_length is None:
+                shutil.copyfileobj(r.raw, f)
+            else:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()
+
+    return filename
+
+
+def fetch_all():
+    """Call fetch() of every paddle.v2.dataset attribute that defines one."""
+    for module_name in dir(paddle.v2.dataset):
+        if module_name.startswith("__"):
+            continue
+        module = importlib.import_module("paddle.v2.dataset.%s" % module_name)
+        if "fetch" in dir(module):
+            module.fetch()
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..12d648bf6557ed6e437320e56a80294abac29f18
--- /dev/null
+++ b/python/paddle/v2/dataset/conll05.py
@@ -0,0 +1,223 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Conll05 dataset.
+Paddle semantic role labeling Book and demo use this dataset as an example.
+Because Conll05 is not free in public, the default downloaded URL is test set
+of Conll05 (which is public). Users can change URL and MD5 to their Conll
+dataset. And a pre-trained word vector model based on Wikipedia corpus is used
+to initialize SRL model.
+"""
+
+import tarfile
+import gzip
+import itertools
+from common import download
+
+__all__ = ['test', 'get_dict', 'get_embedding']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'  # checksum passed to download()
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+# Index used for words that are missing from the word dictionary.
+UNK_IDX = 0
+
+
+def load_dict(filename):
+    """Map every stripped line of `filename` to its zero-based line number."""
+    with open(filename, 'r') as f:
+        # A repeated entry keeps the number of its last occurrence.
+        mapping = dict((line.strip(), idx) for idx, line in enumerate(f))
+    return mapping
+
+
+def _bracket_to_bio(lbl):
+    """Convert one column of bracketed tags into a B-/I-/O tag list."""
+    cur_tag = 'O'
+    is_in_bracket = False
+    lbl_seq = []
+    for l in lbl:
+        if l == '*':
+            lbl_seq.append('I-' + cur_tag if is_in_bracket else 'O')
+        elif l == '*)':
+            lbl_seq.append('I-' + cur_tag)
+            is_in_bracket = False
+        elif l.find('(') != -1:
+            # '(TAG*' opens a span; '(TAG*)' opens and closes it at once.
+            cur_tag = l[1:l.find('*')]
+            lbl_seq.append('B-' + cur_tag)
+            is_in_bracket = l.find(')') == -1
+        else:
+            raise RuntimeError('Unexpected label: %s' % l)
+    return lbl_seq
+
+
+def corpus_reader(data_path, words_name, props_name):
+    """
+    Create a reader over one corpus stored in the tar archive data_path.
+
+    Each element the reader yields is a tuple (sentence, predicate, labels):
+    the list of words of a sentence, one predicate of that sentence and the
+    B-/I-/O label sequence that belongs to this predicate.
+
+    :param data_path: path of the tar archive.
+    :param words_name: archive member holding the gzipped words file.
+    :param props_name: archive member holding the gzipped props file.
+    :return: a reader creator.
+    :rtype: callable
+    """
+
+    def reader():
+        tf = tarfile.open(data_path)
+        wf = tf.extractfile(words_name)
+        pf = tf.extractfile(props_name)
+        try:
+            with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                    fileobj=pf) as props_file:
+                sentences = []
+                one_seg = []
+                for word, label in itertools.izip(words_file, props_file):
+                    word = word.strip()
+                    label = label.strip().split()
+                    if len(label) != 0:
+                        sentences.append(word)
+                        one_seg.append(label)
+                        continue
+                    # Empty props line: end of sentence. Regroup the
+                    # collected rows into one list per column.
+                    labels = [[x[i] for x in one_seg]
+                              for i in xrange(len(one_seg[0]))]
+                    if len(labels) >= 1:
+                        # Column 0 yields the predicates once '-' entries
+                        # are dropped; each later column tags one of them.
+                        verb_list = [x for x in labels[0] if x != '-']
+                        for i, lbl in enumerate(labels[1:]):
+                            lbl_seq = _bracket_to_bio(lbl)
+                            yield sentences, verb_list[i], lbl_seq
+                    sentences = []
+                    one_seg = []
+        finally:
+            # Also reached when the consumer abandons the generator early.
+            pf.close()
+            wf.close()
+            tf.close()
+
+    return reader
+
+
+def reader_creator(corpus_reader,
+                   word_dict=None,
+                   predicate_dict=None,
+                   label_dict=None):
+    """
+    Turn the samples of corpus_reader into sequences of dictionary indices.
+
+    :param corpus_reader: callable returning an iterable of
+                          (sentence, predicate, labels) tuples.
+    :param word_dict: maps a word to its index; unknown words get UNK_IDX.
+    :param predicate_dict: maps a predicate to its index.
+    :param label_dict: maps a label to its index.
+    :return: a reader creator; every sample is the 9-tuple (word indices,
+             context -2, context -1, context 0, context +1, context +2,
+             predicate indices, context marks, label indices).
+    :rtype: callable
+    """
+
+    def reader():
+        for sentence, predicate, labels in corpus_reader():
+            sen_len = len(sentence)
+            n_labels = len(labels)
+            verb_index = labels.index('B-V')
+
+            # Collect the words at offsets -2..+2 around the predicate and
+            # flag their positions in mark; positions outside the sentence
+            # are replaced by the padding tokens 'bos' / 'eos'.
+            mark = [0] * n_labels
+            window = []
+            for offset in (-2, -1, 0, 1, 2):
+                pos = verb_index + offset
+                if 0 <= pos < n_labels:
+                    mark[pos] = 1
+                    window.append(sentence[pos])
+                elif offset < 0:
+                    window.append('bos')
+                else:
+                    window.append('eos')
+
+            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+            # Each context word index is repeated once per sentence word.
+            (ctx_n2_idx, ctx_n1_idx, ctx_0_idx, ctx_p1_idx,
+             ctx_p2_idx) = [[word_dict.get(w, UNK_IDX)] * sen_len
+                            for w in window]
+
+            # dict.get without a default: unknown entries become None.
+            pred_idx = [predicate_dict.get(predicate)] * sen_len
+            label_idx = [label_dict.get(w) for w in labels]
+
+            yield (word_idx, ctx_n2_idx, ctx_n1_idx, ctx_0_idx, ctx_p1_idx,
+                   ctx_p2_idx, pred_idx, mark, label_idx)
+
+    return reader
+
+
+def get_dict():
+    """
+    Load the word, verb and label dictionaries, in this order.
+    """
+    sources = ((WORDDICT_URL, WORDDICT_MD5), (VERBDICT_URL, VERBDICT_MD5),
+               (TRGDICT_URL, TRGDICT_MD5))
+    loaded = [load_dict(download(u, 'conll05st', m)) for u, m in sources]
+    return tuple(loaded)
+
+
+def get_embedding():
+    """Return the local path of the trained word-vector file."""
+    # download() returns the path of the file inside the dataset cache.
+    emb_path = download(EMB_URL, 'conll05st', EMB_MD5)
+    return emb_path
+
+
+def test():
+    """
+    Build the reader creator for the Conll05 test set.
+
+    The training set is not freely available, so this test set doubles as
+    training data. Every sample holds nine features: the sentence sequence,
+    five predicate-context sequences, the predicate, the context flags and
+    the tagged sequence.
+
+    :return: reader creator over the test set
+    :rtype: callable
+    """
+    dicts = get_dict()
+    corpus = corpus_reader(
+        download(DATA_URL, 'conll05st', DATA_MD5),
+        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+    return reader_creator(corpus, *dicts)
+
+
+def fetch():  # download every conll05 file unless already cached
+    targets = [(WORDDICT_URL, WORDDICT_MD5), (VERBDICT_URL, VERBDICT_MD5),
+               (TRGDICT_URL, TRGDICT_MD5), (EMB_URL, EMB_MD5),
+               (DATA_URL, DATA_MD5)]
+    for url, md5 in targets:
+        download(url, 'conll05st', md5)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc5abfe53d90ec3adc9a27a49ed086953146497
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset.
+
+This module downloads IMDB dataset from
+http://ai.stanford.edu/%7Eamaas/data/sentiment/. This dataset contains a set
+of 25,000 highly polar movie reviews for training, and 25,000 for testing.
+Besides, this module also provides API for building dictionary.
+"""
+
+import paddle.v2.dataset.common
+import collections
+import tarfile
+import Queue
+import re
+import string
+import threading
+
+__all__ = ['build_dict', 'train', 'test']
+# Download location and expected MD5 checksum of the IMDB review archive.
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+def tokenize(pattern):
+    """
+    Yield the token list of each archive member whose name matches pattern.
+
+    :param pattern: compiled regular expression tested against member names.
+    :return: generator of word lists, one list per matching member.
+    """
+    archive = paddle.v2.dataset.common.download(URL, 'imdb', MD5)
+    with tarfile.open(archive) as tarf:
+        # Step through the members with tarfile.next(), i.e. sequentially in
+        # archive order, until it returns None.
+        for tf in iter(tarf.next, None):
+            if not pattern.match(tf.name):
+                continue
+            # Strip trailing newlines, delete punctuation, lower-case and
+            # split on whitespace.
+            text = tarf.extractfile(tf).read().rstrip("\n\r")
+            yield text.translate(None, string.punctuation).lower().split()
+
+
+def build_dict(pattern, cutoff):
+    """
+    Build a word dictionary from the documents matching pattern.
+
+    Words occurring more than cutoff times get zero-based IDs ordered by
+    descending frequency (ties by the word itself); '<unk>' gets the last ID.
+    """
+    word_freq = collections.defaultdict(int)
+    for doc in tokenize(pattern):
+        for word in doc:
+            word_freq[word] += 1
+
+    frequent = [item for item in word_freq.items() if item[1] > cutoff]
+    frequent.sort(key=lambda item: (-item[1], item[0]))
+    words, _ = list(zip(*frequent))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+    UNK = word_idx['<unk>']
+    # qs[0] buffers positive documents, qs[1] negative ones (bounded queues).
+    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
+
+    def load(pattern, queue):
+        for doc in tokenize(pattern):
+            queue.put(doc)
+        queue.put(None)  # None marks the end of this stream
+
+    def reader():
+        # Create two daemon threads that load positive and negative samples
+        # into qs.
+        t0 = threading.Thread(
+            target=load, args=(
+                pos_pattern,
+                qs[0], ))
+        t0.daemon = True
+        t0.start()
+
+        t1 = threading.Thread(
+            target=load, args=(
+                neg_pattern,
+                qs[1], ))
+        t1.daemon = True
+        t1.start()
+
+        # Read alternately from qs[0] and qs[1]; the label is the queue index.
+        i = 0
+        doc = qs[i].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            i += 1
+            doc = qs[i % 2].get()
+
+        # One queue has ended: switch to the other one and drain it.
+        i += 1
+        doc = qs[i % 2].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            doc = qs[i % 2].get()
+    # NOTE(review): returns the generator reader(), not the reader function.
+    return reader()
+
+
+def train(word_idx):
+    """
+    IMDB training set creator.
+
+    Each sample is a zero-based word ID sequence paired with a label in
+    [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: generator over the training samples
+    :rtype: generator
+    """
+    pos_pattern = re.compile("aclImdb/train/pos/.*\.txt$")
+    neg_pattern = re.compile("aclImdb/train/neg/.*\.txt$")
+    return reader_creator(pos_pattern, neg_pattern, word_idx, 1000)
+
+
+def test(word_idx):
+    """
+    IMDB test set creator.
+
+    Each sample is a zero-based word ID sequence paired with a label in
+    [0, 1].
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :return: generator over the test samples
+    :rtype: generator
+    """
+    pos_pattern = re.compile("aclImdb/test/pos/.*\.txt$")
+    neg_pattern = re.compile("aclImdb/test/neg/.*\.txt$")
+    return reader_creator(pos_pattern, neg_pattern, word_idx, 1000)
+
+
+def word_dict():
+    """
+    Build the word dictionary over all train and test reviews (cutoff 150).
+
+    :return: Word dictionary
+    :rtype: dict
+    """
+    pattern = re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$")
+    return build_dict(pattern, 150)
+
+
+def fetch():  # make sure the IMDB archive is present in the dataset cache
+    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd3a4552d2e1a2b00dde5ddb7ac1d78445bdca51
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,148 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset.
+
+This module will download dataset from 
+http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
+into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import collections
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict']
+# Download location and expected MD5 checksum of the simple-examples archive.
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+class DataType(object):  # sample formats understood by reader_creator
+    NGRAM = 1  # samples are tuples of n consecutive word IDs
+    SEQ = 2  # samples are (source sequence, target sequence) pairs
+
+
+def word_count(f, word_freq=None):
+    """
+    Add the word counts of every line in f to word_freq and return it.
+    Each line also counts once for '<s>' and '<e>'. A new counter is
+    created when word_freq is None.
+    """
+    counts = collections.defaultdict(int) if word_freq is None else word_freq
+    for line in f:
+        for token in line.strip().split() + ['<s>', '<e>']:
+            counts[token] += 1
+    return counts
+
+
+def build_dict(min_word_freq=50):
+    """
+    Build a word dictionary from the train and valid files of the corpus.
+
+    Keys are the words seen more than min_word_freq times; values are
+    zero-based IDs ordered by descending frequency. '<unk>' gets the last ID.
+    :rtype: dict
+    """
+    archive = paddle.v2.dataset.common.download(
+        paddle.v2.dataset.imikolov.URL, 'imikolov',
+        paddle.v2.dataset.imikolov.MD5)
+    with tarfile.open(archive) as tf:
+        trainf = tf.extractfile('./simple-examples/data/ptb.train.txt')
+        testf = tf.extractfile('./simple-examples/data/ptb.valid.txt')
+        word_freq = word_count(trainf)
+        word_freq = word_count(testf, word_freq)
+        # Drop any counted '<unk>': it is re-added below with the last ID.
+        word_freq.pop('<unk>', None)
+
+        kept = [item for item in word_freq.items() if item[1] > min_word_freq]
+        kept.sort(key=lambda item: (-item[1], item[0]))
+        words, _ = list(zip(*kept))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+
+    return word_idx
+
+
+def reader_creator(filename, word_idx, n, data_type):
+    """Create a reader yielding n-grams or sequence pairs from filename."""
+    def reader():
+        archive = paddle.v2.dataset.common.download(
+            paddle.v2.dataset.imikolov.URL, 'imikolov',
+            paddle.v2.dataset.imikolov.MD5)
+        with tarfile.open(archive) as tf:
+            f = tf.extractfile(filename)
+            UNK = word_idx['<unk>']
+            for line in f:
+                tokens = line.strip().split()
+                if DataType.NGRAM == data_type:
+                    assert n > -1, 'Invalid gram length'
+                    tokens = ['<s>'] + tokens + ['<e>']
+                    if len(tokens) >= n:
+                        ids = [word_idx.get(w, UNK) for w in tokens]
+                        for end in range(n, len(ids) + 1):
+                            yield tuple(ids[end - n:end])
+                elif DataType.SEQ == data_type:
+                    ids = [word_idx.get(w, UNK) for w in tokens]
+                    src_seq = [word_idx['<s>']] + ids
+                    trg_seq = ids + [word_idx['<e>']]
+                    if not (n > 0 and len(src_seq) > n):
+                        yield src_seq, trg_seq
+                else:
+                    assert False, 'Unknow data type'
+
+    return reader
+
+
+def train(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov training set creator.
+
+    It returns a reader creator. Each sample is a tuple of n word IDs for
+    NGRAM, or a (source, target) pair of word ID lists for SEQ.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size if type is ngram, otherwise max length of sequence
+    :type n: int
+    :param data_type: data type (ngram or sequence)
+    :type data_type: member variable of DataType (NGRAM or SEQ)
+    :return: Training reader creator
+    :rtype: callable
+    """
+    filename = './simple-examples/data/ptb.train.txt'
+    return reader_creator(filename, word_idx, n, data_type)
+
+
+def test(word_idx, n, data_type=DataType.NGRAM):
+    """
+    imikolov test set creator.
+
+    It returns a reader creator. Each sample is a tuple of n word IDs for
+    NGRAM, or a (source, target) pair of word ID lists for SEQ.
+
+    :param word_idx: word dictionary
+    :type word_idx: dict
+    :param n: sliding window size if type is ngram, otherwise max length of sequence
+    :type n: int
+    :param data_type: data type (ngram or sequence)
+    :type data_type: member variable of DataType (NGRAM or SEQ)
+    :return: Test reader creator
+    :rtype: callable
+    """
+    filename = './simple-examples/data/ptb.valid.txt'
+    return reader_creator(filename, word_idx, n, data_type)
+
+
+def fetch():  # make sure the imikolov archive is present in the cache
+    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..435556b2921b7976bbc61160ce3812949981c9e7
--- /dev/null
+++ b/python/paddle/v2/dataset/mnist.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MNIST dataset.
+
+This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
+parse training set and test set into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import subprocess
+import numpy
+import platform
+__all__ = ['train', 'test']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    """Create a reader yielding (float32 pixels in [-1, 1], int label) pairs.
+
+    Both gzip files are streamed through an external zcat/gzcat process in
+    chunks of ``buffer_size`` samples; a trailing partial chunk is dropped.
+    """
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError('unsupported OS: ' + platform.system())
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        try:  # the consumer may stop iterating early
+            m.stdout.read(16)  # skip the 16-byte image file header
+            l.stdout.read(8)  # skip the 8-byte label file header
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+                images = images / 255.0 * 2.0 - 1.0  # [0, 255] -> [-1, 1]
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            for proc in (m, l):
+                proc.terminate()
+                proc.wait()  # reap the child; terminate() alone leaves a zombie
+
+    return reader
+
+
+def train():
+    """
+    MNIST training set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels
+    scaled to [-1, 1] and label in [0, 9].
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    download = paddle.v2.dataset.common.download
+    images = download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    labels = download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    # 100 is the number of samples decoded per read.
+    return reader_creator(images, labels, 100)
+
+
+def test():
+    """
+    MNIST test set creator.
+
+    It returns a reader creator, each sample in the reader is image pixels
+    scaled to [-1, 1] and label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    download = paddle.v2.dataset.common.download
+    images = download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    labels = download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
+    # 100 is the number of samples decoded per read.
+    return reader_creator(images, labels, 100)
+
+
+def fetch():  # pre-download all four archives, each checked with its own MD5
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..837a85912663826f0483aff4f6a38f3945375d82
--- /dev/null
+++ b/python/paddle/v2/dataset/movielens.py
@@ -0,0 +1,253 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Movielens 1-M dataset.
+
+Movielens 1-M dataset contains 1 million ratings from 6000 users on 4000
+movies, which was collected by GroupLens Research. This module will download
+Movielens 1-M dataset from 
+http://files.grouplens.org/datasets/movielens/ml-1m.zip and parse training
+set and test set into paddle reader creators.
+
+"""
+
+import zipfile
+from common import download
+import re
+import random
+import functools
+
+__all__ = [
+    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
+]
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
+
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
+
+
+class MovieInfo(object):
+    """
+    One movie record: integer id, title and categories.
+    """
+
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.title = title
+        self.categories = categories
+
+    def value(self):
+        """
+        Encode the movie as [id, category ids, title word ids].
+        """
+        # Both lookups use the module-level dictionaries.
+        category_ids = [CATEGORIES_DICT[name] for name in self.categories]
+        word_ids = [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
+        return [self.index, category_ids, word_ids]
+
+    def __str__(self):
+        fields = (self.index, self.title, self.categories)
+        return "<MovieInfo id(%d), title(%s), categories(%s)>" % fields
+
+    def __repr__(self):
+        return str(self)
+
+
+class UserInfo(object):
+    """
+    One user record: integer id, gender flag, age-group index and job id.
+    """
+
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = age_table.index(int(age))  # position in age_table
+        self.job_id = int(job_id)
+
+    def value(self):
+        """Encode as [id, 0 if male else 1, age index, job id]."""
+        gender_id = 0 if self.is_male else 1
+        # ``age`` is already the age_table position computed in __init__.
+        return [self.index, gender_id, self.age, self.job_id]
+
+    def __str__(self):
+        gender = "M" if self.is_male else "F"
+        fields = (self.index, gender, age_table[self.age], self.job_id)
+        return "<UserInfo id(%d), gender(%s), age(%d), job(%d)>" % fields
+
+    def __repr__(self):
+        return self.__str__()
+
+
+MOVIE_INFO = None
+MOVIE_TITLE_DICT = None
+CATEGORIES_DICT = None
+USER_INFO = None
+
+
+def __initialize_meta_info__():
+    """
+    Download the archive and, on the first call only, build the module-level
+    tables MOVIE_INFO, MOVIE_TITLE_DICT, CATEGORIES_DICT and USER_INFO.
+
+    :return: what ``download`` returned for the movielens archive
+    """
+    global MOVIE_INFO, MOVIE_TITLE_DICT, CATEGORIES_DICT, USER_INFO
+    fn = download(URL, "movielens", MD5)
+    if MOVIE_INFO is not None:
+        return fn
+
+    # A title must end in a parenthesized number; group(1) is the text before.
+    pattern = re.compile(r'^(.*)\((\d+)\)$')
+    movie_info = dict()
+    user_info = dict()
+    title_word_set = set()
+    categories_set = set()
+    with zipfile.ZipFile(file=fn) as package:
+        with package.open('ml-1m/movies.dat') as movie_file:
+            for line in movie_file:
+                movie_id, title, categories = line.strip().split('::')
+                categories = categories.split('|')
+                categories_set.update(categories)
+                title = pattern.match(title).group(1)
+                movie_info[int(movie_id)] = MovieInfo(
+                    index=movie_id, categories=categories, title=title)
+                title_word_set.update(w.lower() for w in title.split())
+
+        with package.open('ml-1m/users.dat') as user_file:
+            for line in user_file:
+                uid, gender, age, job, _ = line.strip().split("::")
+                user_info[int(uid)] = UserInfo(
+                    index=uid, gender=gender, age=age, job_id=job)
+
+    # Publish only after everything parsed, so a failure above cannot leave
+    # half-built tables behind; MOVIE_INFO doubles as the "done" flag.
+    MOVIE_TITLE_DICT = dict((w, i) for i, w in enumerate(title_word_set))
+    CATEGORIES_DICT = dict((c, i) for i, c in enumerate(categories_set))
+    MOVIE_INFO, USER_INFO = movie_info, user_info
+    return fn
+
+
+def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
+    """Yield user fields + movie fields + [[score]] for one data split."""
+    archive = __initialize_meta_info__()
+    rng = random.Random(x=rand_seed)
+    with zipfile.ZipFile(file=archive) as package:
+        with package.open('ml-1m/ratings.dat') as ratings_file:
+            for line in ratings_file:
+                # One random draw per line assigns it to train or test.
+                if (rng.random() < test_ratio) != is_test:
+                    continue
+                uid, mov_id, stars, _ = line.strip().split("::")
+                uid, mov_id = int(uid), int(mov_id)
+                score = float(stars) * 2 - 5.0
+                mov, usr = MOVIE_INFO[mov_id], USER_INFO[uid]
+                yield usr.value() + mov.value() + [[score]]
+
+
+def __reader_creator__(**kwargs):  # defer __reader__(**kwargs) until called
+    return lambda: __reader__(**kwargs)
+
+
+train = functools.partial(__reader_creator__, is_test=False)
+test = functools.partial(__reader_creator__, is_test=True)
+
+
+def get_movie_title_dict():
+    """
+    Get the dict that maps each lower-cased title word to its integer id.
+    """
+    __initialize_meta_info__()
+    return MOVIE_TITLE_DICT
+
+
+def __max_index_info__(a, b):
+    """
+    Reducer: keep whichever record has the larger ``index`` (``b`` on ties).
+    """
+    return a if a.index > b.index else b
+
+
+def max_movie_id():
+    """Get the maximum value of movie id."""
+    __initialize_meta_info__()
+    # Pairwise reduction keeps the record with the larger ``index``.
+    largest = reduce(__max_index_info__, MOVIE_INFO.viewvalues())
+    return largest.index
+
+
+def max_user_id():
+    """Get the maximum value of user id."""
+    __initialize_meta_info__()
+    # Pairwise reduction keeps the record with the larger ``index``.
+    largest = reduce(__max_index_info__, USER_INFO.viewvalues())
+    return largest.index
+
+
+def __max_job_id_impl__(a, b):
+    """
+    Reducer: keep whichever record has the larger ``job_id`` (``b`` on ties).
+    """
+    return a if a.job_id > b.job_id else b
+
+
+def max_job_id():
+    """Get the maximum value of job id."""
+    __initialize_meta_info__()
+    # Pairwise reduction keeps the record with the larger ``job_id``.
+    largest = reduce(__max_job_id_impl__, USER_INFO.viewvalues())
+    return largest.job_id
+
+
+def movie_categories():
+    """
+    Get the dict that maps each movie category name to its integer id.
+    """
+    __initialize_meta_info__()
+    return CATEGORIES_DICT
+
+
+def user_info():
+    """
+    Get the dict that maps each integer user id to its UserInfo.
+    """
+    __initialize_meta_info__()
+    return USER_INFO
+
+
+def movie_info():
+    """
+    Get the dict that maps each integer movie id to its MovieInfo.
+    """
+    __initialize_meta_info__()
+    return MOVIE_INFO
+
+
+def unittest():  # smoke test: iterate both splits once
+    for train_count, _ in enumerate(train()()):
+        pass
+    for test_count, _ in enumerate(test()()):
+        pass
+
+    print train_count, test_count  # last enumerate index, i.e. size - 1
+
+
+def fetch():  # only downloads the archive; nothing is parsed
+    download(URL, "movielens", MD5)
+
+
+if __name__ == '__main__':
+    unittest()
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..4dd34e7383fe2a290fcf61474914183a383e2b9c
--- /dev/null
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -0,0 +1,131 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script fetches and preprocesses the movie_reviews data set provided by NLTK
+
+TODO(yuyang18): Complete dataset.
+"""
+
+import collections
+from itertools import chain
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import common
+
+__all__ = ['train', 'test', 'get_word_dict']
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+def download_data_if_not_yet():
+    """
+    Download movie_reviews into DATA_HOME unless NLTK can already load it.
+    """
+    try:
+        # make sure that nltk can find the data
+        if common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(common.DATA_HOME)
+        movie_reviews.categories()  # probe; LookupError triggers the download
+    except LookupError:
+        print "Downloading movie_reviews data set, please wait....."
+        nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+        print "Download data set success....."
+        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+
+
+def get_word_dict():
+    """
+    Rank the lower-cased words of the corpus by how often they occur
+    :return:
+        words_freq_sorted: list of (word, rank); the most frequent has rank 0
+    """
+    download_data_if_not_yet()
+    word_freq_dict = collections.defaultdict(int)
+    for category in movie_reviews.categories():
+        for field in movie_reviews.fileids(category):
+            for word in movie_reviews.words(field):
+                # Count lower-cased forms: load_sentiment_data() looks words
+                # up via word.lower(), so mixed-case keys would be missed.
+                word_freq_dict[word.lower()] += 1
+
+    # Stable sort, descending by count; ties keep dict iteration order.
+    ranked = sorted(word_freq_dict.items(), key=lambda item: -item[1])
+    words_freq_sorted = [(item[0], rank) for rank, item in enumerate(ranked)]
+    return words_freq_sorted
+
+
+def sort_files():
+    """
+    Interleave negative and positive review files (neg, pos, neg, pos, ...)
+    :return:
+        files_list
+    """
+    neg_file_list = movie_reviews.fileids('neg')
+    pos_file_list = movie_reviews.fileids('pos')
+    pairs = zip(neg_file_list, pos_file_list)  # stops at the shorter list
+    # Flatten the (neg, pos) pairs into one alternating list.
+    return [file_id for pair in pairs for file_id in pair]
+
+
+def load_sentiment_data():
+    """
+    Encode every review as (list of word ids, label); label 0 = neg, 1 = pos
+    :return:
+        data_set
+    """
+    download_data_if_not_yet()
+    words_ids = dict(get_word_dict())
+    data_set = list()
+    for sample_file in sort_files():
+        # Any file id without 'neg' in it is labelled positive.
+        category = 0 if 'neg' in sample_file else 1
+        words = movie_reviews.words(sample_file)
+        words_list = [words_ids[word.lower()] for word in words]
+        data_set.append((words_list, category))
+    return data_set
+
+
+def reader_creator(data):
+    """
+    Iterate a data set, giving the first two fields of every sample.
+    Note: the result is the iterator itself, not a callable creating one.
+    :param data:
+        train data set or test data set
+    """
+    return ((sample[0], sample[1]) for sample in data)
+
+
+def train():
+    """
+    Return an iterator over the first NUM_TRAINING_INSTANCES samples
+    """
+    training_set = load_sentiment_data()[:NUM_TRAINING_INSTANCES]
+    return reader_creator(training_set)
+
+
+def test():
+    """
+    Return an iterator over the samples after the first NUM_TRAINING_INSTANCES
+    """
+    held_out = load_sentiment_data()[NUM_TRAINING_INSTANCES:]
+    return reader_creator(held_out)
+
+
+def fetch():  # always calls nltk.download; no check for an existing copy
+    nltk.download('movie_reviews', download_dir=common.DATA_HOME)
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/cifar_test.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.cifar
+import unittest
+
+
+class TestCIFAR(unittest.TestCase):
+    def check_reader(self, reader):
+        # Returns (sample count, largest label) and checks every image size.
+        count = 0
+        max_label = 0
+        for sample in reader():
+            self.assertEqual(sample[0].size, 3072)
+            max_label = max(max_label, sample[1])
+            count += 1
+        return count, max_label
+
+    def test_test10(self):
+        reader = paddle.v2.dataset.cifar.test10()
+        count, max_label = self.check_reader(reader)
+        self.assertEqual(count, 10000)
+        self.assertEqual(max_label, 9)
+
+    def test_train10(self):
+        reader = paddle.v2.dataset.cifar.train10()
+        count, max_label = self.check_reader(reader)
+        self.assertEqual(count, 50000)
+        self.assertEqual(max_label, 9)
+
+    def test_test100(self):
+        reader = paddle.v2.dataset.cifar.test100()
+        count, max_label = self.check_reader(reader)
+        self.assertEqual(count, 10000)
+        self.assertEqual(max_label, 99)
+
+    def test_train100(self):
+        reader = paddle.v2.dataset.cifar.train100()
+        count, max_label = self.check_reader(reader)
+        self.assertEqual(count, 50000)
+        self.assertEqual(max_label, 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5babcef0eb4345d243904877d323c37d4889a643
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        with tempfile.NamedTemporaryFile() as f:  # closed and removed on exit
+            f.write("Hello\n")
+            f.flush()  # md5file reads the path, so the bytes must be on disk
+            self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                             paddle.v2.dataset.common.md5file(f.name))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.imdb
+import unittest
+import re
+
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
+
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
+
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None  # vocabulary cache shared by every test method
+
+    @classmethod
+    def get_word_idx(cls):
+        # Stored on the class so that each test instance can reuse it.
+        if cls.word_idx is None:
+            cls.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                             150)
+        return cls.word_idx
+
+    def test_build_dict(self):
+        self.assertEqual(len(self.get_word_idx()), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        count = 0
+        for sample in dataset(self.get_word_idx()):
+            self.assertEqual(sample[1], count % 2)  # labels alternate 0, 1
+            count += 1
+        self.assertEqual(count, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e52810e6b924e0796e3d836dbbcb27ede2c9e25
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@@ -0,0 +1,53 @@
+import paddle.v2.dataset.imikolov
+import unittest
+
+WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        # Every sample of the reader must contain exactly n items.
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def check_first_sequence(self, creator, sentence):
+        """
+        The first SEQ sample's source sequence, without its leading '<s>' id,
+        must be the word ids of ``sentence``.
+        """
+        expected = [
+            WORD_DICT.get(ch, WORD_DICT['<unk>']) for ch in sentence.split(' ')
+        ]
+        reader = creator(
+            WORD_DICT, n=-1,
+            data_type=paddle.v2.dataset.imikolov.DataType.SEQ)
+        # next() fails loudly on an empty reader instead of leaving the
+        # compared value unbound.
+        first_sample = next(iter(reader()))
+        self.assertEqual(expected, first_sample[0][1:])
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
+        self.check_first_sequence(
+            paddle.v2.dataset.imikolov.train,
+            'aer banknote berlitz calloway centrust cluett fromstein '
+            'gitano guterman hydro-quebec ipo kia memotec mlx nahb punts '
+            'rake regatta rubens sim snack-food ssangyong swapo wachter')
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+        self.check_first_sequence(
+            paddle.v2.dataset.imikolov.test,
+            'consumers may want to move their telephones a little '
+            'closer to the tv set')
+
+    def test_total(self):
+        # The largest id must be len(WORD_DICT) - 1.
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mnist_test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        # Returns (sample count, largest label) and checks every image size.
+        count = 0
+        max_label = 0
+        for sample in reader():
+            self.assertEqual(sample[0].size, 784)
+            max_label = max(max_label, sample[1])
+            count += 1
+        return count, max_label
+
+    def test_train(self):
+        reader = paddle.v2.dataset.mnist.train()
+        count, max_label = self.check_reader(reader)
+        self.assertEqual(count, 60000)
+        self.assertEqual(max_label, 9)
+
+    def test_test(self):
+        reader = paddle.v2.dataset.mnist.test()
+        count, max_label = self.check_reader(reader)
+        self.assertEqual(count, 10000)
+        self.assertEqual(max_label, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+# /usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        self.assertEqual(word_dict, test_word_list)
+        # The data directory differs per user; compare with the real setting.
+        self.assertTrue(st.common.DATA_HOME in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)  # must alternate
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)  # labels must alternate
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..3469fd9ce12dd4d934004f90286979b73048a5c8
--- /dev/null
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UCI Housing dataset.
+
+This module will download dataset from
+https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
+parse training set and test set into paddle reader creators.
+"""
+
+import numpy as np
+import os
+from common import download
+
+__all__ = ['train', 'test']
+
+URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT'
+]
+
+UCI_TRAIN_DATA = None
+UCI_TEST_DATA = None
+
+
+def feature_range(maximums, minimums):  # bar chart of per-feature max - min
+    import matplotlib
+    matplotlib.use('Agg')  # choose the backend before pyplot is imported
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    feature_num = len(maximums)
+    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.set_title('feature scale')
+    plt.xticks(range(feature_num), feature_names)
+    plt.xlim([-1, feature_num])
+    fig.set_figheight(6)
+    fig.set_figwidth(10)
+    if not os.path.exists('./image'):
+        os.makedirs('./image')
+    fig.savefig('image/ranges.png', dpi=48)  # path is relative to the cwd
+    plt.close(fig)
+
+
+def load_data(filename, feature_num=14, ratio=0.8):
+    """Load the file once, normalize the features and fill the train/test
+    globals; the first ``ratio`` of the rows becomes the training set."""
+    global UCI_TRAIN_DATA, UCI_TEST_DATA
+    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
+        return
+    data = np.fromfile(filename, sep=' ')
+    # Floor division: reshape needs an int even where "/" is true division.
+    data = data.reshape(data.shape[0] // feature_num, feature_num)
+    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+        axis=0) / data.shape[0]
+    feature_range(maximums[:-1], minimums[:-1])
+    data[:, :-1] = (data[:, :-1] - avgs[:-1]) / (maximums - minimums)[:-1]
+    offset = int(data.shape[0] * ratio)
+    UCI_TRAIN_DATA, UCI_TEST_DATA = data[:offset], data[offset:]
+
+
+def train():
+    """
+    UCI_HOUSING training set creator.
+
+    It returns a reader creator; each sample is a pair of arrays: the
+    normalized features and the price (last column, length 1).
+
+    :return: Training reader creator
+    :rtype: callable
+    """
+    load_data(download(URL, 'uci_housing', MD5))
+
+    def reader():
+        # UCI_TRAIN_DATA is looked up when the reader runs, not at creation.
+        for row in UCI_TRAIN_DATA:
+            yield row[:-1], row[-1:]
+
+    return reader
+
+
+def test():
+    """
+    UCI_HOUSING test set creator.
+
+    It returns a reader creator; each sample is a pair of arrays: the
+    normalized features and the price (last column, length 1).
+
+    :return: Test reader creator
+    :rtype: callable
+    """
+    load_data(download(URL, 'uci_housing', MD5))
+
+    def reader():
+        # UCI_TEST_DATA is looked up when the reader runs, not at creation.
+        for row in UCI_TEST_DATA:
+            yield row[:-1], row[-1:]
+
+    return reader
+
+
+def fetch():  # only downloads the data file; nothing is parsed
+    download(URL, 'uci_housing', MD5)
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
new file mode 100644
index 0000000000000000000000000000000000000000..0902f87741c342b237439081703081b467dc6f35
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+WMT14 dataset.
+The original WMT14 dataset is too large, so a small subset of it is
+provided instead. This module will download the dataset from
+http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
+parse the training set and test set into paddle reader creators.
+
+"""
+import tarfile
+import gzip
+
+from paddle.v2.dataset.common import download
+from paddle.v2.parameters import Parameters
+
+__all__ = ['train', 'test', 'build_dict']
+
+URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# This is a small set of data for test. The original data is too large and will be added later.
+URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
+# this is the pretrained model, whose bleu = 26.92
+URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
+MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+def __read_to_dict__(tar_file, dict_size):
+    """
+    Return (src_dict, trg_dict): each maps the first `dict_size` stripped
+    lines of the archive's `*src.dict` / `*trg.dict` file to line numbers.
+    """
+
+    def __to_dict__(fd, size):
+        out_dict = dict()
+        for line_count, line in enumerate(fd):
+            if line_count >= size:
+                break
+            out_dict[line.strip()] = line_count
+        return out_dict
+
+    def __load__(archive, suffix):
+        # Exactly one member of the archive may end with `suffix`.
+        names = [item.name for item in archive if item.name.endswith(suffix)]
+        assert len(names) == 1
+        return __to_dict__(archive.extractfile(names[0]), dict_size)
+
+    with tarfile.open(tar_file, mode='r') as f:
+        src_dict = __load__(f, "src.dict")
+        trg_dict = __load__(f, "trg.dict")
+        return src_dict, trg_dict
+
+
+def reader_creator(tar_file, file_name, dict_size):
+    """Make a reader over archive members whose name ends with file_name."""
+
+    def reader():
+        src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+        with tarfile.open(tar_file, mode='r') as f:
+            members = [
+                member.name for member in f if member.name.endswith(file_name)
+            ]
+            for member_name in members:
+                for line in f.extractfile(member_name):
+                    fields = line.strip().split('\t')
+                    if len(fields) != 2:
+                        continue  # not a "source<TAB>target" line
+                    src_seq, trg_seq = fields
+                    # Source ids are wrapped in the START / END markers.
+                    src_ids = [
+                        src_dict.get(word, UNK_IDX)
+                        for word in [START] + src_seq.split() + [END]
+                    ]
+                    trg_ids = [
+                        trg_dict.get(word, UNK_IDX) for word in trg_seq.split()
+                    ]
+
+                    # remove sequence whose length > 80 in training mode
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [trg_dict[END]]
+                    trg_ids = [trg_dict[START]] + trg_ids
+
+                    yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(dict_size):
+    """
+    WMT14 training set creator.
+
+    Each sample yielded by the returned reader is a triple: source language
+    word IDs, target language word IDs and the next-word ID sequence.
+
+    :param dict_size: Number of leading dictionary entries to load.
+    :return: Training reader creator
+    :rtype: callable
+    """
+    tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    return reader_creator(tar_file, 'train/train', dict_size)
+
+
+def test(dict_size):
+    """
+    WMT14 test set creator.
+
+    Each sample yielded by the returned reader is a triple: source language
+    word IDs, target language word IDs and the next-word ID sequence.
+
+    :param dict_size: Number of leading dictionary entries to load.
+    :return: Test reader creator
+    :rtype: callable
+    """
+    tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    return reader_creator(tar_file, 'test/test', dict_size)
+
+
+def gen(dict_size):
+    tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)  # same archive as train
+    return reader_creator(tar_file, 'gen/gen', dict_size)
+
+
+def model():
+    """Download the pretrained model and load it with Parameters.from_tar."""
+    model_path = download(URL_MODEL, 'wmt14', MD5_MODEL)
+    with gzip.open(model_path, 'r') as gz_file:
+        return Parameters.from_tar(gz_file)
+
+
+def get_dict(dict_size, reverse=True):
+    """Return (src_dict, trg_dict), mapping id -> word when `reverse`."""
+    # reverse=False: {'a': 0, 'b': 1, ...}
+    # reverse=True:  {0: 'a', 1: 'b', ...}
+    tar_file = download(URL_TRAIN, 'wmt14', MD5_TRAIN)
+    dicts = __read_to_dict__(tar_file, dict_size)
+    if reverse:
+        dicts = tuple({idx: word for word, idx in d.items()} for d in dicts)
+    return dicts
+
+
+def fetch():
+    for url, md5 in ((URL_TRAIN, MD5_TRAIN), (URL_MODEL, MD5_MODEL)):
+        download(url, 'wmt14', md5)
diff --git a/python/paddle/v2/evaluator.py b/python/paddle/v2/evaluator.py
new file mode 100644
index 0000000000000000000000000000000000000000..588eefa3912799aa55f970c6d7e013ed7779ec9a
--- /dev/null
+++ b/python/paddle/v2/evaluator.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.evaluators as evs
+import inspect
+from config_base import __convert_to_v2__
+
+__all__ = []
+
+
+def initialize():
+    """Export a v2 wrapper for every `*_evaluator` name listed in `evs`."""
+    suffix = '_evaluator'
+
+    for __ev_name__ in [nm for nm in evs.__all__ if nm.endswith(suffix)]:
+        __ev__ = getattr(evs, __ev_name__)
+        if hasattr(__ev__, 'argspec'):
+            argspec = __ev__.argspec
+        else:
+            argspec = inspect.getargspec(__ev__)
+        parent_names = [
+            arg for arg in argspec.args if arg in ['input', 'label', 'weight']
+        ]
+        v2_ev = __convert_to_v2__(
+            __ev_name__,
+            parent_names=parent_names,
+            is_default_name='name' in argspec.args,
+            attach_parent=True)
+
+        __new_name__ = __ev_name__[:-len(suffix)]
+        v2_ev.__name__ = __new_name__
+        globals()[__new_name__] = v2_ev
+        __all__.append(__new_name__)
+
+
+initialize()
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6050fa339d280ad54e40128ea6bae25132c873
--- /dev/null
+++ b/python/paddle/v2/event.py
@@ -0,0 +1,83 @@
+"""
+Testing and training events.
+
+There are:
+
+* TestResult
+* BeginIteration
+* EndIteration
+* BeginPass
+* EndPass
+"""
+import py_paddle.swig_paddle as api
+
+__all__ = [
+    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
+]
+
+
+class WithMetric(object):
+    """Base class exposing the values of a wrapped api.Evaluator as a dict."""
+
+    def __init__(self, evaluator):
+        if not isinstance(evaluator, api.Evaluator):
+            raise TypeError("Evaluator should be api.Evaluator type")
+        self.__evaluator__ = evaluator
+
+    @property
+    def metrics(self):
+        """Dict mapping every evaluator name to its current value."""
+        ev = self.__evaluator__
+        # One getValue call per name reported by getNames().
+        return dict((name, ev.getValue(name)) for name in ev.getNames())
+
+
+class TestResult(WithMetric):
+    """
+    Result returned by trainer.test: evaluator metrics plus the cost.
+    """
+
+    def __init__(self, evaluator, cost):
+        super(TestResult, self).__init__(evaluator)
+        self.cost = cost
+
+
+class BeginPass(object):
+    """
+    Event for the start of one training pass; carries the pass id.
+    """
+
+    def __init__(self, pass_id):
+        self.pass_id = pass_id
+
+
+class EndPass(WithMetric):
+    """
+    Event for the completion of one training pass; also exposes metrics.
+    """
+
+    def __init__(self, pass_id, evaluator):
+        self.pass_id = pass_id
+        WithMetric.__init__(self, evaluator)
+
+
+class BeginIteration(object):
+    """
+    Event for the start of one training batch; carries pass and batch ids.
+    """
+
+    def __init__(self, pass_id, batch_id):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+
+
+class EndIteration(WithMetric):
+    """
+    Event for the completion of one training batch: ids, cost and metrics.
+    """
+
+    def __init__(self, pass_id, batch_id, cost, evaluator):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+        self.cost = cost
+        WithMetric.__init__(self, evaluator)
diff --git a/python/paddle/v2/image.py b/python/paddle/v2/image.py
new file mode 100644
index 0000000000000000000000000000000000000000..13f53919aa49694f722d4bf20a7d01af3e3e6533
--- /dev/null
+++ b/python/paddle/v2/image.py
@@ -0,0 +1,236 @@
+import numpy as np
+try:
+    import cv2
+    from cv2 import resize
+except ImportError:
+    # Keep the module importable without OpenCV, as the guard intends.
+    cv2 = resize = None
+
+__all__ = [
+    "load_image", "resize_short", "to_chw", "center_crop", "random_crop",
+    "left_right_flip", "simple_transform", "load_and_transform"
+]
+"""
+This file contains some common interfaces for image preprocess.
+Many users are confused about the image layout. We introduce
+the image layout as follows.
+
+- CHW Layout
+  - The abbreviations: C=channel, H=Height, W=Width
+  - The default layout of image opened by cv2 or PIL is HWC.
+    PaddlePaddle only supports the CHW layout. And CHW is simply
+    a transpose of HWC. It must transpose the input image.
+
+- Color format: RGB or BGR
+  OpenCV uses the BGR color format. PIL uses the RGB color format. Both
+  formats can be used for training. Note that the format should
+  be kept consistent between the training and inference periods.
+"""
+
+
+def load_image(file, is_color=True):
+    """
+    Load a color or gray image from the file path.
+
+    Example usage:
+
+    .. code-block:: python
+
+        im = load_image('cat.jpg')
+
+    :param file: the input image path.
+    :type file: string
+    :param is_color: If True, load and return a color image.
+                     Otherwise, load and return a gray image.
+    :type is_color: bool
+    """
+    # cv2.IMREAD_COLOR for OpenCV3
+    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
+    # cv2.IMREAD_GRAYSCALE for OpenCV3
+    # cv2.CV_LOAD_IMAGE_GRAYSCALE for older OpenCV Version
+    # Here, use the constants directly: 1 is COLOR, 0 is GRAYSCALE
+    if is_color:
+        return cv2.imread(file, 1)
+    return cv2.imread(file, 0)
+
+
+def resize_short(im, size):
+    """
+    Resize an image so that the length of shorter edge is size.
+
+    Example usage:
+
+    .. code-block:: python
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the shorter edge size of image after resizing.
+    :type size: int
+    """
+    assert im.shape[-1] == 1 or im.shape[-1] == 3
+    h, w = im.shape[:2]
+    h_new, w_new = size, size
+    if h > w:
+        h_new = size * h // w
+    else:
+        w_new = size * w // h
+    # cv2.resize takes dsize as (width, height), not (height, width).
+    return resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
+
+
+def to_chw(im, order=(2, 0, 1)):
+    """
+    Transpose the axes of the input image. An image opened by cv2 or PIL
+    has the HWC layout; the default order (2,0,1) turns it into the CHW
+    layout.
+
+    Example usage:
+
+    .. code-block:: python
+        im = load_image('cat.jpg')
+        im = resize_short(im, 256)
+        im = to_chw(im)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param order: the transposed order; one entry per axis of `im`.
+    :type order: tuple|list
+    :return: the transposed image.
+    """
+    assert len(order) == len(im.shape)
+    return im.transpose(order)
+
+
+def center_crop(im, size, is_color=True):
+    """
+    Crop the center of image with size.
+
+    Example usage:
+
+    .. code-block:: python
+        im = center_crop(im, 224)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :return: the cropped image, `size` x `size` in height and width.
+    :rtype: ndarray
+    """
+    h, w = im.shape[:2]
+    # Floor division keeps the offsets usable as slice indices.
+    h_start, w_start = (h - size) // 2, (w - size) // 2
+    h_end, w_end = h_start + size, w_start + size
+    if is_color:
+        return im[h_start:h_end, w_start:w_end, :]
+    return im[h_start:h_end, w_start:w_end]
+
+
+def random_crop(im, size, is_color=True):
+    """
+    Crop a randomly placed `size` x `size` window out of the input image.
+
+    Example usage:
+
+    .. code-block:: python
+        im = random_crop(im, 224)
+
+    :param im: the input image with HWC layout.
+    :type im: ndarray
+    :param size: the cropping size.
+    :type size: int
+    :param is_color: whether the image is color or not.
+    :type is_color: bool
+    :return: the cropped image.
+    """
+    h, w = im.shape[:2]
+    top = np.random.randint(0, h - size + 1)
+    left = np.random.randint(0, w - size + 1)
+    rows = slice(top, top + size)
+    cols = slice(left, left + size)
+    if not is_color:
+        return im[rows, cols]
+    return im[rows, cols, :]
+
+
+def left_right_flip(im):
+    """
+    Flip an image along the horizontal direction.
+    Return the flipped image.
+
+    Example usage:
+
+    .. code-block:: python
+        im = left_right_flip(im)
+
+    :param im: input image with HWC layout, or a 2-D gray image
+    :type im: ndarray
+    """
+    if len(im.shape) == 3:
+        return im[:, ::-1, :]
+    # A 2-D (gray) image has no channel axis to index.
+    return im[:, ::-1]
+
+
+def simple_transform(im, resize_size, crop_size, is_train, is_color=True):
+    """
+    Simple data augmentation for training. These operations include
+    resizing, cropping and flipping.
+
+    Example usage:
+
+    .. code-block:: python
+        im = simple_transform(im, 256, 224, True)
+
+    :param im: The input image with HWC layout.
+    :type im: ndarray
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: Whether the image is color or not.
+    :type is_color: bool
+    """
+    im = resize_short(im, resize_size)
+    if is_train:
+        im = random_crop(im, crop_size, is_color)
+        if np.random.randint(2) == 0:
+            im = left_right_flip(im)
+    else:
+        im = center_crop(im, crop_size, is_color)
+    return to_chw(im) if len(im.shape) == 3 else im  # gray stays HW
+
+
+def load_and_transform(filename,
+                       resize_size,
+                       crop_size,
+                       is_train,
+                       is_color=True):
+    """
+    Load image from the input file `filename` and transform image for
+    data augmentation. Please refer to the `simple_transform` interface
+    for the transform operations.
+
+    Example usage:
+
+    .. code-block:: python
+        im = load_and_transform('cat.jpg', 256, 224, True)
+
+    :param filename: The file name of input image.
+    :type filename: string
+    :param resize_size: The shorter edge length of the resized image.
+    :type resize_size: int
+    :param crop_size: The cropping size.
+    :type crop_size: int
+    :param is_train: Whether it is training or not.
+    :type is_train: bool
+    :param is_color: bool; whether to load and transform a color image.
+    """
+    im = load_image(filename, is_color)  # honor is_color when loading too
+    return simple_transform(im, resize_size, crop_size, is_train, is_color)
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bb38496937bb6fb520334331c619f9b6f64b51
--- /dev/null
+++ b/python/paddle/v2/inference.py
@@ -0,0 +1,120 @@
+import numpy
+import py_paddle.swig_paddle as api
+import collections
+import topology
+import minibatch
+from data_feeder import DataFeeder
+
+__all__ = ['infer', 'Inference']
+
+
+class Inference(object):
+    """
+    Inference combines neural network output and parameters together
+    to do inference.
+
+    ..  code-block:: python
+
+        inferer = Inference(output_layer=prediction, parameters=parameters)
+        for data_batch in batches:
+            print inferer.infer(data_batch)
+
+    :param output_layer: The neural network that should be inferenced.
+    :type output_layer: paddle.v2.config_base.Layer or the sequence
+                        of paddle.v2.config_base.Layer
+    :param parameters: The parameters dictionary.
+    :type parameters: paddle.v2.parameters.Parameters
+    """
+
+    def __init__(self, output_layer, parameters):
+        topo = topology.Topology(output_layer)
+        machine = api.GradientMachine.createFromConfigProto(
+            topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
+        # Copy every given parameter value into the machine's value buffer.
+        for param in machine.getParameters():
+            buf = param.getBuf(api.PARAMETER_VALUE)
+            name = param.getName()
+            assert isinstance(buf, api.Vector)
+            buf.copyFromNumpyArray(parameters.get(name).flatten())
+        self.__gradient_machine__ = machine
+        self.__data_types__ = topo.data_type()
+
+    def iter_infer(self, input, feeding=None):
+        """
+        Yield one forwardTest result per batch read from `input`; the
+        batch size handed to minibatch.batch is len(input).
+        """
+        machine = self.__gradient_machine__
+        feeder = DataFeeder(self.__data_types__, feeding)
+
+        def __samples__():
+            for sample in input:
+                yield sample
+
+        reader = minibatch.batch(__samples__, batch_size=len(input))
+
+        machine.start()
+        for data_batch in reader():
+            yield machine.forwardTest(feeder(data_batch))
+        machine.finish()
+
+    def iter_infer_field(self, field, **kwargs):
+        """For each entry of each iter_infer result, yield its field values."""
+        fields = field if isinstance(field, (list, tuple)) else [field]
+
+        for result in self.iter_infer(**kwargs):
+            for each_result in result:
+                yield [each_result[each_field] for each_field in fields]
+
+    def infer(self, input, field='value', **kwargs):
+        """
+        Infer `input` with the model and concatenate the requested field(s).
+        :param input: input data batch. Should be python iterable object.
+        :param field: output field name, or a list/tuple of field names.
+        """
+        retv = None
+        kwargs['input'] = input
+        for result in self.iter_infer_field(field=field, **kwargs):
+            if retv is None:
+                retv = [[] for _ in xrange(len(result))]
+            for i, item in enumerate(result):
+                retv[i].append(item)
+        retv = [numpy.concatenate(out) for out in retv]
+        return retv[0] if len(retv) == 1 else retv
+
+
+def infer(output_layer, parameters, input, feeding=None, field='value'):
+    """
+    Run inference for the given network output and parameters. This builds
+    an `Inference` object and calls its `infer` method on `input`.
+
+    Example usages:
+
+    ..  code-block:: python
+
+        result = paddle.infer(output_layer=prediction,
+                              parameters=parameters,
+                              input=SomeData)
+        print result
+
+    :param output_layer: output of the neural network that would be inferred
+    :type output_layer: paddle.v2.config_base.Layer
+    :param parameters: parameters of the neural network.
+    :type parameters: paddle.v2.parameters.Parameters
+    :param input: input data batch. Should be a python iterable object, and each
+                  element is the data batch.
+    :type input: collections.Iterable
+    :param feeding: Reader dictionary. Default could generate from input
+                        value.
+    :param field: The prediction field. It should be in [`value`, `id`, `prob`].
+                  `value` and `prob` mean return the prediction probabilities,
+                  `id` means return the prediction labels. Default is `value`.
+                  Note that `prob` is only used when output_layer is beam_search
+                  or max_id.
+    :type field: str
+    :return: a numpy array (a list of arrays when several fields are given)
+    :rtype: numpy.ndarray
+    """
+
+    engine = Inference(output_layer, parameters)
+    return engine.infer(input, field, feeding=feeding)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..89cca7acd34b8dea0572169338649b5e9ff6536a
--- /dev/null
+++ b/python/paddle/v2/layer.py
@@ -0,0 +1,615 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+`paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
+we want to make Paddle a plain Python package. The model config package defined
+the way how to configure a neural network topology in Paddle Python code.
+
+The primary usage shows below.
+
+..  code-block:: python
+
+    import paddle.v2 as paddle
+
+    img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
+    hidden = paddle.layer.fc(input=img, size=200)
+    prediction = paddle.layer.fc(input=hidden, size=10,
+                                 act=paddle.activation.Softmax())
+
+    # use prediction instance where needed.
+    parameters = paddle.parameters.create(prediction)
+"""
+
+import collections
+import inspect
+import re
+
+import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as __parse__
+from paddle.trainer_config_helpers.default_decorators import wrap_act_default
+from paddle.trainer_config_helpers.default_decorators import \
+    wrap_bias_attr_default
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import RecurrentLayerGroupSetGenerator, Generator
+from paddle.trainer_config_helpers.layers import layer_support
+
+import activation
+import attr
+import data_type
+from config_base import Layer, __convert_to_v2__
+
+__all__ = ['parse_network', 'data']
+
+
+def parse_network(output_layers, extra_layers=None):
+    """
+    Parse all layers in the neural network graph and
+    then generate a ModelConfig object.
+
+    ..  note::
+
+        This function is used internally in paddle.v2 module. User should never
+        invoke this method.
+
+    :param output_layers: Output layers; a single layer is wrapped in a list.
+    :type output_layers: Layer
+    :param extra_layers: Some layers in the neural network graph are not in the
+                         path of output_layers.
+    :type extra_layers: Layer
+    :return: A ModelConfig object instance.
+    :rtype: ModelConfig
+    """
+    if not isinstance(output_layers, collections.Sequence):
+        output_layers = [output_layers]
+    if not (extra_layers is None or
+            isinstance(extra_layers, collections.Sequence)):
+        extra_layers = [extra_layers]
+
+    def __real_func__():
+        """
+        __real_func__ is the function that config_parser.parse invoked. It is
+        the plain old paddle configuration function.
+        """
+        context = dict()
+        real_output = [each.to_proto(context=context) for each in output_layers]
+        # Extra layers are converted too, but only real_output is passed on.
+        if extra_layers is not None:
+            for each in extra_layers:
+                each.to_proto(context=context)
+        conf_helps.outputs(real_output)
+
+    return __parse__(__real_func__)
+
+
+"""
+Some layer may need some special config, and can not use __convert_to_v2__ to convert.
+So we also need to implement some special LayerV2.
+"""
+
+
+class DataLayerV2(Layer):
+    """Data layer of API v2, configured by a data_type.InputType."""
+    METHOD_NAME = 'data_layer'
+
+    def __init__(self, name, type, **kwargs):
+        assert isinstance(type, data_type.InputType)
+        self.type = type
+        self.__method_name__ = 'data_layer'
+        self.__kwargs__ = kwargs
+
+        super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
+
+    def to_proto_impl(self, **kwargs):
+        # Later sources win: the type's dim, then call kwargs, then ctor kwargs.
+        args = dict(size=self.type.dim)
+        args.update(kwargs)
+        args.update(self.__kwargs__)
+        return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
+
+    def __map_docstr__(doc):
+        """Adapt a data_layer docstring to v2; takes the text itself, not self."""
+        doc = re.sub(r'(data = [^\)]+)\).*',
+                     "data = paddle.layer.data(name=\"input\", "
+                     "type=paddle.data_type.dense_vector(1000))", doc)
+
+        doc = re.sub(r':param size:.*',
+                     ':param type: Data type of this data layer', doc)
+        # The parameter is renamed to `type`, so its `:type` line must follow.
+        doc = re.sub(r':type size:.*',
+                     ":type type: paddle.v2.data_type.InputType", doc)
+        return doc
+
+
+class MemoryV2(Layer):
+    """Memory layer of API v2; see __init__ for the boot_layer handling."""
+
+    def __init__(self, name, extra_input=None, **kwargs):
+        """
+        Init memory object, if memory is inited inside recurrent_group step
+        function, it may depend on a boot_layer that should be initialized
+        outside recurrent_group, so we:
+            1. add RecurrentLayerInput to extra_parent of self.
+            2. add boot_layer to the extra_parent of RecurrentLayerInput.
+
+        :param extra_input: list of RecurrentLayerInput
+        :type extra_input: [RecurrentLayerInput]
+        """
+        self.name = name
+        super(MemoryV2, self).__init__(name=name, parent_layers=dict())
+        self.__kwargs__ = kwargs
+        self.__boot_layer_name__ = None
+
+        if 'boot_layer' in kwargs:
+            begin_of_current_rnn = []
+            # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
+            # function inside step.
+            # Walk outwards from this frame and stop at the first frame whose
+            # locals hold RecurrentLayerInput objects (directly or in a list).
+            # The stack is captured once instead of once per inspected frame.
+            for frame_info in inspect.stack():
+                locs = frame_info[0].f_locals
+                for val in list(locs.values()):
+                    if isinstance(val, RecurrentLayerInput):
+                        begin_of_current_rnn.append(val)
+                    elif isinstance(val, collections.Sequence):
+                        for v in val:
+                            if isinstance(v, RecurrentLayerInput):
+                                begin_of_current_rnn.append(v)
+
+                if begin_of_current_rnn:
+                    break
+            assert begin_of_current_rnn is not None
+            for extra in begin_of_current_rnn:
+                self.append_extra_parent(extra)
+                extra.append_extra_parent(kwargs['boot_layer'])
+                self.__boot_layer_name__ = kwargs['boot_layer'].name
+
+    def to_proto_impl(self, **kwargs):
+        """Merge call and constructor kwargs, then call conf_helps.memory."""
+        args = dict(kwargs)
+        args.update(self.__kwargs__)  # constructor kwargs take precedence
+
+        if self.__boot_layer_name__ is not None:
+            args['boot_layer'] = self.__context__[self.__boot_layer_name__]
+
+        size = args.get('size', None)
+        if size is not None:
+            if callable(size):  # size may be given as a callable
+                real_size = size()
+            else:
+                real_size = size
+            args['size'] = real_size
+        return conf_helps.memory(name=self.name, **args)
+
+    def context_name(self):
+        return self.name + "#memory"
+
+    def use_context_name(self):
+        """
+        The memory layer has the same name as some other layer.
+        :return: True
+        """
+        return True
+
+
+class StaticInputV2(object):
+    def __init__(self, input, is_seq=False, size=None):
+        assert isinstance(input, LayerV2)  # NOTE(review): is LayerV2 defined?
+        self.name = input.name
+        self.input = input
+        self.is_seq = is_seq
+        self.size = size
+        # TODO(add size check)
+        # assert input.size is not None or size is not None
+
+
+class BaseGeneratedInputV2(object):
+    def __init__(self):
+        self.bos_id = None  # both ids start unset; nothing here assigns them
+        self.eos_id = None
+
+    def before_real_step(self):
+        raise NotImplementedError()  # subclasses must override
+
+    def after_real_step(self, *args):
+        raise NotImplementedError()  # subclasses must override
+
+
+class GeneratedInputV2(BaseGeneratedInputV2):
+    def __init__(self, size, embedding_name, embedding_size):
+        super(GeneratedInputV2, self).__init__()
+        self.size, self.embedding_size = size, embedding_size
+        self.embedding_name = embedding_name
+
+    def after_real_step(self, input):
+        # Uses the same name as the memory created in before_real_step.
+        return max_id(input=input, name='__beam_search_predict__')
+
+    def before_real_step(self):
+        # The memory is booted with the constant id `bos_id`.
+        predict_id = memory(
+            name='__beam_search_predict__',
+            size=self.size,
+            boot_with_const_id=self.bos_id)
+        embedding_attr = attr.ParamAttr(name=self.embedding_name)
+        return embedding(
+            input=predict_id,
+            size=self.embedding_size,
+            param_attr=embedding_attr)
+
+
+class RecurrentLayerGroupSetGeneratorV2(Layer):
+    def __init__(self, eos_name, max_length, beam_size, num_results_per_sample):
+        self.eos_name, self.max_length = eos_name, max_length
+        self.beam_size = beam_size
+        self.num_results_per_sample = num_results_per_sample
+        super(RecurrentLayerGroupSetGeneratorV2, self).__init__(
+            name=eos_name, parent_layers={})
+
+    def to_proto_impl(self, **kwargs):
+        # Hands a Generator built from the stored settings to the v1 helper.
+        generator = Generator(
+            eos_layer_name=self.eos_name,
+            max_num_frames=self.max_length,
+            beam_size=self.beam_size,
+            num_results_per_sample=self.num_results_per_sample)
+        RecurrentLayerGroupSetGenerator(generator)
+        return self
+
+    def context_name(self):
+        return self.eos_name + ".fake"
+
+    def use_context_name(self):
+        return True
+
+
+class MixedLayerV2(Layer):
+    """
+    This class is used to support the `with` grammar. Without that need,
+    the following code could convert mixed_layer simply.
+
+        mixed = __convert_to_v2__(
+            'mixed_layer', name_prefix='mixed', parent_names=['input'])
+
+    """
+
+    class AddToSealedMixedLayerExceptionV2(Exception):
+        """Raised by `+=` once the layer has been finalized."""
+
+    def __init__(self,
+                 size=0,
+                 input=None,
+                 name=None,
+                 act=None,
+                 bias_attr=None,
+                 layer_attr=None):
+        self.__method_name__ = 'mixed_layer'
+        self.finalized = False
+        # `input` is stored as given (not copied); `+=` appends to it.
+        self.__inputs__ = [] if input is None else input
+
+        other_kwargs = dict(
+            name=name,
+            size=size,
+            act=act,
+            bias_attr=bias_attr,
+            layer_attr=layer_attr)
+        parent_layers = {"input": self.__inputs__}
+        super(MixedLayerV2, self).__init__(name, parent_layers)
+        self.__other_kwargs__ = other_kwargs
+
+    def __iadd__(self, other):
+        """Append `other` as an input; refused after finalization."""
+        if self.finalized:
+            raise MixedLayerV2.AddToSealedMixedLayerExceptionV2()
+        self.__inputs__.append(other)
+        return self
+
+    def __enter__(self):
+        # `with` may only be entered while the layer has no inputs yet.
+        assert len(self.__inputs__) == 0
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        # Leaving the `with` block seals the layer against further `+=`.
+        self.finalized = True
+
+    def to_proto_impl(self, **kwargs):
+        """
+        Call the `mixed_layer` config helper with the merged arguments.
+        """
+        # Stored constructor kwargs override the ones passed in here.
+        args = dict(kwargs)
+        args.update(self.__other_kwargs__)
+        size = args.get('size', None)
+        if size is not None:
+            # A callable size is invoked to obtain the real size.
+            args['size'] = size() if callable(size) else size
+        return getattr(conf_helps, self.__method_name__)(**args)
+
+
+@wrap_name_default("mixed")
+@wrap_act_default(act=activation.Linear())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support(conf_helps.layers.ERROR_CLIPPING, conf_helps.layers.DROPOUT)
+def mixed(size=0,
+          name=None,
+          input=None,
+          act=None,
+          bias_attr=False,
+          layer_attr=None):
+    return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
+
+
+mixed.__doc__ = conf_helps.mixed_layer.__doc__
+
+
+class RecurrentLayerInput(Layer):
+    def __init__(self, recurrent_name, index, parent_layers):
+        parents_len = len(parent_layers)
+        assert parents_len <= 1
+        if parents_len == 0:
+            self.__parents__ = []
+        else:
+            self.__parents__ = parent_layers.values()[0]
+        self.__recurrent_name__ = recurrent_name
+        name = self.__parents__[
+            index].name if index >= 0 else self.context_name()
+        super(RecurrentLayerInput, self).__init__(
+            name=name, parent_layers=parent_layers)
+
+    def context_name(self):
+        return self.__recurrent_name__ + ".begin"
+
+    def to_proto_impl(self, **kwargs):
+        model_type('recurrent_nn')
+        RecurrentLayerGroupWithoutOutLinksBegin(
+            name=self.__recurrent_name__,
+            in_links=map(lambda x: x.name, self.__parents__))
+        return self
+
+
+class RecurrentLayerOutput(Layer):
+    def __init__(self, recurrent_name, index, parent_layers):
+        assert len(parent_layers) == 1
+        self.__parents__ = parent_layers.values()[0]
+        super(RecurrentLayerOutput, self).__init__(
+            name=self.__parents__[index].name, parent_layers=parent_layers)
+        self.__recurrent_name__ = recurrent_name
+
+    def context_name(self):
+        return self.__recurrent_name__ + ".end"
+
+    def to_proto_impl(self, **kwargs):
+        for l in self.__parents__:
+            RecurrentLayerGroupSetOutLink(l.name)
+        RecurrentLayerGroupEnd(name=self.__recurrent_name__)
+
+
+LayerV2 = Layer
+data = DataLayerV2
+data.__name__ = 'data'
+AggregateLevel = conf_helps.layers.AggregateLevel
+ExpandLevel = conf_helps.layers.ExpandLevel
+memory = MemoryV2
+memory.__name__ = 'memory'
+memory.__doc__ = conf_helps.memory.__doc__
+
+
+def __layer_name_mapping__(inname):
+    # Map a v1 helper name to its v2 name; None means "do not convert".
+    if inname in ['data_layer', 'memory', 'mixed_layer', 'recurrent_group']:
+        # Not converted here; None tells the caller to skip them.
+        return None
+    if inname == 'maxid_layer':
+        return 'max_id'
+    keeps_name = inname == 'hsigmoid' or inname.endswith(
+        ('memory', '_seq', '_sim', '_cost'))
+    if keeps_name:
+        return inname
+    if inname in [
+            'cross_entropy', 'multi_binary_label_cross_entropy',
+            'cross_entropy_with_selfnorm'
+    ]:
+        return inname + "_cost"
+    if inname.endswith("_layer"):
+        return inname[:-len("_layer")]
+
+
+def __layer_name_mapping_parent_names__(inname):
+    # Keep, in signature order, the v1 arguments that name parent layers.
+    parent_args = [
+        'input1', 'input2', 'label', 'input', 'a', 'b', 'expand_as', 'weights',
+        'vectors', 'weight', 'score', 'left', 'right', 'output_mem'
+    ]
+    all_args = getattr(conf_helps, inname).argspec.args
+    return [arg for arg in all_args if arg in parent_args]
+
+
+def __convert_layer__(_new_name_, _old_name_, _parent_names_):
+    # Register the v2 wrapper under _new_name_ itself, not a caller's global.
+    __all__.append(_new_name_)
+    globals()[_new_name_] = __convert_to_v2__(_old_name_, _parent_names_)
+    globals()[_new_name_].__name__ = _new_name_
+
+
+for each_layer_name in dir(conf_helps):
+    new_name = __layer_name_mapping__(each_layer_name)
+    if new_name is not None:
+        parent_names = __layer_name_mapping_parent_names__(each_layer_name)
+        assert len(parent_names) != 0, each_layer_name
+        __convert_layer__(new_name, each_layer_name, parent_names)
+
+del parent_names
+del new_name
+del each_layer_name
+
+
+@wrap_name_default()
+def recurrent_group(step, input, name=None):
+    if not isinstance(input, collections.Sequence):
+        input = [input]
+
+    non_static_inputs = filter(lambda x: not isinstance(x, StaticInputV2),
+                               input)
+    actual_input = [
+        RecurrentLayerInput(
+            recurrent_name=name,
+            index=i,
+            parent_layers={'recurrent_inputs': non_static_inputs})
+        for i in xrange(len(non_static_inputs))
+    ]
+
+    extra_input = None
+    if len(non_static_inputs) == 0:
+        extra_input = RecurrentLayerInput(
+            recurrent_name=name, index=-1, parent_layers={})
+
+    def __real_step__(*args):
+        rnn_input = list(args)
+        static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
+        for static_input in static_inputs:
+            mem_name = "__%s_memory__" % static_input.input.name
+            mem = memory(
+                name=mem_name,
+                extra_input=extra_input,
+                is_seq=static_input.is_seq,
+                size=static_input.input.calculate_size,
+                boot_layer=static_input.input)
+            with mixed(
+                    name=mem_name,
+                    size=static_input.input.calculate_size,
+                    act=activation.Identity()) as mix:
+                mix += identity_projection(input=mem)
+            rnn_input.insert(input.index(static_input), mix)
+        return step(*rnn_input)
+
+    actual_output = __real_step__(*actual_input)
+
+    if not isinstance(actual_output, collections.Sequence):
+        actual_output = [actual_output]
+
+    retv = [
+        RecurrentLayerOutput(
+            recurrent_name=name,
+            index=i,
+            parent_layers={'recurrent_outputs': actual_output})
+        for i in xrange(len(actual_output))
+    ]
+    if len(retv) == 1:
+        return retv[0]
+    else:
+        return retv
+
+
+recurrent_group.__doc__ = conf_helps.recurrent_group.__doc__
+
+
+@wrap_name_default()
+def beam_search(step,
+                input,
+                bos_id,
+                eos_id,
+                beam_size,
+                max_length=500,
+                name=None,
+                num_results_per_sample=None):
+    if num_results_per_sample is None:
+        num_results_per_sample = beam_size
+    assert num_results_per_sample <= beam_size
+    # Hard requirement: an assert is used here instead of a logger warning.
+
+    if isinstance(input, StaticInputV2) or isinstance(input,
+                                                      BaseGeneratedInputV2):
+        input = [input]
+
+    generated_input_index = -1
+
+    real_input = []
+    for i, each_input in enumerate(input):
+        assert isinstance(each_input, StaticInputV2) or isinstance(
+            each_input, BaseGeneratedInputV2)
+        if isinstance(each_input, BaseGeneratedInputV2):
+            assert generated_input_index == -1
+            generated_input_index = i
+        else:
+            real_input.append(each_input)
+
+    assert generated_input_index != -1
+
+    gipt = input[generated_input_index]
+    assert isinstance(gipt, BaseGeneratedInputV2)
+
+    gipt.bos_id = bos_id
+    gipt.eos_id = eos_id
+
+    def __real_step__(*args):
+        eos_name = "__%s_eos_layer__" % name
+        generator = RecurrentLayerGroupSetGeneratorV2(
+            eos_name, max_length, beam_size, num_results_per_sample)
+
+        args = list(args)
+        before_step_layer = gipt.before_real_step()
+        before_step_layer.append_child(
+            layer=generator, parent_names=[before_step_layer.name])
+        args.insert(generated_input_index, before_step_layer)
+
+        predict = gipt.after_real_step(step(*args))
+
+        eos_layer = eos(input=predict, eos_id=eos_id, name=eos_name)
+        predict.append_child(layer=eos_layer, parent_names=[predict.name])
+
+        return predict
+
+    # Only the non-generated inputs go to recurrent_group; the generated
+    # input is rebuilt inside __real_step__ and inserted at its original
+    # index. The commented-out call this replaced also passed
+    # reverse=False and is_generating=True, which the recurrent_group
+    # defined in this module does not accept (it takes step, input, name).
+    # NOTE(review): confirm generation mode is fully covered without them.
+    tmp = recurrent_group(step=__real_step__, input=real_input, name=name)
+
+    return tmp
+
+
+beam_search.__doc__ = conf_helps.beam_search.__doc__
+
+__projection_names__ = filter(lambda x: x.endswith('_projection'),
+                              dir(conf_helps))
+
+__all__ += __projection_names__
+
+__operator_names__ = filter(lambda x: x.endswith('_operator'), dir(conf_helps))
+__all__ += __operator_names__
+
+# convert projection
+for prj in __projection_names__:
+    globals()[prj] = __convert_to_v2__(
+        prj, parent_names=['input'], is_default_name=False)
+    globals()[prj].__name__ = prj
+
+# convert operator
+operator_list = [
+    # [V1_method_name, parent_names],
+    ['dotmul_operator', ['a', 'b']],
+    ['conv_operator', ['img', 'filter']]
+]
+for op in operator_list:
+    globals()[op[0]] = __convert_to_v2__(
+        op[0], parent_names=op[1], is_default_name=False)
+    globals()[op[0]].__name__ = op[0]
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb
--- /dev/null
+++ b/python/paddle/v2/minibatch.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size):
+    """
+    Wrap a reader so that it yields lists of instances (mini-batches).
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: instances per mini-batch; the last one may be smaller.
+    :type batch_size: int
+    :return: the batched reader.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        pending = []
+        for instance in reader():
+            pending.append(instance)
+            if len(pending) != batch_size:
+                continue
+            yield pending
+            pending = []
+        if len(pending) > 0:
+            yield pending
+
+    return batch_reader
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e6644196c8242cc3fed7a4fb1503697e5b59ffb
--- /dev/null
+++ b/python/paddle/v2/networks.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.networks as conf_nw
+import inspect
+from config_base import __convert_to_v2__
+
+__all__ = []
+
+
+def __initialize__():
+    # Re-export each v1 sub-network in conf_nw as a v2 function, same name.
+    global __all__
+    for each_subnetwork in conf_nw.__all__:
+        if each_subnetwork in ['inputs', 'outputs']:
+            continue
+        func = getattr(conf_nw, each_subnetwork)
+        # Use the helper's own `argspec` attribute when it carries one.
+        argspec = (func.argspec
+                   if hasattr(func, 'argspec') else inspect.getargspec(func))
+        if each_subnetwork == 'simple_attention':
+            parents = ['encoded_sequence', 'encoded_proj', 'decoder_state']
+        else:
+            parents = [arg for arg in argspec.args if arg.startswith('input')]
+        assert len(parents) != 0, each_subnetwork
+        v2_subnet = __convert_to_v2__(
+            each_subnetwork,
+            parent_names=parents,
+            is_default_name='name' in argspec.args)
+        v2_subnet.__name__ = each_subnetwork
+        globals()[each_subnetwork] = v2_subnet
+        __all__.append(each_subnetwork)
+
+
+__initialize__()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e99d4a241b7fe2b0f9ff4ba191db4b341c4d30e
--- /dev/null
+++ b/python/paddle/v2/optimizer.py
@@ -0,0 +1,266 @@
+import py_paddle.swig_paddle as swig_api
+
+import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
+import paddle.trainer_config_helpers.optimizers as v1_optimizers
+"""
+Optimizers(update equation) for SGD method.
+
+TODO(yuyang18): Complete comments.
+"""
+
+__all__ = [
+    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
+    'RMSProp', 'ModelAverage', 'L2Regularization'
+]
+
+
+class Optimizer(object):
+    def __init__(self, **kwargs):
+        if 'batch_size' in kwargs:
+            del kwargs['batch_size']  # not important for python library.
+
+        def __impl__():
+            v1_optimizers.settings(batch_size=1, **kwargs)
+
+        self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
+            __impl__)
+        self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
+            self.__opt_conf_proto__)
+
+    def enable_types(self):
+        """
+        Return the parameter buffer types this optimizer needs, e.g.
+        [value, gradient, momentum, ...]. Different optimizers (SGD, Adam)
+        require the GradientMachine to enable different buffers.
+        :return: result of ParameterOptimizer.getParameterTypes()
+        """
+        tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
+        assert isinstance(tmp, swig_api.ParameterOptimizer)
+        return tmp.getParameterTypes()
+
+    def __create_local_updater__(self):
+        return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
+
+    def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        return swig_api.ParameterUpdater.createRemoteUpdater(
+            self.__opt_conf__, pass_num, use_sparse_updater)
+
+    def create_updater(self, is_local, num_passes, use_sparse_updater):
+        """
+        Create the parameter updater selected by the configuration.
+
+        :param is_local: True for a local updater, False for a remote one.
+        :param num_passes: passed to the remote updater only.
+        :param use_sparse_updater: passed to the remote updater only. With
+            sparse parameters the training loop has extra work to do, e.g.:
+
+        ..  code-block:: python
+
+            if use_sparse_remote_updater:
+                gradient_machine.prefetch(in_args)
+                parameter_updater.getParametersRemote()
+        :return: parameter_updater
+        """
+        if is_local:
+            parameter_updater = self.__create_local_updater__()
+        else:
+            parameter_updater = self.__create_remote_updater__(
+                num_passes, use_sparse_updater)
+        return parameter_updater
+
+
+class Momentum(Optimizer):
+    """
+    SGD Optimizer.
+
+    SGD is an optimization method that tries to find a neural network which
+    minimizes its "cost/error" by iteration. In paddle's implementation the
+    SGD Optimizer is synchronized, which means all gradients are computed
+    and reduced into one gradient before the optimize operation is done.
+
+    The neural network considers the learning problem of minimizing an
+    objective function that has the form of a sum
+
+    ..  math::
+
+        Q(w) = \\sum_{i}^{n} Q_i(w)
+
+    The value of function Q is typically the cost of the neural network (Mean
+    Square Error between prediction and label for example). The function Q is
+    parametrised by w, the weight/bias of the neural network; the weights are
+    what is learned. The i is the i-th observation in the (training) data.
+
+    So, the SGD method will optimize the weight by
+
+    ..  math::
+
+        w = w - \\eta \\nabla Q(w) = w - \\eta \\sum_{i}^{n} \\nabla Q_i(w)
+
+    where :math:`\\eta` is learning rate. And :math:`n` is batch size.
+    """
+
+    def __init__(self, momentum=None, sparse=False, **kwargs):
+        learning_method = v1_optimizers.MomentumOptimizer(
+            momentum=momentum, sparse=sparse)
+        super(Momentum, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class Adam(Optimizer):
+    """
+    Adam optimizer.
+    For details, please refer to `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_
+
+    ..  math::
+
+        m(w, t) & = \\beta_1 m(w, t-1) + (1 - \\beta_1) \\nabla Q_i(w) \\\\
+        v(w, t) & = \\beta_2 v(w, t-1) + (1 - \\beta_2)(\\nabla Q_i(w)) ^2 \\\\
+        w & = w - \\frac{\\eta}{\\sqrt{v(w,t) + \\epsilon}}
+
+    :param beta1: the :math:`\\beta_1` in equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in equation.
+    :type beta2: float
+    :param epsilon: the :math:`\\epsilon` in equation. It is used to prevent
+                        division by zero.
+    :type epsilon: float
+    """
+
+    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
+        learning_method = v1_optimizers.AdamOptimizer(
+            beta1=beta1, beta2=beta2, epsilon=epsilon)
+        super(Adam, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class Adamax(Optimizer):
+    """
+    Adamax optimizer.
+
+    The details of please refer this `Adam: A Method for Stochastic Optimization
+    <https://arxiv.org/abs/1412.6980>`_
+
+    ..  math::
+
+        m_t & = \\beta_1 * m_{t-1} + (1-\\beta_1)* \\nabla Q_i(w) \\\\
+        u_t & = max(\\beta_2*u_{t-1}, abs(\\nabla Q_i(w))) \\\\
+        w_t & = w_{t-1} - (\\eta/(1-\\beta_1^t))*m_t/u_t
+
+    :param beta1: the :math:`\\beta_1` in the equation.
+    :type beta1: float
+    :param beta2: the :math:`\\beta_2` in the equation.
+    :type beta2: float
+    """
+
+    def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
+        learning_method = v1_optimizers.AdamaxOptimizer(
+            beta1=beta1, beta2=beta2)
+        super(Adamax, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class AdaGrad(Optimizer):
+    """
+    Adagrad(for ADAptive GRAdient algorithm) optimizer.
+
+    For details please refer this `Adaptive Subgradient Methods for
+    Online Learning and Stochastic Optimization
+    <http://www.magicbroom.info/Papers/DuchiHaSi10.pdf>`_.
+
+    ..  math::
+
+        G &= \\sum_{\\tau=1}^{t} g_{\\tau} g_{\\tau}^T \\\\
+        w & = w - \\eta diag(G)^{-\\frac{1}{2}} \\circ g
+    """
+
+    def __init__(self, **kwargs):
+        learning_method = v1_optimizers.AdaGradOptimizer()
+        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class DecayedAdaGrad(Optimizer):
+    """
+    AdaGrad method with a decayed sum of gradients. The equations of this
+    method are as follows.
+
+    ..  math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= 1/sqrt( E(g_t^2) + \\epsilon )
+
+    :param rho: The :math:`\\rho` parameter in that equation
+    :type rho: float
+    :param epsilon: The :math:`\\epsilon` parameter in that equation.
+    :type epsilon: float
+    """
+
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(DecayedAdaGrad, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class AdaDelta(Optimizer):
+    """
+    AdaDelta method. For details of AdaDelta, please refer to
+    `ADADELTA: AN ADAPTIVE LEARNING RATE METHOD
+    <http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf>`_.
+
+    ..  math::
+
+        E(g_t^2) &= \\rho * E(g_{t-1}^2) + (1-\\rho) * g^2 \\\\
+        learning\\_rate &= sqrt( ( E(dx_{t-1}^2) + \\epsilon ) / ( \\
+                          E(g_t^2) + \\epsilon ) ) \\\\
+        E(dx_t^2) &= \\rho * E(dx_{t-1}^2) + (1-\\rho) * (-g*learning\\_rate)^2
+
+    :param rho: :math:`\\rho` in equation
+    :type rho: float
+    :param epsilon: :math:`\\epsilon` in equation
+    :type epsilon: float
+    """
+
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.AdaDeltaOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(AdaDelta, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class RMSProp(Optimizer):
+    """
+    RMSProp(for Root Mean Square Propagation) optimizer. For details please
+    refer this `slide <http://www.cs.toronto.edu/~tijmen/csc321/slides/
+    lecture_slides_lec6.pdf>`_.
+
+    The equations of this method as follows:
+
+    ..  math::
+
+        v(w, t) & = \\rho v(w, t-1) + (1 - \\rho)(\\nabla Q_{i}(w))^2 \\\\
+        w & = w - \\frac{\\eta} {\\sqrt{v(w,t) + \\epsilon}} \\nabla Q_{i}(w)
+
+    :param rho: the :math:`\\rho` in the equation. The forgetting factor.
+    :type rho: float
+    :param epsilon: the :math:`\\epsilon` in the equation.
+    :type epsilon: float
+    """
+
+    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
+        learning_method = v1_optimizers.RMSPropOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(RMSProp, self).__init__(learning_method=learning_method, **kwargs)
+
+
+ModelAverage = v1_optimizers.ModelAverage
+L2Regularization = v1_optimizers.L2Regularization
+
+if __name__ == '__main__':
+    swig_api.initPaddle('--use_gpu=false')
+    for opt in [
+            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
+            AdaDelta(), RMSProp(), Adam(
+                model_average=ModelAverage(average_window=0.5),
+                regularization=L2Regularization(rate=0.5),
+                gradient_clipping_threshold=25)
+    ]:
+        print opt, opt.enable_types()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..64805d0c504b876f4d1f6657fe94457534a0b278
--- /dev/null
+++ b/python/paddle/v2/parameters.py
@@ -0,0 +1,337 @@
+import numpy as np
+import py_paddle.swig_paddle as api
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import struct
+import tarfile
+import cStringIO
+from topology import Topology
+
+__all__ = ['Parameters', 'create']
+
+
+def create(layers):
+    """
+    Create a parameter pool for the topology built from `layers`.
+
+    :param layers: passed to Topology unchanged.
+    :return: a Parameters holding one configuration per topology parameter.
+    """
+    model_config = Topology(layers).proto()
+    pool = Parameters()
+    for param_conf in model_config.parameters:
+        pool.__append_config__(param_conf)
+    return pool
+
+
+class Parameters(object):
+    """
+    Parameters is a dictionary contains Paddle's parameter. The key of
+    Parameters is the name of parameter. The value of Parameters is a plain
+    :code:`numpy.ndarry` .
+
+    Basically usage is
+
+    ..  code-block:: python
+
+        data = paddle.layers.data(...)
+        ...
+        out = paddle.layers.fc(...)
+
+        parameters = paddle.parameters.create(out)
+
+        parameter_names = parameters.names()
+        fc_mat = parameters.get('fc')
+        print fc_mat
+    """
+
+    def __init__(self):
+        self.__param_conf__ = dict()
+        self.__gradient_machines__ = []
+        self.__tmp_params__ = []
+
+    def __append_config__(self, param_conf):
+        """
+        Append a parameter configuration. It used to initialize Parameters and
+        should be invoked only in paddle.parameters.create
+
+        :param param_conf: The parameter configuration in protobuf
+        :type param_conf: ParameterConfig
+        :return: Nothing
+        """
+
+        if not isinstance(param_conf, ParameterConfig):
+            raise ValueError("param_conf must be paddle.proto.ParameterConfig")
+
+        if param_conf.name in self.__param_conf__:
+            raise ValueError("duplicated parameter %s" % param_conf.name)
+
+        self.__param_conf__[param_conf.name] = param_conf
+
+    def keys(self):
+        """
+        keys are the names of each parameter.
+
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.__param_conf__.keys()
+
+    def names(self):
+        """
+        names of each parameter.
+
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.keys()
+
+    def has_key(self, key):
+        """
+        has_key return true if there are such parameter name == key
+
+        :param key: Parameter name
+        :type key: basestring
+        :return: True if contains such key
+        """
+        return key in self.__param_conf__.keys()
+
+    def __iter__(self):
+        """
+        Return an iterator of parameter name. It is used by `for loop`
+        or `in` operator.
+
+        ..  code-block:: python
+
+            parameters = paddle.parameters.create(...)
+            if "fc_param" in parameters:
+                print 'OK'
+        :return: an iterator of parameter name
+        :rtype: iterator
+        """
+        return iter(self.__param_conf__)
+
+    def __getitem__(self, key):
+        """
+        Get parameter by parameter name. It uses Python dict syntax.
+
+        :note: Once a gradient machine is attached, the value is always
+               copied from the C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        shape = self.get_shape(key)
+
+        if len(self.__gradient_machines__) == 0:
+            # Nothing attached yet: serve what __setitem__ staged. The same
+            # key may have been staged more than once, so take the latest
+            # entry instead of requiring exactly one match.
+            staged = [mat for name, mat in self.__tmp_params__ if name == key]
+            if len(staged) != 0:
+                return staged[-1]
+            # Never set: an uninitialized array of the configured shape.
+            return np.ndarray(shape=shape, dtype=np.float32)
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                param = __get_parameter_in_gradient_machine__(
+                    each_gradient_machine, key)
+                # for simplify implementation now, we always copy from C++
+                assert isinstance(param, api.Parameter)
+                val = param.getBuf(api.PARAMETER_VALUE)
+                assert isinstance(val, api.Vector)
+                # The first attached machine answers; the loop never advances.
+                return val.copyToNumpyArray()
+
+            raise RuntimeError("Unexpected branch")
+
+    def get_shape(self, key):
+        """
+        Look up the shape recorded in the parameter's configuration.
+
+        :param key: parameter name
+        :type key: basestring
+        :return: the configured dims, or (1, size) when no dims are recorded
+        :rtype: tuple
+        """
+        if not isinstance(key, basestring):
+            raise ValueError("parameter name should be string")
+        if not self.has_key(key):
+            raise ValueError("No such parameter %s" % key)
+        conf = self.__param_conf__[key]
+        dims = conf.dims or (1, conf.size)
+        return tuple(int(dim) for dim in dims)
+
+    def __setitem__(self, key, value):
+        """
+        Set parameter by parameter name & value. It uses Python dict syntax.
+
+        :note: Copied to C++, or staged when no gradient machine is attached.
+        :param key: Parameter name
+        :type key: basestring
+        :param value: Parameter matrix.
+        :type value: np.ndarray
+        :return: Nothing
+        """
+        if not isinstance(value, np.ndarray):
+            raise ValueError("value must be a numpy.ndarray")
+        value = value.astype(dtype=np.float32)
+        shape = self.get_shape(key)
+        if value.shape != shape:
+            raise ValueError("Value shape mismatch, expect %s, got %s" %
+                             (shape, value.shape))
+
+        if len(self.__gradient_machines__) == 0:
+            # Keep a single staged value per key: drop any older entry first.
+            staged = [p for p in self.__tmp_params__ if p[0] != key]
+            self.__tmp_params__ = staged + [(key, value)]
+        else:
+            for gm in self.__gradient_machines__:
+                __copy_parameter_to_gradient_machine__(gm, key, value)
+
+    def get(self, parameter_name):
+        """
+        Get parameter by parameter name.
+
+        :note: It will always copy the parameter from C++ side.
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :return: The parameter matrix.
+        :rtype: np.ndarray
+        """
+        return self.__getitem__(key=parameter_name)
+
+    def set(self, parameter_name, value):
+        """
+        Set parameter by parameter name & matrix.
+
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :param value: parameter matrix
+        :type value: np.ndarray
+        :return: Nothing.
+        """
+        self.__setitem__(key=parameter_name, value=value)
+
+    def append_gradient_machine(self, gradient_machine):
+        """
+        append gradient machine to parameters. This method is used internally in
+        Trainer.train.
+
+        :param gradient_machine: Paddle C++ GradientMachine object.
+        :type gradient_machine: api.GradientMachine
+        :return:
+        """
+
+        if not isinstance(gradient_machine, api.GradientMachine):
+            raise ValueError("gradient_machine should be api.GradientMachine")
+
+        if len(self.__tmp_params__) != 0:
+            for name, val in self.__tmp_params__:
+                try:
+                    __copy_parameter_to_gradient_machine__(gradient_machine,
+                                                           name, val)
+                except ValueError:
+                    # If no such parameter in gradient machine, then don't copy
+                    pass
+
+        self.__gradient_machines__.append(gradient_machine)
+
+    def serialize(self, name, f):
+        """
+        Write an "IIQ" header (0, 4, element count), then raw float32 values.
+
+        :param name: name of the parameter to write.
+        :param f: writable file-like object.
+        :type f: file
+        """
+        param = self.get(name)
+        size = reduce(lambda a, b: a * b, param.shape)
+        f.write(struct.pack("IIQ", 0, 4, size))
+        param = param.astype(np.float32)
+        f.write(param.tostring())
+
+    def deserialize(self, name, f):
+        """
+        Skip the 16-byte header, then load the rest of f as float32 values.
+
+        :param name: parameter to assign; its configured shape is applied.
+        :param f: readable file-like object.
+        :type f: file
+        """
+        f.read(16)  # skip the header; its fields are not checked
+        arr = np.frombuffer(f.read(), dtype=np.float32)
+        self.set(name, arr.reshape(self.get_shape(name)))
+
+    def to_tar(self, f):
+        # Write every parameter (raw value + "<name>.protobuf" config) to f.
+        tar = tarfile.TarFile(fileobj=f, mode='w')
+        for nm in self.names():
+            buf = cStringIO.StringIO()
+            self.serialize(nm, buf)
+            tarinfo = tarfile.TarInfo(name=nm)
+            buf.seek(0)
+            tarinfo.size = len(buf.getvalue())
+            tar.addfile(tarinfo, buf)
+
+            confStr = self.__param_conf__[nm].SerializeToString()
+            tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
+            tarinfo.size = len(confStr)
+            tar.addfile(tarinfo, fileobj=cStringIO.StringIO(confStr))
+        # close() appends the end-of-archive blocks; it leaves f itself open.
+        tar.close()
+
+    @staticmethod
+    def from_tar(f):
+        params = Parameters()
+        tar = tarfile.TarFile(fileobj=f, mode='r')
+        # Pass 1: register every configuration stored as *.protobuf.
+        for finfo in tar:
+            assert isinstance(finfo, tarfile.TarInfo)
+            if not finfo.name.endswith('.protobuf'):
+                continue
+            conf = ParameterConfig()
+            conf.ParseFromString(tar.extractfile(finfo).read())
+            params.__append_config__(conf)
+        # Pass 2: load each value, whose shape the configuration now provides.
+        for param_name in params.names():
+            params.deserialize(param_name, tar.extractfile(param_name))
+        return params
+
+
+def __get_parameter_in_gradient_machine__(gradient_machine, name):
+    """
+    Find the single parameter called `name` inside a gradient machine.
+
+    :param gradient_machine: the machine whose parameters are searched
+    :type gradient_machine: api.GradientMachine
+    :param name: parameter name to match against `getName()`
+    :return: the matching parameter
+    :rtype: api.Parameter
+    :raise ValueError: if no parameter, or more than one, has that name
+    """
+    candidates = gradient_machine.getParameters()
+    matched = [p for p in candidates if p.getName() == name]
+    if len(matched) == 1:
+        return matched[0]
+    if len(matched) == 0:
+        raise ValueError("No such parameter")
+    raise ValueError("Unexpected branch")
+
+
+def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
+    """
+    Copy a python ndarray into the gradient machine.
+
+    :param gradient_machine: machine that owns the target parameter
+    :type gradient_machine: api.GradientMachine
+    :param name: name of the parameter whose value buffer is overwritten
+    :param arr: new value; it is flattened before being copied
+    :type arr: np.ndarray
+    :return: Nothing
+    :raise ValueError: from the lookup, if `name` is not found in the machine
+    """
+    param = __get_parameter_in_gradient_machine__(gradient_machine, name)
+    vec = param.getBuf(api.PARAMETER_VALUE)
+    assert isinstance(vec, api.Vector)
+    vec.copyFromNumpyArray(arr.flatten())
diff --git a/python/paddle/v2/plot/__init__.py b/python/paddle/v2/plot/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..acd3013db4e6a57cd1b269266bea82a31e928397
--- /dev/null
+++ b/python/paddle/v2/plot/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from plot import Ploter
+
+__all__ = ['Ploter']
diff --git a/python/paddle/v2/plot/plot.py b/python/paddle/v2/plot/plot.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f7bd039b07db4832295c2374293bffa588eb4ef
--- /dev/null
+++ b/python/paddle/v2/plot/plot.py
@@ -0,0 +1,79 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+
+
+class PlotData(object):
+    """One curve: parallel lists of steps and the values recorded at them."""
+
+    def __init__(self):
+        self.step, self.value = [], []
+
+    def append(self, step, value):
+        self.step.append(step)
+        self.value.append(value)
+
+    def reset(self):
+        self.step, self.value = [], []
+
+
+class Ploter(object):
+    """Collects (step, value) points per title and draws them in a notebook."""
+
+    def __init__(self, *args):
+        self.__args__ = args
+        self.__plot_data__ = dict((title, PlotData()) for title in args)
+        # Notebook demos use Ploter to draw figures, but when a notebook is
+        # converted to a plain .py file for testing, importing matplotlib
+        # makes the script crash. `export DISABLE_PLOT=True` skips the imports.
+        self.__disable_plot__ = os.environ.get("DISABLE_PLOT")
+        if not self.__plot_is_disabled__():
+            import matplotlib.pyplot as plt
+            from IPython import display
+            self.plt = plt
+            self.display = display
+
+    def __plot_is_disabled__(self):
+        return self.__disable_plot__ == "True"
+
+    def append(self, title, step, value):
+        """Record one point on the curve registered under ``title``."""
+        assert isinstance(title, basestring)
+        # `in` replaces dict.has_key(), which is deprecated and gone in py3.
+        assert title in self.__plot_data__
+        data = self.__plot_data__[title]
+        assert isinstance(data, PlotData)
+        data.append(step, value)
+
+    def plot(self):
+        """Redraw every non-empty curve; a no-op when plotting is disabled."""
+        if self.__plot_is_disabled__():
+            return
+        titles = []
+        for title in self.__args__:
+            data = self.__plot_data__[title]
+            assert isinstance(data, PlotData)
+            if len(data.step) > 0:
+                titles.append(title)
+                self.plt.plot(data.step, data.value)
+        self.plt.legend(titles, loc='upper left')
+        self.display.clear_output(wait=True)
+        self.display.display(self.plt.gcf())
+        self.plt.gcf().clear()
+
+    def reset(self):
+        """Drop all recorded points but keep the registered titles."""
+        for data in self.__plot_data__.values():
+            data.reset()
diff --git a/python/paddle/v2/plot/tests/CMakeLists.txt b/python/paddle/v2/plot/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b1132f131737e26bfeeb31f6b3f062710bdf6f75
--- /dev/null
+++ b/python/paddle/v2/plot/tests/CMakeLists.txt
@@ -0,0 +1 @@
+add_python_test(test_ploter test_ploter.py)
diff --git a/python/paddle/v2/plot/tests/__init__.py b/python/paddle/v2/plot/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1abfc08f19505a9010e924e34074e5bc3cc0571
--- /dev/null
+++ b/python/paddle/v2/plot/tests/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import test_ploter
+
+__all__ = ['test_ploter']  # module names, not file names
diff --git a/python/paddle/v2/plot/tests/test_ploter.py b/python/paddle/v2/plot/tests/test_ploter.py
new file mode 100644
index 0000000000000000000000000000000000000000..a75f853ed933dfce651faf758f71feca7cd8d328
--- /dev/null
+++ b/python/paddle/v2/plot/tests/test_ploter.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+from paddle.v2.plot import Ploter
+
+
+class TestCommon(unittest.TestCase):
+    def test_append(self):
+        """append() accumulates points per title; reset() empties them all."""
+        title1, title2 = "title1", "title2"
+        plot_test = Ploter(title1, title2)
+        for title, step, value in [(title1, 1, 2), (title1, 2, 5),
+                                   (title2, 3, 4)]:
+            plot_test.append(title, step, value)
+        curves = plot_test.__plot_data__
+        self.assertEqual(curves[title1].step, [1, 2])
+        self.assertEqual(curves[title1].value, [2, 5])
+        self.assertEqual(curves[title2].step, [3])
+        self.assertEqual(curves[title2].value, [4])
+        plot_test.reset()
+        for title in (title1, title2):
+            self.assertEqual(curves[title].step, [])
+            self.assertEqual(curves[title].value, [])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/pooling.py b/python/paddle/v2/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..4881c27d1d6d3d926f12aab096f377164debf1ef
--- /dev/null
+++ b/python/paddle/v2/pooling.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.poolings
+import copy
+
+__all__ = []
+suffix = 'Pooling'
+# Re-export every pooling type with its last len(suffix) characters dropped.
+for name in paddle.trainer_config_helpers.poolings.__all__:
+    new_name = name[:-len(suffix)]
+    globals()[new_name] = copy.copy(
+        getattr(paddle.trainer_config_helpers.poolings, name))
+    globals()[new_name].__name__ = new_name
+    __all__.append(new_name)
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6
--- /dev/null
+++ b/python/paddle/v2/reader/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+At training and testing time, PaddlePaddle programs need to read data. To ease
+the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number
+  generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and
+  returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network,
+  random number generator, etc) and yields a batch of data items.
+
+#####################
+Data Reader Interface
+#####################
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data
+items. It can be any function with no parameter that creates an iterable
+(anything that can be used in :code:`for x in iterable`)\:
+
+..  code-block:: python
+
+    iterable = data_reader()
+
+Element produced from the iterable should be a **single** entry of data,
+**not** a mini batch. That entry of data could be a single item, or a tuple of
+items.
+Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
+/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
+array of float32, int, list of int)
+
+An example implementation for single item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image(width, height):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height)
+        return reader
+
+An example implementation for multiple item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image_and_label(width, height, label):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height), label
+        return reader
+
+
+TODO(yuyang18): Should we add whole design doc here?
+"""
+
+import decorator
+from decorator import *
+
+import creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..07142056f872db5113acdd296b17c52b343c1be6
--- /dev/null
+++ b/python/paddle/v2/reader/creator.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Creator package contains some simple reader creator, which could be used in user
+program.
+"""
+
+__all__ = ['np_array', 'text_file']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields x itself if it is a 0-d numpy array,
+    elements of x if it is a vector, rows of x if it is a matrix, or in
+    general the sub-arrays indexed along the first dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+        else:
+            for e in x:
+                yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from given text file.
+    Trailing new line ('\\\\n') of each line will be removed.
+
+    :param path: path of the text file.
+    :returns: data reader of text file
+    """
+
+    def reader():
+        # `with` closes the file even if the consumer stops iterating early.
+        with open(path, "r") as f:
+            for l in f:
+                yield l.rstrip('\n')
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..104ce9a0411413bb8fc65eedf5821f98d6acdba3
--- /dev/null
+++ b/python/paddle/v2/reader/decorator.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn'
+]
+
+import itertools
+import random
+from Queue import Queue
+from threading import Thread
+
+
+def map_readers(func, *readers):
+    """
+    Builds a data reader that applies ``func`` to the items produced by
+    ``readers`` and yields the results.
+
+    :param func: called with one item from every reader per output item.
+    :type func: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        # Instantiate every source reader, then walk them in lockstep;
+        # imap stops as soon as the shortest source is exhausted.
+        sources = [create() for create in readers]
+        for item in itertools.imap(func, *sources):
+            yield item
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is shuffled.
+
+    Items from the original reader are collected into a buffer of
+    ``buf_size`` entries. Each time the buffer fills up it is shuffled and
+    emitted, and whatever remains at the end is shuffled and emitted too.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        pending = []
+        for item in reader():
+            pending.append(item)
+            if len(pending) < buf_size:
+                continue
+            random.shuffle(pending)
+            for shuffled in pending:
+                yield shuffled
+            pending = []
+        if pending:
+            random.shuffle(pending)
+            for shuffled in pending:
+                yield shuffled
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of input data
+    readers chained together.
+
+    If the input readers output the following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    the chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        # Every input reader is invoked up front; the resulting iterables
+        # are then drained one after another, in argument order.
+        streams = [start() for start in readers]
+        for stream in streams:
+            for item in stream:
+                yield item
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    """Raised by compose() when its readers yield different item counts."""
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of input readers.
+
+    If input readers output following data entries:
+    (1, 2)    3    (4, 5)
+    The composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    Readers may yield ``None`` as data; misalignment is detected with a
+    private sentinel, not by looking for ``None``.
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+    # Private padding marker, so a reader may legitimately yield None.
+    missing = object()
+
+    def make_tuple(x):
+        # items that are not tuples count as 1-tuples when concatenating
+        return x if isinstance(x, tuple) else (x, )
+
+    def reader():
+        rs = [r() for r in readers]
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs, fillvalue=missing):
+                if any(o is missing for o in outputs):
+                    # a shorter reader ran out before the others
+                    raise ComposeNotAligned(
+                        "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    A daemon thread reads entries from ``reader`` ahead of the consumer and
+    stores them in a ``Queue(maxsize=size)``; the returned reader yields
+    from that queue until the source is exhausted.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(target=read_worker, args=(r, q))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        # Identity, not equality: `!=` would invoke the entry's own
+        # comparison, and e.g. a numpy array answers element-wise, which
+        # cannot be used as a loop condition.
+        while e is not end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Caps the number of samples a reader hands out.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples to yield; a negative n never cuts off.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
+    """
+
+    # TODO(yuyang18): Check if just drop the reader, could clean the opened
+    # resource or not?
+
+    def firstn_reader():
+        for count, sample in enumerate(reader()):
+            if count == n:
+                return
+            yield sample
+
+    return firstn_reader
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6a1d337b232c7a849a8793894bf16d26d609d3dd
--- /dev/null
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -0,0 +1 @@
+add_python_test(reader_tests creator_test.py decorator_test.py)
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f8d7133b8694aae5541eff9576eaba8a31e77dc
--- /dev/null
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -0,0 +1,40 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+
+import numpy as np
+
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        # compare whole rows in order, so a short or empty read also fails
+        self.assertEqual([list(e) for e in reader()], l)
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        # exact contents of the fixture, so an empty read fails as well
+        self.assertEqual(list(reader()), ["0 1", "2 3", "4 5"])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..734154b9790a4dc118d11992343648364c907305
--- /dev/null
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -0,0 +1,125 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    """Return a reader yielding 0..9, sleeping ``dur`` seconds before each."""
+    def reader():
+        for number in range(10):
+            time.sleep(dur)  # the delay lets tests observe buffering
+            yield number
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        """map_readers applies the function to every item, in order."""
+        vocabulary = {"h": 0, "i": 1}
+
+        def tokenize(token):
+            return vocabulary[token]
+
+        def read():
+            return iter(("h", "i"))
+
+        mapped = paddle.v2.reader.map_readers(tokenize, read)
+        for expected, actual in enumerate(mapped()):
+            self.assertEqual(actual, expected)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # every read from the source reader takes 30ms.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # buffered during the 0.3s pause above, so this read is fast.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compse(self):
+        """Aligned readers are zipped item by item."""
+        ten = reader_creator_10(0)
+        composed = paddle.v2.reader.compose(ten, ten)
+        for position, item in enumerate(composed()):
+            self.assertEqual(item, (position, position))
+
+    def test_compose_not_aligned(self):
+        """A length mismatch raises after the aligned prefix is yielded."""
+        ten = reader_creator_10(0)
+        twenty = paddle.v2.reader.chain(ten, ten)
+        composed = paddle.v2.reader.compose(twenty, ten)
+        seen = 0
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for _ in composed():
+                seen += 1
+        # expecting 10, not 20
+        self.assertEqual(seen, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        """With check_alignment=False the surplus items are dropped."""
+        ten = reader_creator_10(0)
+        twenty = paddle.v2.reader.chain(ten, ten)
+        composed = paddle.v2.reader.compose(
+            twenty, ten, check_alignment=False)
+        # expecting 10, not 20
+        self.assertEqual(len(list(composed())), 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        ten = reader_creator_10(0)
+        chained = paddle.v2.reader.chain(ten, ten)  # 0..9 followed by 0..9
+        items = list(chained())
+        for position, item in enumerate(items):
+            self.assertEqual(item, position % 10)
+        self.assertEqual(len(items), 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        # (buffer size, whether that size must leave the order untouched)
+        cases = [(0, True), (1, True), (10, False), (100, False)]
+        source = reader_creator_10(0)
+        for size, keeps_order in cases:
+            shuffled = paddle.v2.reader.shuffle(source, size)
+            items = list(shuffled())
+            if keeps_order:
+                for position, item in enumerate(items):
+                    self.assertEqual(position, item)
+            self.assertEqual(len(items), 10)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31
--- /dev/null
+++ b/python/paddle/v2/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..eb02e53706b4834eb9dc75d0e3a809772b124725
--- /dev/null
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_python_test(test_v2_api test_data_feeder.py test_parameters.py
+test_layer.py test_rnn_layer.py test_topology.py test_image.py)
diff --git a/python/paddle/v2/tests/cat.jpg b/python/paddle/v2/tests/cat.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..bc1fbbd371216b9904b522ed302700c79d2e4876
Binary files /dev/null and b/python/paddle/v2/tests/cat.jpg differ
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..71eb3bf31425c22b47accc11c9550042e077ef12
--- /dev/null
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+
+from paddle.v2 import data_type
+from paddle.v2.data_feeder import DataFeeder
+
+
+class DataFeederTest(unittest.TestCase):
+    def dense_reader(self, size):
+        """Return ``size`` random floats drawn by np.random.random."""
+        return np.random.random(size)
+
+    def sparse_binary_reader(self, high, size_limit, non_empty=False):
+        num = np.random.randint(size_limit)  # length of the list; may be 0
+        while non_empty and num == 0:  # redraw until the length is positive
+            num = np.random.randint(size_limit)
+        return np.random.randint(high, size=num).tolist()
+
+    def test_dense(self):
+        """Dense vectors fed as ndarray, list or tuple come back unchanged."""
+
+        def compare(input):
+            feeder = DataFeeder([('image', data_type.dense_vector(784))],
+                                {'image': 0})
+            arg = feeder(input)
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            input = np.array(input, dtype='float32')
+            # Element-wise check. Every sample wraps its one feature in a
+            # container, so reshape to the output's shape before comparing.
+            self.assertTrue(
+                np.allclose(input.reshape(output.shape), output))
+
+        batch_size = 32
+        dim = 784
+        # test numpy array
+        data = []
+        for i in xrange(batch_size):
+            data.append([self.dense_reader(dim)])
+        compare(data)
+
+        # each feature is a list
+        data = []
+        for i in xrange(batch_size):
+            data.append([self.dense_reader(dim).tolist()])
+        compare(data)
+
+        # test tuple
+        data = []
+        for i in xrange(batch_size):
+            data.append((self.dense_reader(dim).tolist(), ))
+        compare(data)
+
+    def test_sparse_binary(self):
+        """Column ids of every sparse-binary row survive the feeder."""
+        dim, batch_size = 10000, 32
+        # one feature per sample: a (possibly empty) list of column ids
+        data = [[self.sparse_binary_reader(dim, 50)]
+                for _ in xrange(batch_size)]
+
+        types = [('input', data_type.sparse_binary_vector(dim))]
+        feeder = DataFeeder(types, {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+
+        for row, sample in enumerate(data):
+            self.assertEqual(output.getSparseRowCols(row), sample[0])
+
+    def test_sparse(self):
+        """Column ids and float values of sparse rows survive the feeder."""
+        dim = 10000
+        batch_size = 32
+        v = []
+        w = []
+        data = []
+        for dat in xrange(batch_size):
+            a = self.sparse_binary_reader(dim, 40, non_empty=True)
+            b = self.dense_reader(len(a)).tolist()
+            v.append(a)
+            w.append(np.array(b, dtype="float32"))
+            data.append([zip(a, b)])
+
+        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), v[i])
+            cols_value = output.getSparseRowColsVal(i)
+            value = [val[1] for val in cols_value]
+            value = np.array(value, dtype="float32")
+            # compare the numbers; .all() on each side only yields a bool
+            self.assertTrue(np.allclose(value, w[i]))
+
+    def test_integer(self):
+        """Integer ids come back from the feeder in the order given."""
+        value_range = 100
+        batch_size = 32
+        index = []
+        for i in xrange(batch_size):
+            index.append([np.random.randint(value_range)])
+        feeder = DataFeeder([('input', data_type.integer_value(value_range))],
+                            {'input': 0})
+        arg = feeder(index)
+        output = arg.getSlotIds(0).copyToNumpyArray()
+        index = np.array(index, dtype='int')
+        # element-wise equality; comparing .all() results only compares bools
+        self.assertTrue(np.array_equal(output, index.flatten()))
+
+    def test_integer_sequence(self):
+        """Sequence ids and their start offsets survive the feeder."""
+        value_range = 10000
+        batch_size = 32
+        start = [0]
+        data = []
+        for i in xrange(batch_size):
+            seq = self.sparse_binary_reader(value_range, 30, non_empty=True)
+            data.append([seq])
+            start.append(len(seq) + start[-1])
+        feeder = DataFeeder(
+            [('input', data_type.integer_value_sequence(value_range))],
+            {'input': 0})
+        arg = feeder(data)
+        output_data = arg.getSlotIds(0).copyToNumpyArray()
+        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
+
+        index = []
+        for dat in data:
+            index.extend(dat[0])  # only one feature, so dat[0]
+        index = np.array(index, dtype='int')
+        start = np.array(start, dtype='int')
+        # Element-wise equality: comparing the .all() of each side would
+        # only compare two booleans.
+        self.assertTrue(np.array_equal(output_data, index))
+        self.assertTrue(np.array_equal(output_start, start))
+
+    def test_multiple_features(self):
+        """Map reader columns to several inputs and check each output slot."""
+        batch_size = 2
+        data = []
+        for _ in xrange(batch_size):
+            # Sample layout: [integer id, sparse binary columns, dense vector].
+            data.append([
+                np.random.randint(10),
+                self.sparse_binary_reader(20000, 40, non_empty=True),
+                self.dense_reader(100),
+            ])
+
+        # test multiple features (dense rows are compared element-wise)
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_sparse = arg.getSlotValue(1)
+        output_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(output_dense[i], data[i][2]))
+            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, but only use 2 features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_index = arg.getSlotIds(1).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(output_dense[i], data[i][2]))
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, one is duplicate data
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10)),
+                      ('fea3', data_type.dense_vector(100))]
+        feeder = DataFeeder(data_types,
+                            {'fea0': 2,
+                             'fea1': 1,
+                             'fea2': 0,
+                             'fea3': 2})
+        arg = feeder(data)
+        fea0 = arg.getSlotValue(0).copyToNumpyMat()
+        fea1 = arg.getSlotValue(1)
+        fea2 = arg.getSlotIds(2).copyToNumpyArray()
+        fea3 = arg.getSlotValue(3).copyToNumpyMat()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(fea0[i], data[i][2]))
+            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
+            self.assertEqual(fea2[i], data[i][0])
+            self.assertTrue(np.allclose(fea3[i], data[i][2]))
+
+    def test_multiple_features_tuple(self):
+        """Same mapping check with every sample given as a tuple."""
+        batch_size = 2
+        data = []
+        for _ in xrange(batch_size):
+            a = np.random.randint(10)
+            b = self.sparse_binary_reader(20000, 40, non_empty=True)
+            c = self.dense_reader(100)
+            data.append((a, b, c))
+
+        # test multiple features (dense rows are compared element-wise)
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        out_dense = arg.getSlotValue(0).copyToNumpyMat()
+        out_sparse = arg.getSlotValue(1)
+        out_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(out_dense[i], data[i][2]))
+            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(out_index[i], data[i][0])
+
+
+if __name__ == '__main__':
+    api.initPaddle("--use_gpu=0")
+    suite = unittest.TestLoader().loadTestsFromTestCase(DataFeederTest)
+    unittest.TextTestRunner().run(suite)  # NOTE(review): result is discarded
+    if api.isGpuVersion():  # run again after switching GPU use on
+        api.setUseGpu(True)
+        unittest.main()
diff --git a/python/paddle/v2/tests/test_image.py b/python/paddle/v2/tests/test_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..b2d773510de28ca2614e95b465c73b82aa7b0463
--- /dev/null
+++ b/python/paddle/v2/tests/test_image.py
@@ -0,0 +1,42 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+
+import paddle.v2.image as image
+
+
+class Image(unittest.TestCase):
+    def test_resize_flip_chw(self):
+        # resize: the shorter side becomes 256 and three channels remain
+        im = image.load_image('cat.jpg')
+        im = image.resize_short(im, 256)
+        self.assertEqual(256, min(im.shape[:2]))
+        self.assertEqual(3, im.shape[2])
+
+        # flip: build the reference from the un-flipped image, compare arrays
+        im2 = np.flip(im, 1)
+        im = image.left_right_flip(im)
+        self.assertTrue(np.array_equal(im, im2))
+
+        # to_chw: (h, w, c) becomes (c, h, w)
+        h, w, c = im.shape
+        im = image.to_chw(im)
+        self.assertEqual(c, im.shape[0])
+        self.assertEqual(h, im.shape[1])
+        self.assertEqual(w, im.shape[2])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c67f3b84d96eb92d94ad80cc54c5e056103c1a1a
--- /dev/null
+++ b/python/paddle/v2/tests/test_layer.py
@@ -0,0 +1,286 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import paddle.v2.activation as activation
+import paddle.v2.attr as attr
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+import paddle.v2.pooling as pooling
+import paddle.v2.networks as networks
+import paddle.v2.evaluator as evaluator
+
+pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
+label = layer.data(name='label', type=data_type.integer_value(10))
+weight = layer.data(name='weight', type=data_type.dense_vector(1))
+combine_weight = layer.data(
+    name='weight_combine', type=data_type.dense_vector(10))
+score = layer.data(name='score', type=data_type.dense_vector(1))
+
+hidden = layer.fc(input=pixel,
+                  size=100,
+                  act=activation.Sigmoid(),
+                  param_attr=attr.Param(name='hidden'))
+inference = layer.fc(input=hidden, size=10, act=activation.Softmax())
+conv = layer.img_conv(
+    input=pixel,
+    filter_size=1,
+    filter_size_y=1,
+    num_channels=8,
+    num_filters=16,
+    act=activation.Linear())
+
+
+class ImageLayerTest(unittest.TestCase):
+    def test_conv_layer(self):
+        conv_shift = layer.conv_shift(a=pixel, b=score)
+        print layer.parse_network(conv, conv_shift)
+
+    def test_pooling_layer(self):
+        maxpool = layer.img_pool(
+            input=conv,
+            pool_size=2,
+            num_channels=16,
+            padding=1,
+            pool_type=pooling.Max())
+        spp = layer.spp(input=conv,
+                        pyramid_height=2,
+                        num_channels=16,
+                        pool_type=pooling.Max())
+        maxout = layer.maxout(input=conv, num_channels=16, groups=4)
+        print layer.parse_network([maxpool, spp, maxout])
+
+    def test_norm_layer(self):
+        norm1 = layer.img_cmrnorm(input=conv, size=5)
+        norm2 = layer.batch_norm(input=conv)
+        norm3 = layer.sum_to_one_norm(input=conv)
+        print layer.parse_network([norm1, norm2, norm3])
+
+
+class AggregateLayerTest(unittest.TestCase):
+    def test_aggregate_layer(self):
+        pool = layer.pooling(
+            input=pixel,
+            pooling_type=pooling.Avg(),
+            agg_level=layer.AggregateLevel.EACH_SEQUENCE)
+        last_seq = layer.last_seq(input=pixel)
+        first_seq = layer.first_seq(input=pixel)
+        concat = layer.concat(input=[last_seq, first_seq])
+        seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
+        print layer.parse_network(
+            [pool, last_seq, first_seq, concat, seq_concat])
+
+
+class MathLayerTest(unittest.TestCase):
+    def test_math_layer(self):
+        addto = layer.addto(input=[pixel, pixel])
+        linear_comb = layer.linear_comb(
+            weights=combine_weight, vectors=hidden, size=10)
+        interpolation = layer.interpolation(
+            input=[hidden, hidden], weight=score)
+        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)
+        power = layer.power(input=pixel, weight=score)
+        scaling = layer.scaling(input=pixel, weight=score)
+        slope = layer.slope_intercept(input=pixel)
+        tensor = layer.tensor(a=pixel, b=pixel, size=1000)
+        cos_sim = layer.cos_sim(a=pixel, b=pixel)
+        trans = layer.trans(input=tensor)
+        print layer.parse_network([  # NOTE(review): bilinear is not listed
+            addto, linear_comb, interpolation, power, scaling, slope, tensor,
+            cos_sim, trans
+        ])
+
+
+class ReshapeLayerTest(unittest.TestCase):
+    def test_reshape_layer(self):
+        block_expand = layer.block_expand(
+            input=conv, num_channels=4, stride_x=1, block_x=1)
+        expand = layer.expand(
+            input=weight,
+            expand_as=pixel,
+            expand_level=layer.ExpandLevel.FROM_TIMESTEP)
+        repeat = layer.repeat(input=pixel, num_repeats=4)
+        reshape = layer.seq_reshape(input=pixel, reshape_size=4)
+        rotate = layer.rotate(input=pixel, height=16, width=49)
+        print layer.parse_network(
+            [block_expand, expand, repeat, reshape, rotate])
+
+
+class RecurrentLayerTest(unittest.TestCase):
+    def test_recurrent_layer(self):
+        word = layer.data(name='word', type=data_type.integer_value(12))
+        recurrent = layer.recurrent(input=word)
+        lstm = layer.lstmemory(input=word)
+        gru = layer.grumemory(input=word)
+        print layer.parse_network([recurrent, lstm, gru])
+
+
+class CostLayerTest(unittest.TestCase):
+    def test_cost_layer(self):
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.classification_cost(
+            input=inference, label=label, weight=weight)
+        cost3 = layer.cross_entropy_cost(input=inference, label=label)
+        cost4 = layer.cross_entropy_with_selfnorm_cost(
+            input=inference, label=label)
+        cost5 = layer.mse_cost(input=inference, label=label)
+        cost6 = layer.mse_cost(input=inference, label=label, weight=weight)
+        cost7 = layer.multi_binary_label_cross_entropy_cost(
+            input=inference, label=label)
+        cost8 = layer.rank_cost(left=score, right=score, label=score)
+        cost9 = layer.lambda_cost(input=inference, score=score)
+        cost10 = layer.sum_cost(input=inference)
+        cost11 = layer.huber_cost(input=score, label=label)
+
+        print layer.parse_network([cost1, cost2])
+        print layer.parse_network([cost3, cost4])
+        print layer.parse_network([cost5, cost6])
+        print layer.parse_network([cost7, cost8, cost9, cost10, cost11])
+
+        crf = layer.crf(input=inference, label=label)
+        crf_decoding = layer.crf_decoding(input=inference, size=3)
+        ctc = layer.ctc(input=inference, label=label)
+        warp_ctc = layer.warp_ctc(input=pixel, label=label)
+        nce = layer.nce(input=inference, label=label, num_classes=3)
+        hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
+
+        print layer.parse_network(
+            [crf, crf_decoding, ctc, warp_ctc, nce, hsigmoid])
+
+
+class OtherLayerTest(unittest.TestCase):
+    def test_sampling_layer(self):
+        maxid = layer.max_id(input=inference)
+        sampling_id = layer.sampling_id(input=inference)
+        eos = layer.eos(input=maxid, eos_id=5)
+        print layer.parse_network([maxid, sampling_id, eos])
+
+    def test_slicing_joining_layer(self):
+        pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+        print layer.parse_network(pad)
+
+
+class ProjOpTest(unittest.TestCase):
+    def test_projection(self):
+        input = layer.data(name='data', type=data_type.dense_vector(784))
+        word = layer.data(
+            name='word', type=data_type.integer_value_sequence(10000))
+        fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
+        mixed0 = layer.mixed(
+            size=256,
+            input=[
+                layer.full_matrix_projection(input=fc0),
+                layer.full_matrix_projection(input=fc1)
+            ])
+        with layer.mixed(size=200) as mixed1:
+            mixed1 += layer.full_matrix_projection(input=fc0)
+            mixed1 += layer.identity_projection(input=fc1)
+
+        table = layer.table_projection(input=word)
+        emb0 = layer.mixed(size=512, input=table)
+        with layer.mixed(size=512) as emb1:
+            emb1 += table
+
+        scale = layer.scaling_projection(input=fc0)
+        scale0 = layer.mixed(size=100, input=scale)
+        with layer.mixed(size=100) as scale1:
+            scale1 += scale
+
+        dotmul = layer.dotmul_projection(input=fc0)
+        dotmul0 = layer.mixed(size=100, input=dotmul)
+        with layer.mixed(size=100) as dotmul1:
+            dotmul1 += dotmul
+
+        context = layer.context_projection(input=fc0, context_len=5)
+        context0 = layer.mixed(size=100, input=context)
+        with layer.mixed(size=100) as context1:
+            context1 += context
+
+        conv = layer.conv_projection(
+            input=input,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv, bias_attr=True)
+        with layer.mixed(bias_attr=True) as conv1:
+            conv1 += conv
+
+        print layer.parse_network(mixed0)
+        print layer.parse_network(mixed1)
+        print layer.parse_network(emb0)
+        print layer.parse_network(emb1)
+        print layer.parse_network(scale0)
+        print layer.parse_network(scale1)
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+    def test_operator(self):
+        ipt0 = layer.data(name='data', type=data_type.dense_vector(784))
+        ipt1 = layer.data(name='word', type=data_type.dense_vector(128))
+        fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+
+        dotmul_op = layer.dotmul_operator(a=fc0, b=fc1)
+        dotmul0 = layer.mixed(input=dotmul_op)
+        with layer.mixed() as dotmul1:
+            dotmul1 += dotmul_op
+
+        conv = layer.conv_operator(
+            img=ipt0,
+            filter=ipt1,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv)
+        with layer.mixed() as conv1:
+            conv1 += conv
+
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+
+class NetworkTests(unittest.TestCase):
+    def test_vgg(self):
+        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        vgg_out = networks.small_vgg(
+            input_image=img, num_channels=1, num_classes=2)
+        print layer.parse_network(vgg_out)
+
+
+class EvaluatorTest(unittest.TestCase):
+    def test_evaluator(self):
+        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        output = layer.fc(input=img,
+                          size=10,
+                          act=activation.Softmax(),
+                          name='fc_here')
+        lbl = layer.data(name='label', type=data_type.integer_value(10))
+        cost = layer.cross_entropy_cost(input=output, label=lbl)
+
+        evaluator.classification_error(input=output, label=lbl)
+        print layer.parse_network(cost)
+        print layer.parse_network(output)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb182caab6430862a8e4da2ae4ea6b1e72f726c
--- /dev/null
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -0,0 +1,60 @@
+import unittest
+import sys
+
+try:
+    import py_paddle
+
+    del py_paddle
+except ImportError:
+    print >> sys.stderr, "It seems swig of Paddle is not installed, this " \
+                         "unittest will not be run."
+    sys.exit(0)
+
+import paddle.v2.parameters as parameters
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import random
+import cStringIO
+import numpy
+
+
+def __rand_param_config__(name):
+    conf = ParameterConfig()
+    conf.name = name
+    size = 1
+    for i in xrange(2):
+        dim = random.randint(1, 1000)
+        conf.dims.append(dim)
+        size *= dim
+    conf.size = size
+    assert conf.IsInitialized()
+    return conf
+
+
+class TestParameters(unittest.TestCase):
+    def test_serialization(self):
+        params = parameters.Parameters()
+        params.__append_config__(__rand_param_config__("param_0"))
+        params.__append_config__(__rand_param_config__("param_1"))
+
+        for name in params.names():
+            param = params.get(name)
+            param[:] = numpy.random.uniform(
+                -1.0, 1.0, size=params.get_shape(name))
+            params.set(name, param)
+
+        tmp_file = cStringIO.StringIO()
+        params.to_tar(tmp_file)
+        tmp_file.seek(0)
+        params_dup = parameters.Parameters.from_tar(tmp_file)
+
+        self.assertEqual(params_dup.names(), params.names())
+
+        for name in params.names():
+            self.assertEqual(params.get_shape(name), params_dup.get_shape(name))
+            p0 = params.get(name)
+            p1 = params_dup.get(name)
+            self.assertTrue(numpy.isclose(p0, p1).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbbd20eb76bb9daab2bcf98c4adad989106a377
--- /dev/null
+++ b/python/paddle/v2/tests/test_rnn_layer.py
@@ -0,0 +1,155 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import difflib
+import unittest
+
+import paddle.trainer_config_helpers as conf_helps
+import paddle.v2.activation as activation
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as parse_network
+
+
+class RNNTest(unittest.TestCase):
+    def test_simple_rnn(self):
+        dict_dim = 10
+        word_dim = 8
+        hidden_dim = 8
+
+        def parse_old_rnn():
+            def step(y):
+                mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
+                out = conf_helps.fc_layer(
+                    input=[y, mem],
+                    size=hidden_dim,
+                    act=activation.Tanh(),
+                    bias_attr=True,
+                    name="rnn_state")
+                return out
+
+            def test():
+                data = conf_helps.data_layer(name="word", size=dict_dim)
+                embd = conf_helps.embedding_layer(input=data, size=word_dim)
+                conf_helps.recurrent_group(name="rnn", step=step, input=embd)
+
+            return str(parse_network(test))
+
+        def parse_new_rnn():
+            def new_step(y):
+                mem = layer.memory(name="rnn_state", size=hidden_dim)
+                out = layer.fc(input=[y, mem],
+                               size=hidden_dim,
+                               act=activation.Tanh(),
+                               bias_attr=True,
+                               name="rnn_state")
+                return out
+
+            data = layer.data(
+                name="word", type=data_type.integer_value(dict_dim))
+            embd = layer.embedding(input=data, size=word_dim)
+            rnn_layer = layer.recurrent_group(
+                name="rnn", step=new_step, input=embd)
+            return str(layer.parse_network(rnn_layer))
+
+        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
+                                    parse_new_rnn().splitlines(1))
+        print ''.join(diff)
+
+    def test_sequence_rnn_multi_input(self):
+        dict_dim = 10
+        word_dim = 8
+        hidden_dim = 8
+        label_dim = 3
+
+        def parse_old_rnn():
+            def test():
+                data = conf_helps.data_layer(name="word", size=dict_dim)
+                label = conf_helps.data_layer(name="label", size=label_dim)
+                emb = conf_helps.embedding_layer(input=data, size=word_dim)
+                boot_layer = conf_helps.data_layer(name="boot", size=10)
+                boot_layer = conf_helps.fc_layer(
+                    name='boot_fc', input=boot_layer, size=10)
+
+                def step(y, wid):
+                    z = conf_helps.embedding_layer(input=wid, size=word_dim)
+                    mem = conf_helps.memory(
+                        name="rnn_state",
+                        size=hidden_dim,
+                        boot_layer=boot_layer)
+                    out = conf_helps.fc_layer(
+                        input=[y, z, mem],
+                        size=hidden_dim,
+                        act=conf_helps.TanhActivation(),
+                        bias_attr=True,
+                        name="rnn_state")
+                    return out
+
+                out = conf_helps.recurrent_group(
+                    name="rnn", step=step, input=[emb, data])
+
+                rep = conf_helps.last_seq(input=out)
+                prob = conf_helps.fc_layer(
+                    size=label_dim,
+                    input=rep,
+                    act=conf_helps.SoftmaxActivation(),
+                    bias_attr=True)
+
+                conf_helps.outputs(
+                    conf_helps.classification_cost(
+                        input=prob, label=label))
+
+            return str(parse_network(test))
+
+        def parse_new_rnn():
+            data = layer.data(
+                name="word", type=data_type.dense_vector(dict_dim))
+            label = layer.data(
+                name="label", type=data_type.dense_vector(label_dim))
+            emb = layer.embedding(input=data, size=word_dim)
+            boot_layer = layer.data(
+                name="boot", type=data_type.dense_vector(10))
+            boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10)
+
+            def step(y, wid):
+                z = layer.embedding(input=wid, size=word_dim)
+                mem = layer.memory(
+                    name="rnn_state", size=hidden_dim, boot_layer=boot_layer)
+                out = layer.fc(input=[y, z, mem],
+                               size=hidden_dim,
+                               act=activation.Tanh(),
+                               bias_attr=True,
+                               name="rnn_state")
+                return out
+
+            out = layer.recurrent_group(
+                name="rnn", step=step, input=[emb, data])
+
+            rep = layer.last_seq(input=out)
+            prob = layer.fc(size=label_dim,
+                            input=rep,
+                            act=activation.Softmax(),
+                            bias_attr=True)
+
+            cost = layer.classification_cost(input=prob, label=label)
+
+            return str(layer.parse_network(cost))
+
+        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
+                                    parse_new_rnn().splitlines(1))
+        print ''.join(diff)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c6dbcdb4f49b960fb8b71aecbad4f013d2cd283
--- /dev/null
+++ b/python/paddle/v2/tests/test_topology.py
@@ -0,0 +1,84 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle.v2.layer as layer
+import paddle.v2.topology as topology
+import paddle.v2.data_type as data_type
+import paddle.trainer_config_helpers as conf_helps
+import paddle.trainer.PyDataProvider2 as pydp2
+
+
+class TestTopology(unittest.TestCase):
+    def test_data_type(self):
+        """data_type() yields one (name, type) entry for each data layer."""
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(
+            input=pixel, size=100, act=conf_helps.SigmoidActivation())
+        inference = layer.fc(
+            input=hidden, size=10, act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        data_types = topo.data_type()
+        self.assertEqual(len(data_types), 2)
+        # Each input name must appear exactly once with its declared type.
+        pixel_entries = [dt for dt in data_types if dt[0] == "pixel"]
+        self.assertEqual(len(pixel_entries), 1)
+        pixel_input = pixel_entries[0][1]
+        self.assertEqual(pixel_input.type, pydp2.DataType.Dense)
+        self.assertEqual(pixel_input.dim, 784)
+
+        label_entries = [dt for dt in data_types if dt[0] == "label"]
+        self.assertEqual(len(label_entries), 1)
+        label_input = label_entries[0][1]
+        self.assertEqual(label_input.type, pydp2.DataType.Index)
+        self.assertEqual(label_input.dim, 10)
+
+    def test_get_layer(self):
+        """get_layer(name) is compared with the data layer of that name."""
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(
+            input=pixel, size=100, act=conf_helps.SigmoidActivation())
+        inference = layer.fc(
+            input=hidden, size=10, act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        # Look both inputs up first, then compare with the originals.
+        found_pixel = topo.get_layer("pixel")
+        found_label = topo.get_layer("label")
+        self.assertEqual(found_pixel, pixel)
+        self.assertEqual(found_label, label)
+
+    def test_parse(self):
+        """Call proto() on topologies built from a layer and from lists."""
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(
+            input=pixel, size=100, act=conf_helps.SigmoidActivation())
+        inference = layer.fc(
+            input=hidden, size=10, act=conf_helps.SoftmaxActivation())
+        maxid = layer.max_id(input=inference)
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.cross_entropy_cost(input=inference, label=label)
+
+        # A bare layer, a one-element list, then two two-element lists.
+        topology.Topology(cost2).proto()
+        topology.Topology([cost1]).proto()
+        topology.Topology([cost1, cost2]).proto()
+        topology.Topology([inference, maxid]).proto()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e46e4973f467a017de3d2b45186690af16dd123
--- /dev/null
+++ b/python/paddle/v2/topology.py
@@ -0,0 +1,142 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+from paddle.proto.ModelConfig_pb2 import ModelConfig
+
+import layer as v2_layer
+
+__all__ = ['Topology']
+
+
+def __flatten__(lis):
+    """
+    Collapse a sequence, nested to any depth, into one flat list whose
+    items keep their original left-to-right order.
+    """
+    flat = []
+    for element in lis:
+        nested = isinstance(element, collections.Sequence)
+        # Sequences are expanded recursively; anything else is kept as-is.
+        flat += __flatten__(element) if nested else [element]
+    return flat
+
+
+def __bfs_travel__(callback, *layers):
+    """Depth-first walk (despite the name) over layers and their parents.
+    Returns True, unwinding every recursion level, once callback is truthy."""
+    for each_layer in __flatten__(layers):
+        if callback(each_layer) or __bfs_travel__(
+                callback, *(each_layer.__parent_layers__.values() +
+                            each_layer.extra_parent())):
+            return True
+    return False
+
+
+class Topology(object):
+    """
+    Topology stores the given layers together with the ModelConfig proto
+    that v2_layer.parse_network builds from them.
+
+    :param layers: a layer, or a sequence of layers, to parse.
+    :param extra_layers: optional layer(s) also passed to parse_network.
+    """
+
+    def __init__(self, layers, extra_layers=None):
+        def __check__(layers):
+            # Validate, then copy into a new list: self.layers is extended
+            # below; that must not touch the caller's list nor fail on a tuple.
+            if not isinstance(layers, collections.Sequence):
+                layers = [layers]
+            for layer in layers:
+                __check_layer_type__(layer)
+            return list(layers)
+
+        self.layers = __check__(layers)
+        if extra_layers is not None:
+            extra_layers = __check__(extra_layers)
+
+        self.__model_config__ = v2_layer.parse_network(
+            self.layers, extra_layers=extra_layers)
+
+        if extra_layers is not None:
+            self.layers.extend(extra_layers)
+
+        assert isinstance(self.__model_config__, ModelConfig)
+
+    def use_sparse_updater(self):
+        """True if any parameter sets sparse_update or sparse_remote_update."""
+        return any(p.sparse_update or p.sparse_remote_update
+                   for p in self.__model_config__.parameters)
+
+    def proto(self):
+        """Return the ModelConfig built in __init__."""
+        return self.__model_config__
+
+    def get_layer(self, name):
+        """
+        Find a layer by name among self.layers and their parents.
+
+        :param name: value to compare with each layer's ``name``.
+        :return: a matching layer object.
+        :raise ValueError: if no visited layer has that name.
+        """
+        result_layer = [None]  # one-slot list the closure can write to
+
+        def __impl__(l):
+            if l.name == name:
+                result_layer[0] = l
+                return True  # break
+            return False
+
+        __bfs_travel__(__impl__, *self.layers)
+        if result_layer[0] is None:
+            raise ValueError("No such layer %s" % name)
+        return result_layer[0]
+
+    def data_layers(self):
+        """
+        Collect every v2_layer.DataLayerV2 reachable from self.layers.
+        :return: dict mapping layer name to layer object.
+        """
+        data_layers = dict()
+
+        def __impl__(l):
+            if isinstance(l, v2_layer.DataLayerV2):
+                data_layers[l.name] = l
+
+        __bfs_travel__(__impl__, *self.layers)
+        return data_layers
+
+    def data_type(self):
+        """
+        List (name, type) for each name in proto().input_layer_names, such as:
+        [('image', dense_vector(768)), ('label', integer_value(10))]
+        """
+        data_layers = self.data_layers()
+        return [(nm, data_layers[nm].type)
+                for nm in self.proto().input_layer_names]
+
+    def get_layer_proto(self, name):
+        """Return the layer proto whose name equals ``name``, else None."""
+        for layer in self.__model_config__.layers:
+            if layer.name == name:
+                return layer
+        return None
+
+
+def __check_layer_type__(layer):  # raise ValueError unless a LayerV2
+    if not isinstance(layer, v2_layer.LayerV2):
+        raise ValueError('layer should have type paddle.layer.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec9fcfb749f1a858713d3d6672118e521fbdcb32
--- /dev/null
+++ b/python/paddle/v2/trainer.py
@@ -0,0 +1,213 @@
+"""
+Module Trainer
+"""
+import collections
+import gzip
+import os
+
+import py_paddle.swig_paddle as api
+
+from data_feeder import DataFeeder
+from topology import Topology
+from . import event as v2_event
+from . import optimizer as v2_optimizer
+from . import parameters as v2_parameters
+
+__all__ = ['SGD']
+
+
+def default_event_handler(event):
+    """
+    Fallback handler that SGD.train uses when event_handler is None.
+
+    For now it ignores the event and does nothing.
+    TODO(yuyang18): Complete it! (planned: print some log and save model)
+
+    :param event: the event being reported; currently unused.
+    """
+
+
+class SGD(object):
+    """
+    Simple SGD Trainer: combines a data reader, the network topology and
+    update_equation to train/test a neural network.
+
+    :param cost: Target cost that the neural network should optimize.
+    :type cost: paddle.v2.config_base.Layer
+    :param parameters: The parameters dictionary.
+    :type parameters: paddle.v2.parameters.Parameters
+    :param update_equation: The optimizer object.
+    :type update_equation: paddle.v2.optimizer.Optimizer
+    :param extra_layers: Some layers in the neural network graph are not
+                         in the path of cost layer.
+    :type extra_layers: paddle.v2.config_base.Layer
+    :param is_local: If true, sparse_remote_update is disabled on parameters.
+    """
+
+    def __init__(self,
+                 cost,
+                 parameters,
+                 update_equation,
+                 extra_layers=None,
+                 is_local=True):
+
+        if not isinstance(parameters, v2_parameters.Parameters):
+            raise TypeError('parameters should be parameters')
+
+        if not isinstance(update_equation, v2_optimizer.Optimizer):
+            raise TypeError("update equation parameter must be "
+                            "paddle.v2.optimizer.Optimizer")
+        topology = Topology(cost, extra_layers=extra_layers)
+        self.__optimizer__ = update_equation
+        self.__topology__ = topology
+        self.__parameters__ = parameters
+        self.__topology_in_proto__ = topology.proto()
+        self.__is_local__ = is_local
+
+        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
+        # In local mode, disable sparse_remote_update (after the check above).
+        if is_local:
+            for param in self.__topology_in_proto__.parameters:
+                if param.sparse_remote_update:
+                    param.sparse_remote_update = False
+
+        self.__gm_create_mode__ = api.CREATE_MODE_NORMAL if not \
+            self.__use_sparse_updater__ else api.CREATE_MODE_SGD_SPARSE_CPU_TRAINING
+        self.__data_types__ = topology.data_type()
+        gm = api.GradientMachine.createFromConfigProto(
+            self.__topology_in_proto__, self.__gm_create_mode__,
+            self.__optimizer__.enable_types())
+        assert isinstance(gm, api.GradientMachine)
+        self.__gradient_machine__ = gm
+        self.__gradient_machine__.randParameters()
+        self.__parameters__.append_gradient_machine(gm)
+        self.__parameter_updater__ = None  # created in train()
+
+    def __use_remote_sparse_updater__(self):
+        """True when a sparse updater is in use and training is not local."""
+        return self.__use_sparse_updater__ and not self.__is_local__
+
+    def __prepare_parameter__(self, in_args):
+        """
+        Prepare parameters before forward/backward.
+        When the remote sparse updater is used, parameters are fetched
+        from ps according to the input arguments.
+        :param in_args: input arguments of this batch.
+        """
+        if self.__use_remote_sparse_updater__():
+            self.__gradient_machine__.prefetch(in_args)
+            self.__parameter_updater__.getParametersRemote()
+
+    def save_parameter_to_tar(self, f):
+        """Write parameters with to_tar(f); call only after train()."""
+        self.__parameter_updater__.catchUpWith()
+        self.__parameter_updater__.apply()
+        self.__parameter_updater__.getParametersRemote(True, True)
+        self.__parameters__.to_tar(f)
+        self.__parameter_updater__.restore()
+
+    def train(self, reader, num_passes=1, event_handler=None, feeding=None):
+        """
+        Training method. Will train num_passes of input data.
+
+        :param reader: A reader that reads and yields data items. Usually a
+                       batched reader is used to do mini-batch training.
+        :type reader: callable returning an iterator
+        :param num_passes: The total train passes.
+        :param event_handler: Event handler. A method will be invoked when event
+                              occurred.
+        :type event_handler: (BaseEvent) => None
+        :param feeding: Feeding is a map of neural network input name and array
+                        index that reader returns.
+        :type feeding: dict|list
+        """
+        if event_handler is None:
+            event_handler = default_event_handler
+        __check_train_args__(**locals())
+
+        self.__parameter_updater__ = self.__optimizer__.create_updater(
+            self.__is_local__, num_passes, self.__use_sparse_updater__)
+        self.__parameter_updater__.init(self.__gradient_machine__)
+
+        self.__gradient_machine__.start()
+        batch_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(batch_evaluator, api.Evaluator)
+        pass_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(pass_evaluator, api.Evaluator)
+        out_args = api.Arguments.createArguments(0)
+        feeder = DataFeeder(self.__data_types__, feeding)
+        for pass_id in xrange(num_passes):
+            event_handler(v2_event.BeginPass(pass_id))
+            pass_evaluator.start()
+            self.__parameter_updater__.startPass()
+            for batch_id, data_batch in enumerate(reader()):
+                batch_evaluator.start()
+                event_handler(
+                    v2_event.BeginIteration(
+                        pass_id=pass_id, batch_id=batch_id))
+                pass_type = self.__parameter_updater__.startBatch(
+                    len(data_batch))
+                in_args = feeder(data_batch)
+                self.__prepare_parameter__(in_args)
+                self.__gradient_machine__.forwardBackward(in_args, out_args,
+                                                          pass_type)
+                self.__gradient_machine__.eval(pass_evaluator)
+                self.__gradient_machine__.eval(batch_evaluator)
+                for each_param in self.__gradient_machine__.getNonStaticParameters(
+                ):
+                    self.__parameter_updater__.update(each_param)
+                cost_sum = out_args.sum()
+                cost = cost_sum / len(data_batch)  # cost per item of this batch
+                self.__parameter_updater__.finishBatch(cost)
+                batch_evaluator.finish()
+                event_handler(
+                    v2_event.EndIteration(
+                        pass_id=pass_id,
+                        batch_id=batch_id,
+                        cost=cost,
+                        evaluator=batch_evaluator))
+
+            self.__parameter_updater__.finishPass()
+            pass_evaluator.finish()
+            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+        self.__gradient_machine__.finish()
+
+    def test(self, reader, feeding=None):
+        """
+        Testing method. Will test input data.
+
+        :param reader: A reader that reads and yields data items.
+        :type reader: callable returning an iterable
+        :param feeding: Feeding is a map of neural network input name and array
+                        index that reader returns.
+        :type feeding: dict
+        :return: v2_event.TestResult with the evaluator and the cost per sample.
+        """
+        feeder = DataFeeder(self.__data_types__, feeding)
+        evaluator = self.__gradient_machine__.makeEvaluator()
+        out_args = api.Arguments.createArguments(0)
+        evaluator.start()
+        total_cost = 0
+        num_samples = 0.0
+        for data_batch in reader():
+            num_samples += len(data_batch)
+            in_args = feeder(data_batch)
+            self.__prepare_parameter__(in_args)
+            self.__gradient_machine__.forward(in_args, out_args, api.PASS_TEST)
+            total_cost += out_args.sum()
+            self.__gradient_machine__.eval(evaluator)
+
+        evaluator.finish()
+        return v2_event.TestResult(
+            evaluator=evaluator, cost=total_cost / num_samples)
+
+
+def __check_train_args__(reader, event_handler, **kwargs):
+    """Raise TypeError unless reader is a callable whose call (made once
+    here) returns an iterator and event_handler is callable."""
+    reader_ok = callable(reader) and isinstance(reader(), collections.Iterator)
+    if not reader_ok:
+        raise TypeError('train_data_reader should be a function, '
+                        'which can return a iterator')
+    if not callable(event_handler):
+        raise TypeError('event handler should be a function')
diff --git a/python/setup.py.in b/python/setup.py.in
index b66a42e87c78701e9eb26b1b7dc8f46a95035a76..7d9438e3f8132c2a7fa4774750f5fd15f3beed14 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -4,11 +4,22 @@ packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
-          'paddle.utils']
+          'paddle.utils',
+          'paddle.v2',
+          'paddle.v2.dataset',
+          'paddle.v2.reader',
+          'paddle.v2.plot']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
+      install_requires=[
+          "requests",
+          "numpy",
+          "protobuf==${PROTOBUF_VERSION}",
+          "matplotlib",
+          "opencv-python",
+      ],
       packages=packages,
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}'