diff --git a/.gitignore b/.gitignore
index 1c9730a5ad57cd70613c0692529bcb1ccf056d59..6aae076a49012b032b8fc0f1dc02c2714fb7b4a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,7 @@ build/
 .pydevproject
 Makefile
 .test_env/
+third_party/
 
 *~
 bazel-*
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index f635e65784af47a21df80cc92073ef14eba9a731..0000000000000000000000000000000000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "warp-ctc"]
-	path = warp-ctc
-	url = https://github.com/baidu-research/warp-ctc.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b9902a863d864b28f0fad0fefe64248e356010e4..a6e45028ebc3f53ea20806f0dd2a7acc820607fe 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
     sha: c25201a00e6b0514370501050cf2a8538ac12270
     hooks:
     -   id: remove-crlf
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
@@ -15,7 +15,7 @@
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
diff --git a/.travis.yml b/.travis.yml
index 047ca6ffe79bdaf013f6ef6dbf1a82bdb2f1f2b3..5a7f45a748ac7e81f3f90c245bcf2cd84c4e9027 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,46 +1,38 @@
 language: cpp
-cache: ccache
+cache:
+  directories:
+    - $HOME/third_party
+    - $HOME/.ccache
+    - $HOME/.cache/pip
 sudo: required
 dist: trusty
 os:
   - linux
-  - osx
 env:
   - JOB=DOCS
   - JOB=BUILD_AND_TEST
   - JOB=PRE_COMMIT
-matrix:
-  exclude:
-    - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux.
-    - os: osx
-      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux
 
 addons:
   apt:
     packages:
       - gcc-4.8
       - g++-4.8
-      - wget
+      - gfortran-4.8
       - git
       - build-essential
-      - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
-      - m4
       - python-numpy
       - python-wheel
-      - libgoogle-glog-dev
-      - libgflags-dev
-      - libgtest-dev
       - curl
-      - lcov
-      - graphviz
       - swig
+      - graphviz
       - clang-format-3.8
       - automake
       - libtool
+      - ccache
 before_install:
   - |
     if [ ${JOB} == "BUILD_AND_TEST" ]; then
@@ -53,10 +45,10 @@ before_install:
         fi
       fi
     fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
-  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit requests==2.9.2 LinkChecker
+  # Paddle currently uses protobuf 3.1, and protobuf 3.2 breaks compatibility,
+  # so the Python protobuf package version is pinned explicitly.
+  - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65fbbb481c432f7b905f4dec7ea39c51ec853ae8..4b0682c4fe991a5656e02d6a1845f7d73f61d6ff 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,166 +1,99 @@
-cmake_minimum_required(VERSION 2.8)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.0)
 
 project(paddle CXX C)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
-include(package)
-find_package(SWIG 2.0)
-find_package(CUDA QUIET)
-find_package(Protobuf REQUIRED)
-
-# Check protobuf library version.
-execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
-    OUTPUT_VARIABLE PROTOBUF_VERSION)
-string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
-
-set(PROTOBUF_3 OFF)
-if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
-    set(PROTOBUF_3 ON)
-endif()
 
-find_package(PythonLibs 2.7 REQUIRED)
-find_package(PythonInterp 2.7 REQUIRED)
-find_package(ZLIB REQUIRED)
-find_package(NumPy REQUIRED)
-find_package(Threads REQUIRED)
-find_package(AVX QUIET)
-find_package(Glog REQUIRED)
-find_package(Gflags REQUIRED)
-find_package(GTest)
 find_package(Sphinx)
-find_package(Doxygen)
-include(cblas)
-find_program(M4_EXECUTABLE m4)
-###################### Configurations ###########################
-option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
-option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
-option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
-option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
-option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
-option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
-option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
-option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
-option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
-option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
-option(ON_TRAVIS "Running test on travis-ci or not." OFF)
-option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
-option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
-
-
-include(cpplint)
-include(ccache)
-if(WITH_RDMA)
-  include(rdma)
-endif()
-include(util)
-include(flags)
-include(cudnn)
-include(FindPythonModule)
-include(check_packages)
-include(swig)
-include(coveralls)
-
-# Set PaddlePaddle version to Git tag name or Git commit ID.
+find_package(CUDA QUIET)
 find_package(Git REQUIRED)
-# version.cmake will get the current PADDLE_VERSION
-include(version)
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-
-if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
-    endif()
-
-    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
-    endif()
-
-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
-
-    # Include cuda and cudnn
-    include_directories(${CUDNN_INCLUDE_DIR})
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
-
-if(WITH_DSO)
-    add_definitions(-DPADDLE_USE_DSO)
-endif(WITH_DSO)
-
-if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE)
-    set(ACCURACY double)
-else(WITH_DOUBLE)
-    set(ACCURACY float)
-endif(WITH_DOUBLE)
-
-if(NOT WITH_TIMER)
-    add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(NOT WITH_PROFILER)
-    add_definitions(-DPADDLE_DISABLE_PROFILER)
-endif(NOT WITH_PROFILER)
-
-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
-
-if(WITH_PYTHON)
-    include_directories(${PYTHON_INCLUDE_DIR})
-    include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
-else(WITH_PYTHON)
-    add_definitions(-DPADDLE_NO_PYTHON)
-endif(WITH_PYTHON)
-
-if(WITH_RDMA)
-  include_directories("${RDMA_INC_DIR}")
-else(WITH_RDMA)
-  add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
-
-# glog
-include_directories(${LIBGLOG_INCLUDE_DIR})
-
-#gflags
-add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-include_directories(${GFLAGS_INCLUDE_DIRS})
+find_package(Threads REQUIRED)
 
-if(WITH_TESTING)
-    enable_testing()
-    include_directories(${GTEST_INCLUDE_DIRS})
+include(system)
+include(simd)
+
+################################ Configurations #######################################
+option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
+option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
+option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
+option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
+option(WITH_STYLE_CHECK "Compile PaddlePaddle with style check"         ON)
+option(WITH_PYTHON      "Compile PaddlePaddle with python interpreter"  ON)
+option(WITH_DOUBLE      "Compile PaddlePaddle with double precision"    OFF)
+option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
+option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
+option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
+option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
+option(ON_COVERALLS     "Compile PaddlePaddle with code coverage"       OFF)
+option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
+option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
+
+# CMAKE_BUILD_TYPE
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
+      "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
+      FORCE)
 endif()
 
-include_directories("${CBLAS_INC_DIR}")
+set(THIRD_PARTY_PATH "${PROJ_ROOT}/third_party" CACHE PATH
+  "A path setting third party libraries download & build directories.")
+########################################################################################
+
+include(external/zlib)      # download, build, install zlib
+include(external/gflags)    # download, build, install gflags
+include(external/glog)      # download, build, install glog
+include(external/gtest)     # download, build, install gtest
+include(external/protobuf)  # download, build, install protobuf
+include(external/python)    # download, build, install python
+include(external/openblas)  # download, build, install openblas
+include(external/swig)      # download, build, install swig
+include(external/warpctc)   # download, build, install warpctc
+
+include(package)            # set paddle packages
+include(cpplint)            # set paddle c++ style
+include(ccache)             # set ccache for compilation
+include(util)               # set unittest and link libs
+include(rdma)               # set rdma libraries
+include(flags)              # set paddle compile flags
+include(cudnn)              # set cudnn libraries
+include(version)            # set PADDLE_VERSION
+include(coveralls)          # set code coverage
+
+include(configure)          # add paddle env configuration
+
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
-include_directories(${PROTOBUF_INCLUDE_DIRS})
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-if(EXISTS "${PROJ_ROOT}/paddle/internals/CMakeLists.txt")
-    set(PADDLE_WITH_INTERNAL ON)
-    include(paddle/internals/CMakeLists.txt)
-else()
-    set(PADDLE_WITH_INTERNAL OFF)
-    set(INTERNAL_PROTO_PATH "")
-endif()
+
+set(EXTERNAL_LIBS
+    # gtest is not included here.
+    ${GFLAGS_LIBRARIES}
+    ${GLOG_LIBRARIES}
+    ${CBLAS_LIBRARIES}
+    ${PROTOBUF_LIBRARY}
+    ${ZLIB_LIBRARIES}
+)
+
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
+
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index d319442ef10b38b9edf5844e5540a92c7094c7ce..1c29cb22a31f1e41a6b5575837c6374175cfdea5 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
     ${source}
     ${destination}
     COMMENT "Generating sphinx documentation: ${builder}"
-    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
+    COMMAND cd ${destination} && ln -sf ./index_*.html index.html
     )
 
   set_property(
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 685334c6585060c0344e552c6f3fda2c7324de03..235c95f017f2b6ef24195a0210ccafff36b6ed61 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,9 +13,11 @@
 # system paths.
 #
 
+set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
+set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
+set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
 
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
   ${MKL_ROOT}/include)
@@ -35,11 +37,12 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
 if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_PROVIDER MKL)
   set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
-  set(CBLAS_LIBS ${MKL_INTEL_LP64}
+  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
           ${MKL_SEQUENTIAL_LIB}
           ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
-  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return() # return file.
 endif()
 
@@ -68,9 +71,10 @@ find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
 if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
+  set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
   add_definitions(-DPADDLE_USE_ATLAS)  
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -98,8 +102,9 @@ find_library(OPENBLAS_LIB NAMES openblas
 if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
-  set(CBLAS_LIBS ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
+  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -130,9 +135,7 @@ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
 if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBS ${REFERENCE_CBLAS_LIBRARY})
-  return()
+  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
 endif()
-
-message(FATAL_ERROR "CBlas must be set. Paddle support MKL, ATLAS, OpenBlas, reference-cblas."
-  " Try set MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT or REFERENCE_CBLAS_ROOT.")
diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake
deleted file mode 100644
index afb84c6ff52af05769a99246d2e93380832c04e0..0000000000000000000000000000000000000000
--- a/cmake/check_packages.cmake
+++ /dev/null
@@ -1,39 +0,0 @@
-# Check package for each cmake option
-
-if(WITH_GPU)
-  find_package(CUDA REQUIRED)  # CUDA is required when use gpu
-endif()
-
-if(WITH_PYTHON)
-  find_package(PythonLibs 2.6 REQUIRED)
-  find_package(PythonInterp REQUIRED)
-  find_package(NumPy REQUIRED)
-endif()
-
-if(WITH_STYLE_CHECK)
-  find_package(PythonInterp REQUIRED)
-endif()
-
-find_package(Glog REQUIRED)
-
-find_package(Gflags REQUIRED)
-
-if(WITH_TESTING)
-  find_package(GTest REQUIRED)
-endif()
-
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-endif()
-
-if(WITH_SWIG_PY)
-  if(NOT SWIG_FOUND)
-    message(FATAL_ERROR "SWIG is not found. Please install swig or disable WITH_SWIG_PY")
-  endif()
-  find_python_module(wheel REQUIRED)  # package wheel
-endif()
-
-if(NOT M4_EXECUTABLE)
-  message(FATAL_ERROR "Paddle need m4 to generate proto file.")
-endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0bb016201dd8ae912ac8ec9f925bc5277fad7aed
--- /dev/null
+++ b/cmake/configure.cmake
@@ -0,0 +1,68 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(NOT WITH_PYTHON)
+    add_definitions(-DPADDLE_NO_PYTHON)
+endif(NOT WITH_PYTHON)
+
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+    add_definitions(-DPADDLE_TYPE_DOUBLE)
+endif(WITH_DOUBLE)
+
+if(NOT WITH_TIMER)
+    add_definitions(-DPADDLE_DISABLE_TIMER)
+endif(NOT WITH_TIMER)
+
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
+if(NOT WITH_GPU)
+    add_definitions(-DPADDLE_ONLY_CPU)
+    add_definitions(-DHPPL_STUB_FUNC)
+
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+else()
+    FIND_PACKAGE(CUDA REQUIRED)
+
+    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
+        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+    endif()
+
+    if(NOT CUDNN_FOUND)
+        message(FATAL_ERROR "Paddle need cudnn to compile")
+    endif()
+
+    if(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
+    else(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
+    endif(WITH_AVX)
+
+    # Include cuda and cudnn
+    include_directories(${CUDNN_INCLUDE_DIR})
+    include_directories(${CUDA_TOOLKIT_INCLUDE})
+endif(NOT WITH_GPU)
+
+if(WITH_AVX)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
+else(WITH_AVX)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
+endif(WITH_AVX)
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
index ae3530c3a0eeb79ddbcbf4f2e99be75aa7968a2f..ad9a10cb8616159b9e3aff445e698cb2edb92820 100644
--- a/cmake/coverallsGcovJsons.cmake
+++ b/cmake/coverallsGcovJsons.cmake
@@ -110,14 +110,13 @@ endmacro()
 
 # Get the coverage data.
 file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
-message("GCDA files:")
+message("Process GCDA files:")
+message("===============================")
 
 # Get a list of all the object directories needed by gcov
 # (The directories the .gcda files and .o files are found in)
 # and run gcov on those.
 foreach(GCDA ${GCDA_FILES})
-	message("Process: ${GCDA}")
-	message("------------------------------------------------------------------------------")
 	get_filename_component(GCDA_DIR ${GCDA} PATH)
 
 	#
@@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES})
 	# If -p is not specified then the file is named only "the_file.c.gcov"
 	#
 	execute_process(
-		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
+		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} OUTPUT_QUIET
 		WORKING_DIRECTORY ${GCDA_DIR}
 	)
 endforeach()
@@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
 	set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
 
 	# Generate the final JSON for this file.
-	message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
 	string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
 	set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
 endforeach()
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 241af9a0835b2f100c8fb8b246426e631e42aef3..38c636b30edc0af1c07255814e8bc2b1ad9514da 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -53,7 +53,7 @@ macro(add_style_check_target TARGET_NAME)
             if(LINT MATCHES ON)
                 add_custom_command(TARGET ${TARGET_NAME}
                     PRE_BUILD
-                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                                 "--filter=${STYLE_FILTER}" ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2a49d76eb30f592a28746f5897b14b7dd319d784
--- /dev/null
+++ b/cmake/external/gflags.cmake
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(GFLAGS_SOURCES_DIR ${THIRD_PARTY_PATH}/gflags)
+SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags)
+SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
+IF(WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ELSE(WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
+
+ExternalProject_Add(
+    gflags
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    PREFIX          ${GFLAGS_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DBUILD_TESTING=OFF
+)
+
+LIST(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ab105611c812a4f4b642ac5b1213fdfe93fab97d
--- /dev/null
+++ b/cmake/external/glog.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(GLOG_SOURCES_DIR ${THIRD_PARTY_PATH}/glog)
+SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog)
+SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
+
+IF(WIN32)
+    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+ELSE(WIN32)
+    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
+
+ExternalProject_Add(
+    glog
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS gflags
+    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    PREFIX          ${GLOG_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DWITH_GFLAGS=ON
+    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
+    CMAKE_ARGS      -DBUILD_TESTING=OFF
+)
+
+LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..11d829a9e2f239848803130505c9862695b25029
--- /dev/null
+++ b/cmake/external/gtest.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(WITH_TESTING)
+    ENABLE_TESTING()
+    INCLUDE(ExternalProject)
+
+    SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest)
+    SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest)
+    SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
+
+    INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
+
+    IF(WIN32)
+        set(GTEST_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+        set(GTEST_MAIN_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+    ELSE(WIN32)
+        set(GTEST_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+        set(GTEST_MAIN_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+    ENDIF(WIN32)
+
+    ExternalProject_Add(
+        gtest
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/google/googletest.git"
+        GIT_TAG         "release-1.8.0"
+        PREFIX          ${GTEST_SOURCES_DIR}
+        UPDATE_COMMAND  ""
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        CMAKE_ARGS      -DBUILD_GMOCK=ON
+        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
+        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+    )
+    LIST(APPEND external_project_dependencies gtest)
+ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..29d17691db9f4575bae4372c61a0e1964e163fc9
--- /dev/null
+++ b/cmake/external/openblas.cmake
@@ -0,0 +1,66 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(cblas)
+
+IF(NOT ${CBLAS_FOUND})
+    INCLUDE(ExternalProject)
+
+    SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
+    SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+
+    IF(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE)
+    ELSE(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
+    ENDIF(WIN32)
+
+    IF(CMAKE_COMPILER_IS_GNUCC)
+        ENABLE_LANGUAGE(Fortran)
+        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+    ENDIF(CMAKE_COMPILER_IS_GNUCC)
+
+    IF(NOT CMAKE_Fortran_COMPILER)
+        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
+                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
+    ENDIF(NOT CMAKE_Fortran_COMPILER)
+
+    ExternalProject_Add(
+        openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG             v0.2.19
+        PREFIX              ${CBLAS_SOURCES_DIR}
+        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
+        BUILD_IN_SOURCE     1
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+    )
+
+    ExternalProject_Add_Step(
+        openblas lapacke_install
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
+        DEPENDEES install
+    )
+
+    LIST(APPEND external_project_dependencies openblas)
+ENDIF(NOT ${CBLAS_FOUND})
+
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..26da7e8e384bafdcbcd1a358c39cc6eb167b067e
--- /dev/null
+++ b/cmake/external/protobuf.cmake
@@ -0,0 +1,63 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+FIND_PACKAGE(Protobuf)
+
+if(NOT PROTOBUF_FOUND)
+    # FIND_PACKAGE did not locate protobuf, so build a pinned revision from source
+    # under THIRD_PARTY_PATH and publish where its artifacts will be installed.
+    set(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
+    set(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
+    set(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include"
+        CACHE PATH "protobuf include directory." FORCE)
+
+    # Expected install locations of the libraries and protoc (.lib/.exe on Windows).
+    if(NOT WIN32)
+        set(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
+        set(PROTOBUF_PROTOC_LIBRARY "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+        set(PROTOBUF_LIBRARY "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+        set(PROTOBUF_LITE_LIBRARY "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+    else()
+        set(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
+        set(PROTOBUF_PROTOC_LIBRARY "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
+        set(PROTOBUF_LIBRARY "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
+        set(PROTOBUF_LITE_LIBRARY "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
+    endif()
+
+    # Configured through the CMake project in protobuf's cmake/ subdirectory;
+    # ordered after the zlib external project and pointed at it via ZLIB_ROOT.
+    ExternalProject_Add(protobuf
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+        PREFIX          ${PROTOBUF_SOURCES_DIR}
+        DEPENDS         zlib
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND
+            ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+            -Dprotobuf_BUILD_TESTS=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+            -DCMAKE_BUILD_TYPE=Release
+            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+            -DCMAKE_INSTALL_LIBDIR=lib
+    )
+
+    # Record the target in the list of external projects.
+    list(APPEND external_project_dependencies protobuf)
+endif()
+
+INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..0accf1a8dd83560324716f0f4936be56dd7a9f1b
--- /dev/null
+++ b/cmake/external/python.cmake
@@ -0,0 +1,223 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+INCLUDE(python_module)
+
+FIND_PACKAGE(PythonInterp 2.7)
+FIND_PACKAGE(PythonLibs 2.7)
+
+SET(py_env "")
+
+IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    find_python_module(pip REQUIRED)
+    find_python_module(numpy REQUIRED)
+    find_python_module(wheel REQUIRED)
+    find_python_module(google.protobuf REQUIRED)
+    FIND_PACKAGE(NumPy REQUIRED)
+    IF(PY_GOOGLE.PROTOBUF_VERSION AND "${PY_GOOGLE.PROTOBUF_VERSION}" VERSION_LESS "3.0.0")
+        # Test the variable by name (no ${}): an empty value would otherwise be a syntax error and a dotted version would evaluate as false.
+        MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, please use pip to upgrade protobuf. pip install -U protobuf")
+    ENDIF()
+ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
+    ##################################### PYTHON ########################################
+    SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
+    SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
+    SET(_python_DIR ${PYTHON_INSTALL_DIR})
+
+    IF(UNIX)
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSEIF(WIN32)
+        SET(PYTHON_FOUND ON)
+        SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
+        SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
+        SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
+        SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
+    ELSE()
+        MESSAGE(FATAL_ERROR "Unknown system !")
+    ENDIF()
+
+    IF(APPLE)
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
+            -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
+            )
+    ENDIF()
+
+    SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
+
+    # Force Python build to "Release".
+    IF(CMAKE_CONFIGURATION_TYPES)
+        SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})
+        SET(CMAKE_CFG_INTDIR "Release")
+    ELSE()
+        LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS
+            -DCMAKE_BUILD_TYPE:STRING=Release
+            )
+    ENDIF()
+
+    ExternalProject_Add(python
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
+        PREFIX            ${PYTHON_SOURCES_DIR}
+        UPDATE_COMMAND    ""
+        CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
+        CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        CMAKE_CACHE_ARGS
+            -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
+            -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
+            -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
+            -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+            -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
+            -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
+            -DDOWNLOAD_SOURCES:BOOL=ON
+            -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
+            ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
+        DEPENDS zlib
+    )
+
+    SET(py_env
+        PATH=${PYTHON_INSTALL_DIR}/bin
+        PYTHONHOME=${PYTHON_INSTALL_DIR}
+        PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
+    ####################################################################################
+
+    ##################################### SETUPTOOLS ###################################
+    SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
+    ExternalProject_Add(setuptools
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SETUPTOOLS_SOURCES_DIR}
+        URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python zlib
+    )
+    #####################################################################################
+
+    ##################################### SIX ###########################################
+    SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
+    ExternalProject_Add(six
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX              ${SIX_SOURCES_DIR}
+        URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
+        BUILD_IN_SOURCE     1
+        PATCH_COMMAND       ""
+        UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+        INSTALL_COMMAND     ""
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS             python setuptools
+    )
+    #####################################################################################
+
+    ##################################### CYTHON ########################################
+    SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
+    ExternalProject_Add(cython
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        PREFIX                ${CYTHON_SOURCES_DIR}
+        URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
+        GIT_TAG               0.25.2
+        BUILD_IN_SOURCE       1
+        CONFIGURE_COMMAND     ""
+        PATCH_COMMAND         ""
+        UPDATE_COMMAND        ""
+        INSTALL_COMMAND       ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python
+    )
+    ####################################################################################
+
+    ##################################### NUMPY ########################################
+    SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
+    SET(NUMPY_TAG_VERSION "v1.11.3")
+    SET(NUMPY_VERSION "1.11.3")
+
+    SET(EGG_NAME "")
+    SET(PYTHON_NUMPY_INCLUDE_DIR "")
+    IF(WIN32)
+        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
+    ELSE(WIN32)
+        # Egg directory stem; the architecture suffix and ".egg" are appended in the loop below.
+        IF(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
+        ELSE(APPLE)
+            SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+        ENDIF(APPLE)
+
+        FOREACH(suffix x86_64 intel fat64 fat32 universal)
+            LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
+        ENDFOREACH()
+    ENDIF(WIN32)
+
+    ExternalProject_Add(numpy
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      https://github.com/numpy/numpy.git
+        GIT_TAG             ${NUMPY_TAG_VERSION}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        PREFIX              ${NUMPY_SOURCES_DIR}
+        BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools cython
+    )
+    ####################################################################################
+
+    ##################################### WHEEL ########################################
+    SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
+    ExternalProject_Add(wheel
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
+        PREFIX              ${WHEEL_SOURCES_DIR}
+        CONFIGURE_COMMAND   ""
+        UPDATE_COMMAND      ""
+        BUILD_COMMAND       ""
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS             python setuptools
+    )
+    ####################################################################################
+
+    ################################### PROTOBUF #######################################
+    SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
+    ExternalProject_Add(python-protobuf
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
+        URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
+        PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
+        BUILD_IN_SOURCE       1
+        PATCH_COMMAND         ""
+        CONFIGURE_COMMAND     ""
+        BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+        INSTALL_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        DEPENDS               python setuptools six
+    )
+    ####################################################################################
+
+    LIST(APPEND external_project_dependencies python setuptools six cython wheel python-protobuf numpy)
+
+ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..744c766ee7b067058b2cb4aa7f7b761cbb9778d4
--- /dev/null
+++ b/cmake/external/swig.cmake
@@ -0,0 +1,61 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FIND_PACKAGE(SWIG)
+
+IF(NOT SWIG_FOUND)
+    # build swig as an external project
+    INCLUDE(ExternalProject)
+
+    SET(SWIG_SOURCES_DIR ${THIRD_PARTY_PATH}/swig)
+    SET(SWIG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/swig)
+    SET(SWIG_TARGET_VERSION "3.0.2")
+    SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
+    SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
+
+    IF(WIN32)
+        # swig.exe available as pre-built binary on Windows:
+        ExternalProject_Add(swig
+            URL                 http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip
+            URL_MD5             ${SWIG_DOWNLOAD_WIN_MD5}
+            SOURCE_DIR          ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     ""
+            UPDATE_COMMAND      ""
+        )
+        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
+        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
+    ELSE(WIN32)
+        # swig uses bison find it by cmake and pass it down
+        FIND_PACKAGE(BISON)
+
+        # Built from a git release tag; SWIG_DIR must name the version that tag installs (share/swig/<version>).
+        SET(SWIG_GIT_VERSION "3.0.10")
+        ExternalProject_Add(swig
+            GIT_REPOSITORY      https://github.com/swig/swig.git
+            GIT_TAG             rel-${SWIG_GIT_VERSION}
+            PREFIX              ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   cd <SOURCE_DIR> && ./autogen.sh && ./configure
+                                --prefix=${SWIG_INSTALL_DIR} --without-pcre
+            BUILD_COMMAND       cd <SOURCE_DIR> && make
+            INSTALL_COMMAND     cd <SOURCE_DIR> && make install
+            UPDATE_COMMAND      ""
+        )
+        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/${SWIG_GIT_VERSION})
+        SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig)
+    ENDIF(WIN32)
+
+    LIST(APPEND external_project_dependencies swig)
+ENDIF(NOT SWIG_FOUND)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..172c318b35d611d0432b78f2a18eb58a7d272b90
--- /dev/null
+++ b/cmake/external/warpctc.cmake
@@ -0,0 +1,61 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+set(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
+set(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc)
+
+# Cache the install-tree locations and add the headers to the include path.
+set(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
+set(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
+include_directories(${WARPCTC_INCLUDE_DIR})
+
+# Shared library file: warpctc.dll on Windows, libwarpctc.dylib on macOS, libwarpctc.so elsewhere.
+if(NOT WIN32)
+    set(_warpctc_SHARED_SUFFIX so)
+    if(APPLE)
+        set(_warpctc_SHARED_SUFFIX dylib)
+    endif()
+    set(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}"
+        CACHE FILEPATH "Warp-ctc Library" FORCE)
+else()
+    set(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll"
+        CACHE FILEPATH "Warp-ctc Library" FORCE)
+endif()
+
+# OpenMP is requested unless the C++ compiler is Clang or AppleClang.
+set(USE_OMP ON)
+if(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
+    set(USE_OMP OFF)
+endif()
+
+# Build warp-ctc as a shared library with Torch support switched off.
+ExternalProject_Add(warpctc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX          ${WARPCTC_SOURCES_DIR}
+    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS
+        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+        -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+        -DWITH_GPU=${WITH_GPU}
+        -DWITH_OMP=${USE_OMP}
+        -DWITH_TORCH=OFF
+        -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
+        -DBUILD_SHARED=ON
+)
+
+list(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..47fa8817fb64fb8fd718e2892ad5bae7bbe956eb
--- /dev/null
+++ b/cmake/external/zlib.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+# Build zlib from source under THIRD_PARTY_PATH and publish its install locations.
+SET(ZLIB_SOURCES_DIR ${THIRD_PARTY_PATH}/zlib)
+SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib)
+# ZLIB_ROOT names a directory, so its cache entry type is PATH rather than FILEPATH.
+SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE PATH "zlib root directory." FORCE)
+SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
+IF(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
+ELSE(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+ENDIF(WIN32)
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+
+ExternalProject_Add(
+    zlib
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
+    GIT_TAG         "v1.2.8"
+    PREFIX          ${ZLIB_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+)
+
+LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0983d83b73a32d0615170155759d45001cc6ff54..b76852fc6c50e80633c8294fb2724b83f15293a7 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -3,12 +3,6 @@ include(CheckCXXCompilerFlag)
 include(CheckCCompilerFlag)
 include(CheckCXXSymbolExists)
 
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING 
-        "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
-        FORCE)
-endif()
-
 function(CheckCompilerCXX11Flag)
     if(CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
@@ -102,6 +96,7 @@ set(COMMON_FLAGS
     -Wno-unused-parameter
     -Wno-unused-function
     -Wno-error=literal-suffix
+    -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs)
 
 set(GPU_COMMON_FLAGS
@@ -111,6 +106,7 @@ set(GPU_COMMON_FLAGS
     -Wdelete-non-virtual-dtor
     -Wno-unused-parameter
     -Wno-unused-function
+    -Wno-error=sign-compare
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
     -Wno-error=unused-function  # Warnings in Numpy Header.
diff --git a/cmake/FindPythonModule.cmake b/cmake/python_module.cmake
similarity index 72%
rename from cmake/FindPythonModule.cmake
rename to cmake/python_module.cmake
index 2eb3441428e8290b665e092f6e4b40e146ea5a52..1412b7f7f20600acf95a4a899f5e6529c3b67a35 100644
--- a/cmake/FindPythonModule.cmake
+++ b/cmake/python_module.cmake
@@ -26,5 +26,18 @@ function(find_python_module module)
     if(NOT PY_${module_upper}_FOUND AND ${module}_FIND_REQUIRED)
         message(FATAL_ERROR "python module ${module} is not found")
     endif()
+
+    execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c"
+        "import sys, ${module}; sys.stdout.write(${module}.__version__)"
+        OUTPUT_VARIABLE _${module}_version
+        RESULT_VARIABLE _${module}_status
+        ERROR_QUIET
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(NOT _${module}_status)
+        set(PY_${module_upper}_VERSION ${_${module}_version} CACHE STRING
+            "Version of Python module ${module}")
+    endif(NOT _${module}_status)
+
     set(PY_${module_upper}_FOUND ${PY_${module_upper}_FOUND} PARENT_SCOPE)
+    set(PY_${module_upper}_VERSION ${PY_${module_upper}_VERSION} PARENT_SCOPE)
 endfunction(find_python_module)
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
index e9a4da79aa92a92aa7e5d21bb795ab9aaf60ab8b..9ff1a77cac74fb1bdfe470a78d225ed1767bb1b5 100644
--- a/cmake/rdma.cmake
+++ b/cmake/rdma.cmake
@@ -5,72 +5,76 @@
 # svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
 # we use static output in svn repositories to avoid implict bugs from not standard runtime env.
 
-set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
+if(WITH_RDMA)
+  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
 
-function(generate_rdma_links)
-  #redirect to current DIR to isolate the pollution from system runtime environment
-  #it can benifits unified control for different gcc environment. 
-  #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
-  #runtime libraries that will crash process while loading it. That redirect trick
-  #can fix it.
-  execute_process(
-    COMMAND mkdir -p librdma
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-  )
-endfunction(generate_rdma_links)
-
-
-#check and set headers
-find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
-find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-#check and set libs
-find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
-find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-if(
-    RDMA_INC_SXISOCK AND
-    RDMA_INC_XIO AND
-    RDMA_INC_EVENT AND
-    RDMA_INC_NUMA AND
-    RDMA_LIB_SXISOCK AND 
-    RDMA_LIB_XIO AND
-    RDMA_LIB_EVENT AND
-    RDMA_LIB_EVENT_CORE AND
-    RDMA_LIB_EVENT_EXTRA AND
-    RDMA_LIB_EVENT_PTHREADS AND
-    RDMA_LIB_NUMA
+  function(generate_rdma_links)
+    # Symlink the system libibverbs/librdmacm from /usr/lib64 into ./librdma (relative to
+    # CMAKE_CURRENT_BINARY_DIR) to isolate the build from the system runtime environment.
+    # E.g. by default gcc48 does not refer to /usr/lib64, which could contain low-version
+    # runtime libraries that crash the process while loading; this redirect avoids that.
+    # NOTE(review): COMMANDs in one execute_process run as a concurrent pipeline, so mkdir may not finish before ln starts - confirm.
+    execute_process(
+      COMMAND mkdir -p librdma
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     )
+  endfunction(generate_rdma_links)
 
-  set(RDMA_INC_DIR 
-    ${RDMA_INC_SXISOCK} 
-    ${RDMA_INC_XIO}
-    ${RDMA_INC_EVENT}
-    ${RDMA_INC_NUMA})
-  set(RDMA_LIBS  
-    ${RDMA_LIB_SXISOCK} 
-    ${RDMA_LIB_XIO} 
-    ${RDMA_LIB_EVENT} 
-    ${RDMA_LIB_EVENT_CORE} 
-    ${RDMA_LIB_EVENT_EXTRA} 
-    ${RDMA_LIB_EVENT_PTHREADS} 
-    ${RDMA_LIB_NUMA} 
-    )
-  set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
-  return()
-endif()
+  #check and set headers
+  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
+  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
+
+  #check and set libs
+  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
+  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
 
-#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
+  if(
+      RDMA_INC_SXISOCK AND
+      RDMA_INC_XIO AND
+      RDMA_INC_EVENT AND
+      RDMA_INC_NUMA AND
+      RDMA_LIB_SXISOCK AND 
+      RDMA_LIB_XIO AND
+      RDMA_LIB_EVENT AND
+      RDMA_LIB_EVENT_CORE AND
+      RDMA_LIB_EVENT_EXTRA AND
+      RDMA_LIB_EVENT_PTHREADS AND
+      RDMA_LIB_NUMA
+      )
 
-message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+    set(RDMA_INC_DIR 
+      ${RDMA_INC_SXISOCK} 
+      ${RDMA_INC_XIO}
+      ${RDMA_INC_EVENT}
+      ${RDMA_INC_NUMA})
+    set(RDMA_LIBS  
+      ${RDMA_LIB_SXISOCK} 
+      ${RDMA_LIB_XIO} 
+      ${RDMA_LIB_EVENT} 
+      ${RDMA_LIB_EVENT_CORE} 
+      ${RDMA_LIB_EVENT_EXTRA} 
+      ${RDMA_LIB_EVENT_PTHREADS} 
+      ${RDMA_LIB_NUMA} 
+      )
+    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
+    include_directories("${RDMA_INC_DIR}")
+  else()
+    # WITH_RDMA is ON but a required RDMA header or library was not found: stop configuring (the mode keyword is FATAL_ERROR with no comma).
+    message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+  endif()
+else(WITH_RDMA)
+  set(RDMA_LIBS "")
+  set(RDMA_LD_FLAGS "")
+  add_definitions(-DPADDLE_DISABLE_RDMA)
+endif(WITH_RDMA)
diff --git a/cmake/FindAVX.cmake b/cmake/simd.cmake
similarity index 100%
rename from cmake/FindAVX.cmake
rename to cmake/simd.cmake
diff --git a/cmake/swig.cmake b/cmake/swig.cmake
deleted file mode 100644
index 97e87aa947791e2c5a88e7e554dec43bcd661664..0000000000000000000000000000000000000000
--- a/cmake/swig.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-function(generate_python_api target_name)
-    add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
-        COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
-                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-        COMMENT "Generate Python API from swig")
-    add_custom_target(${target_name} ALL DEPENDS
-                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                ${PROJ_ROOT}/paddle/Paddle_wrap.h
-                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py)
-endfunction(generate_python_api)
diff --git a/cmake/system.cmake b/cmake/system.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..3e472da7e0bd9c433f92f3e8b52970cd2cc6dcba
--- /dev/null
+++ b/cmake/system.cmake
@@ -0,0 +1,78 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Detect the host OS and publish it as HOST_SYSTEM. CMAKE_SYSTEM_NAME only
+# gives a coarse-grained name, but a distribution name such as "centos"
+# is necessary in some places to customize the build for that system.
+#
+# For instance, the protobuf libs path is <install_dir>/lib64
+# on CentOS, but <install_dir>/lib on other systems.
+
+IF(WIN32)
+    SET(HOST_SYSTEM "win32")
+ELSEIF(APPLE)
+    # EXEC_PROGRAM is deprecated (an error under policy CMP0153): use EXECUTE_PROCESS.
+    EXECUTE_PROCESS(COMMAND sw_vers -productVersion OUTPUT_VARIABLE MACOSX_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+    # Keep only "<major>.<minor>"; the dot is escaped so it matches literally.
+    STRING(REGEX MATCH "[0-9]+\\.[0-9]+" VERSION "${MACOSX_VERSION}")
+    SET(MACOS_VERSION ${VERSION})
+    SET(HOST_SYSTEM "macosx")
+ELSE()
+    IF(EXISTS "/etc/issue")
+        FILE(READ "/etc/issue" LINUX_ISSUE)
+        IF(LINUX_ISSUE MATCHES "CentOS")
+            SET(HOST_SYSTEM "centos")
+        ELSEIF(LINUX_ISSUE MATCHES "Debian")
+            SET(HOST_SYSTEM "debian")
+        ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
+            SET(HOST_SYSTEM "ubuntu")
+        ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
+            SET(HOST_SYSTEM "redhat")
+        ELSEIF(LINUX_ISSUE MATCHES "Fedora")
+            SET(HOST_SYSTEM "fedora")
+        ENDIF()
+    ENDIF()
+
+    # /etc/redhat-release overrides the /etc/issue result when it names CentOS.
+    IF(EXISTS "/etc/redhat-release")
+        FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+        IF(LINUX_ISSUE MATCHES "CentOS")
+            SET(HOST_SYSTEM "centos")
+        ENDIF()
+    ENDIF()
+
+    # No known distribution matched: fall back to the generic system name.
+    IF(NOT HOST_SYSTEM)
+        SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
+    ENDIF()
+ENDIF()
+
+# query number of logical cores
+CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
+
+MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
+
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
+
+# external dependencies log output
+SET(EXTERNAL_PROJECT_LOG_ARGS
+    LOG_DOWNLOAD    0     # Wrap download in script to log output
+    LOG_UPDATE      1     # Wrap update in script to log output
+    LOG_CONFIGURE   1     # Wrap configure in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
+    LOG_TEST        1     # Wrap test in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
+)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 8a71b23c62d9fd79ffeb6b1b2281e0f2728db5a8..24ad5c815ca20d9b6b317b1be4d2dc93a9e06fba 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -24,7 +24,7 @@ function(target_circle_link_libraries TARGET_NAME)
                 list(APPEND libsInArgn ${arg})
             endif()
         endforeach()
-        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
             list(APPEND LIBS "-undefined dynamic_lookup")
         endif()
         list(REVERSE libsInArgn)
@@ -81,18 +81,6 @@ function(link_paddle_exe TARGET_NAME)
         set(METRIC_LIBS "")
     endif()
 
-    if(PADDLE_WITH_INTERNAL)
-        set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
-        target_circle_link_libraries(${TARGET_NAME}
-            ARCHIVE_START
-            paddle_internal_gserver
-            paddle_internal_owlqn
-            ARCHIVE_END
-            paddle_internal_parameter)
-    else()
-        set(INTERAL_LIBS "")
-    endif()
-
     target_circle_link_libraries(${TARGET_NAME}
         ARCHIVE_START
         paddle_gserver
@@ -107,29 +95,20 @@ function(link_paddle_exe TARGET_NAME)
         paddle_parameter
         paddle_proto
         paddle_cuda
-        paddle_test_main
         ${METRIC_LIBS}
-        ${PROTOBUF_LIBRARY}
-        ${LIBGLOG_LIBRARY}
-        ${GFLAGS_LIBRARIES}
+        ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
-        ${CBLAS_LIBS}
-        ${ZLIB_LIBRARIES}
-        ${INTERAL_LIBS}
-        ${CMAKE_DL_LIBS})
-
-    if(WITH_RDMA)
-        target_link_libraries(${TARGET_NAME}
-            ${RDMA_LD_FLAGS}
-            ${RDMA_LIBS})
-    endif()
+        ${CMAKE_DL_LIBS}
+        ${RDMA_LD_FLAGS}
+        ${RDMA_LIBS})
 
     if(WITH_PYTHON)
         target_link_libraries(${TARGET_NAME}
-            ${PYTHON_LIBRARIES})
+            ${PYTHON_LIBRARIES} util)
     endif()
 
     if(WITH_GPU)
+        target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
         if(NOT WITH_DSO OR WITH_METRIC)
             target_link_libraries(${TARGET_NAME}
                 ${CUDNN_LIBRARY}
@@ -143,10 +122,7 @@ function(link_paddle_exe TARGET_NAME)
         endif()
     endif()
 
-    if(NOT WITH_DSO)
-        target_link_libraries(${TARGET_NAME}
-            ${WARPCTC_LIBRARY})
-    endif()
+    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
 # link_paddle_test
@@ -155,8 +131,10 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
     link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
-        ${GTEST_LIBRARIES})
+    target_link_libraries(${TARGET_NAME}
+                          paddle_test_main
+                          paddle_test_util
+                          ${GTEST_LIBRARIES})
 endfunction()
 
 # add_unittest_without_exec
diff --git a/cmake/version.cmake b/cmake/version.cmake
index a0518e07e88a1ff468c301523f888c7d95e15185..ac1583a24c828629c46cb9cf4e965f8da2273732 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -21,4 +21,5 @@ while ("${PADDLE_VERSION}" STREQUAL "")
   endif()
 endwhile()
 
+add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
 message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..19d20540780becf504973a23b50445d4b65dc2ef
--- /dev/null
+++ b/demo/image_classification/api_v2_resnet.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+
+__all__ = ['resnet_cifar10']
+
+
+def conv_bn_layer(input,
+                  ch_out,
+                  filter_size,
+                  stride,
+                  padding,
+                  active_type=paddle.activation.Relu(),
+                  ch_in=None):
+    """Convolution (linear activation, no bias) followed by batch norm.
+
+    `active_type` is the activation applied by the batch-norm layer; `ch_in`
+    is passed to img_conv as `num_channels`."""
+    linear_conv = paddle.layer.img_conv(
+        input=input, num_channels=ch_in, num_filters=ch_out,
+        filter_size=filter_size, stride=stride, padding=padding,
+        act=paddle.activation.Linear(), bias_attr=False)
+    batch_normed = paddle.layer.batch_norm(input=linear_conv, act=active_type)
+    return batch_normed
+
+
+def shortcut(ipt, n_in, n_out, stride):
+    """Return `ipt` unchanged, or a filter-size-1 linear conv_bn_layer
+    projection of it when the input and output channel counts differ."""
+    if n_in == n_out:
+        return ipt
+    return conv_bn_layer(ipt, n_out, 1, stride, 0, paddle.activation.Linear())
+
+
+def basicblock(ipt, ch_out, stride):
+    # NOTE(review): n_in is hard-wired to 2 * ch_out -- confirm this is intended.
+    body = conv_bn_layer(ipt, ch_out, 3, stride, 1)
+    body = conv_bn_layer(body, ch_out, 3, 1, 1, paddle.activation.Linear())
+    skip = shortcut(ipt, ch_out * 2, ch_out, stride)
+    return paddle.layer.addto(input=[body, skip], act=paddle.activation.Relu())
+
+
+def layer_warp(block_func, ipt, features, count, stride):
+    """Chain blocks: one with `stride`, then `count - 1` more with stride 1."""
+    for step in [stride] + [1] * (count - 1):
+        ipt = block_func(ipt, features, step)
+    return ipt
+
+
+def resnet_cifar10(ipt, depth=32):
+    """Build the CIFAR-10 ResNet up to the final average pooling layer.
+
+    depth should be one of 20, 32, 44, 56, 110, 1202 (i.e. 6 * n + 2)."""
+    assert (depth - 2) % 6 == 0
+    n = (depth - 2) // 6  # floor division keeps n an int for range()
+    conv1 = conv_bn_layer(
+        ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
+    res1 = layer_warp(basicblock, conv1, 16, n, 1)
+    res2 = layer_warp(basicblock, res1, 32, n, 2)
+    res3 = layer_warp(basicblock, res2, 64, n, 2)
+    return paddle.layer.img_pool(
+        input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232
--- /dev/null
+++ b/demo/image_classification/api_v2_train.py
@@ -0,0 +1,92 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+
+import paddle.v2 as paddle
+
+from api_v2_vgg import vgg_bn_drop
+
+
+def main():
+    datadim = 3 * 32 * 32  # size of the dense "image" input vector
+    classdim = 10  # size of the softmax output and of the "label" input
+
+    # Initialize PaddlePaddle: CPU only, a single trainer
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    image = paddle.layer.data(
+        name="image", type=paddle.data_type.dense_vector(datadim))
+
+    # Add neural network config
+    # option 1. resnet (also needs: from api_v2_resnet import resnet_cifar10)
+    # net = resnet_cifar10(image, depth=32)
+    # option 2. vgg
+    net = vgg_bn_drop(image)
+
+    out = paddle.layer.fc(input=net,
+                          size=classdim,
+                          act=paddle.activation.Softmax())
+
+    lbl = paddle.layer.data(
+        name="label", type=paddle.data_type.integer_value(classdim))
+    cost = paddle.layer.classification_cost(input=out, label=lbl)
+
+    # Create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # Create the momentum optimizer (L2 regularization, 'discexp' schedule)
+    momentum_optimizer = paddle.optimizer.Momentum(
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
+        learning_rate=0.1 / 128.0,
+        learning_rate_decay_a=0.1,
+        learning_rate_decay_b=50000 * 100,
+        learning_rate_schedule='discexp',
+        batch_size=128)
+
+    # Event handler: log every 100th batch (a dot otherwise); test after each pass
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(  # `trainer` is bound below, before train() runs
+                reader=paddle.batch(
+                    paddle.dataset.cifar.test10(), batch_size=128),
+                feeding={'image': 0,
+                         'label': 1})
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    # Create the trainer, then train for 5 passes on shuffled cifar.train10 batches
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=momentum_optimizer)
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.cifar.train10(), buf_size=50000),
+            batch_size=128),
+        num_passes=5,
+        event_handler=event_handler,
+        feeding={'image': 0,
+                 'label': 1})
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e0e6b93adde30425f17aa9cd07542275f4fec37
--- /dev/null
+++ b/demo/image_classification/api_v2_vgg.py
@@ -0,0 +1,47 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2 as paddle
+
+__all__ = ['vgg_bn_drop']
+
+
+def vgg_bn_drop(input):
+    """VGG-style net: five conv groups, dropout, fc, batch_norm, fc."""
+    def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
+        # `groups` size-3 ReLU convs with batch norm, then size-2 max pooling.
+        return paddle.networks.img_conv_group(
+            input=ipt,
+            num_channels=num_channels,
+            conv_num_filter=[num_filter] * groups,
+            conv_filter_size=3,
+            conv_act=paddle.activation.Relu(),
+            conv_with_batchnorm=True,
+            conv_batchnorm_drop_rate=dropouts,
+            pool_size=2,
+            pool_stride=2,
+            pool_type=paddle.pooling.Max())
+
+    net = conv_block(input, 64, 2, [0.3, 0], 3)
+    net = conv_block(net, 128, 2, [0.4, 0])
+    net = conv_block(net, 256, 3, [0.4, 0.4, 0])
+    net = conv_block(net, 512, 3, [0.4, 0.4, 0])
+    net = conv_block(net, 512, 3, [0.4, 0.4, 0])
+
+    net = paddle.layer.dropout(input=net, dropout_rate=0.5)
+    net = paddle.layer.fc(input=net, size=512, act=paddle.activation.Linear())
+    net = paddle.layer.batch_norm(
+        input=net, act=paddle.activation.Relu(),
+        layer_attr=paddle.attr.Extra(drop_rate=0.5))
+    return paddle.layer.fc(input=net, size=512, act=paddle.activation.Linear())
diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py
index 9a86aafcb2fa4d4354d1dd9443c1b73ddcda980b..49c0ff600c40e0222093ff0a8a2f7e8e38ccba29 100755
--- a/demo/image_classification/prediction.py
+++ b/demo/image_classification/prediction.py
@@ -126,7 +126,7 @@ class ImageClassifier():
         # For oversampling, average predictions across crops.
         # If not, the shape of output[name]: (1, class_number),
         # the mean is also applicable.
-        return output[output_layer].mean(0)
+        return output[output_layer]['value'].mean(0)
 
     def predict(self, image=None, output_layer=None):
         assert isinstance(image, basestring)
diff --git a/demo/image_classification/train.sh b/demo/image_classification/train.sh
index 6fc11caf1c75192242482c2e85f8167eb9fba4ec..e45bd47ad5925c6674d628a70a7ad7c4d5d5c173 100755
--- a/demo/image_classification/train.sh
+++ b/demo/image_classification/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=300 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e ${PIPESTATUS[0]} -n "image_classification_train" >/dev/null 2>&1
 
 python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..84125c3b4b621a128fd488ff7fa374a75f620bf1
--- /dev/null
+++ b/demo/introduction/api_train_v2.py
@@ -0,0 +1,58 @@
+import paddle.v2 as paddle
+import paddle.v2.dataset.uci_housing as uci_housing
+
+
+def main():
+    # init: CPU only, a single trainer
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # network config: one linear fc layer (size 1) on the 13-dim input 'x'
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+    y_predict = paddle.layer.fc(input=x,
+                                param_attr=paddle.attr.Param(name='w'),
+                                size=1,
+                                act=paddle.activation.Linear(),
+                                bias_attr=paddle.attr.Param(name='b'))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+    cost = paddle.layer.regression_cost(input=y_predict, label=y)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer (Momentum with momentum=0)
+    optimizer = paddle.optimizer.Momentum(momentum=0)
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+    # event_handler: print cost every 100th batch, test cost every 10th pass
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+
+        if isinstance(event, paddle.event.EndPass):
+            if (event.pass_id + 1) % 10 == 0:  # pass_id is used as a 0-based count
+                result = trainer.test(
+                    reader=paddle.batch(
+                        uci_housing.test(), batch_size=2),
+                    feeding={'x': 0,
+                             'y': 1})
+                print "Test %d, %.2f" % (event.pass_id, result.cost)
+
+    # training: 30 passes over shuffled uci_housing.train() in batches of 2
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                uci_housing.train(), buf_size=500),
+            batch_size=2),
+        feeding={'x': 0,
+                 'y': 1},
+        event_handler=event_handler,
+        num_passes=30)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
index b7bbb90ddd287e3e312a490b53924ae76fb20d2c..2ce6446d7c943ffc9bea8da43d153539f6f9f15f 100755
--- a/demo/introduction/train.sh
+++ b/demo/introduction/train.sh
@@ -19,3 +19,4 @@ paddle train \
     --save_dir=./output \
     --num_passes=30 \
     2>&1 |tee 'train.log'
+paddle usage -l "train.log" -e ${PIPESTATUS[0]} -n "introduction" >/dev/null 2>&1
diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore
index 8bd9837523ccf98e6e72d5b82934b7b104816217..7e61d5e3a0cabd46d4185454d46610ac2ee2e63f 100644
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
@@ -5,3 +5,6 @@ plot.png
 train.log
 *pyc
 .ipynb_checkpoints
+params.pkl
+params.tar
+params.tar.gz
diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py
index f301da382ff8a5bc16d9c18b956f78566ed4894f..ea1caa7dd9653a2cc2860ace736fe3d25a3767e0 100644
--- a/demo/mnist/api_train.py
+++ b/demo/mnist/api_train.py
@@ -6,33 +6,15 @@ passed to C++ side of Paddle.
 
 The user api could be simpler and carefully designed.
 """
-import py_paddle.swig_paddle as api
-from py_paddle import DataProviderConverter
-import paddle.trainer.PyDataProvider2 as dp
-import numpy as np
 import random
-from mnist_util import read_from_mnist
-from paddle.trainer_config_helpers import *
-
-
-def optimizer_config():
-    settings(
-        learning_rate=1e-4,
-        learning_method=AdamOptimizer(),
-        batch_size=1000,
-        model_average=ModelAverage(average_window=0.5),
-        regularization=L2Regularization(rate=0.5))
 
+import numpy as np
+import paddle.v2 as paddle_v2
+import py_paddle.swig_paddle as api
+from paddle.trainer_config_helpers import *
+from py_paddle import DataProviderConverter
 
-def network_config():
-    imgs = data_layer(name='pixel', size=784)
-    hidden1 = fc_layer(input=imgs, size=200)
-    hidden2 = fc_layer(input=hidden1, size=200)
-    inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
-    cost = classification_cost(
-        input=inference, label=data_layer(
-            name='label', size=10))
-    outputs(cost)
+from mnist_util import read_from_mnist
 
 
 def init_parameter(network):
@@ -75,19 +57,35 @@ def input_order_converter(generator):
 def main():
     api.initPaddle("-use_gpu=false", "-trainer_count=4")  # use 4 cpu cores
 
-    # get enable_types for each optimizer.
-    # enable_types = [value, gradient, momentum, etc]
-    # For each optimizer(SGD, Adam), GradientMachine should enable different
-    # buffers.
-    opt_config_proto = parse_optimizer_config(optimizer_config)
-    opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
-    _temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
-    enable_types = _temp_optimizer_.getParameterTypes()
+    optimizer = paddle_v2.optimizer.Adam(
+        learning_rate=1e-4,
+        batch_size=1000,
+        model_average=ModelAverage(average_window=0.5),
+        regularization=L2Regularization(rate=0.5))
+
+    # Create Local Updater. Local means not run in cluster.
+    # For a cluster training, here we can change to createRemoteUpdater
+    # in future.
+    updater = optimizer.create_local_updater()
+    assert isinstance(updater, api.ParameterUpdater)
+
+    # define network
+    images = paddle_v2.layer.data(
+        name='pixel', type=paddle_v2.data_type.dense_vector(784))
+    label = paddle_v2.layer.data(
+        name='label', type=paddle_v2.data_type.integer_value(10))
+    hidden1 = paddle_v2.layer.fc(input=images, size=200)
+    hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
+    inference = paddle_v2.layer.fc(input=hidden2,
+                                   size=10,
+                                   act=paddle_v2.activation.Softmax())
+    cost = paddle_v2.layer.classification_cost(input=inference, label=label)
 
     # Create Simple Gradient Machine.
-    model_config = parse_network_config(network_config)
-    m = api.GradientMachine.createFromConfigProto(
-        model_config, api.CREATE_MODE_NORMAL, enable_types)
+    model_config = paddle_v2.layer.parse_network(cost)
+    m = api.GradientMachine.createFromConfigProto(model_config,
+                                                  api.CREATE_MODE_NORMAL,
+                                                  optimizer.enable_types())
 
     # This type check is not useful. Only enable type hint in IDE.
     # Such as PyCharm
@@ -96,19 +94,12 @@ def main():
     # Initialize Parameter by numpy.
     init_parameter(network=m)
 
-    # Create Local Updater. Local means not run in cluster.
-    # For a cluster training, here we can change to createRemoteUpdater
-    # in future.
-    updater = api.ParameterUpdater.createLocalUpdater(opt_config)
-    assert isinstance(updater, api.ParameterUpdater)
-
     # Initialize ParameterUpdater.
     updater.init(m)
 
     # DataProvider Converter is a utility convert Python Object to Paddle C++
     # Input. The input format is as same as Paddle's DataProvider.
-    converter = DataProviderConverter(
-        input_types=[dp.dense_vector(784), dp.integer_value(10)])
+    converter = DataProviderConverter(input_types=[images.type, label.type])
 
     train_file = './data/raw_data/train'
     test_file = './data/raw_data/t10k'
diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..68761be80f24f074c041109d6769e84fa7204367
--- /dev/null
+++ b/demo/mnist/api_train_v2.py
@@ -0,0 +1,141 @@
+import paddle.v2 as paddle
+import gzip
+
+
+def softmax_regression(img):
+    """Single fully-connected softmax layer with 10 outputs on top of `img`."""
+    softmax = paddle.activation.Softmax()
+    # size=10 matches the 10-valued integer label declared in main()
+    return paddle.layer.fc(input=img, size=10, act=softmax)
+
+
+def multilayer_perceptron(img):
+    # The first fully-connected layer: 128 units with ReLU activation
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected layer: 64 units with ReLU activation
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    # The third fully-connected layer; note that its size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def convolutional_neural_network(img):
+    # first conv+pool stage: 20 filters of size 5 on the 1-channel input
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # second conv+pool stage: 50 filters of size 5 on the 20 channels above
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # The first fully-connected layer: 128 units with Tanh activation
+    fc1 = paddle.layer.fc(input=conv_pool_2,
+                          size=128,
+                          act=paddle.activation.Tanh())
+    # The softmax layer; note that its size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=fc1,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncommenting the corresponding line.
+    predict = softmax_regression(images)
+    #predict = multilayer_perceptron(images)
+    #predict = convolutional_neural_network(images)
+
+    cost = paddle.layer.classification_cost(input=predict, label=label)
+
+    try:  # resume from a previous checkpoint when one can be opened
+        with gzip.open('params.tar.gz', 'r') as f:
+            parameters = paddle.parameters.Parameters.from_tar(f)
+    except IOError:
+        parameters = paddle.parameters.create(cost)
+
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+    lists = []  # one (pass_id, test cost, classification error) per pass
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                result = trainer.test(reader=paddle.batch(
+                    paddle.dataset.mnist.test(), batch_size=256))
+
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+                with gzip.open('params.tar.gz', 'w') as f:
+                    parameters.to_tar(f)
+
+        elif isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=128))
+            print "Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics)
+            lists.append((event.pass_id, result.cost,
+                          result.metrics['classification_error_evaluator']))
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=100)
+
+    # find the best pass: lowest test cost, earliest pass on ties
+    best = min(lists, key=lambda entry: float(entry[1]))
+    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
+
+    # output is a softmax layer. It returns probabilities.
+    # Shape should be (100, 10)
+    probs = paddle.infer(
+        output=predict,
+        parameters=parameters,
+        reader=paddle.batch(
+            paddle.reader.firstn(
+                paddle.reader.map_readers(lambda item: (item[0], ),
+                                          paddle.dataset.mnist.test()),
+                n=100),
+            batch_size=32))
+    print probs.shape
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/mnist/train.sh b/demo/mnist/train.sh
index da90cd749a02976633d0f0d6e4352d8a85c7cdef..ca2b1ad9eb960685b95b0f294a9b929e1a4acab1 100755
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
@@ -27,5 +27,6 @@ paddle train \
 --num_passes=100 \
 --save_dir=$output \
 2>&1 | tee $log
+paddle usage -l $log -e ${PIPESTATUS[0]} -n "mnist_train" >/dev/null 2>&1
 
 python -m paddle.utils.plotcurve -i $log > plot.png
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index f97ef2610734449c88fdfca6216b1cab57472b84..f61c65a935c76032a06613cfe0b50f1c90bc50d9 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -14,9 +14,19 @@
 # limitations under the License.
 set -e
 set -x
+BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
 
-# download the dictionary and pretrained model 
-for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
+          f88c8325ee6da6187f1080e8fe66c1cd
+          927cf70f27f860aff1a5703ebf7f1584
+          a52e43655cd25d279777ed509a1ae27b
+          b92c67fe9ff70fea53596080e351ac80)
+
+for ((i=0; i<${#ITEM_MD5[@]}; i++))
+do
+  FILENAME=${DOWNLOAD_ITEMS[${i}]}
+  REAL_MD5=$(wget "${BASE_URL}/${FILENAME}" -O - | tee "${FILENAME}" | md5sum | cut -d ' ' -f 1)
+  EXPECTED_MD5=${ITEM_MD5[${i}]}
+  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ] || { echo "md5 mismatch for ${FILENAME}" >&2; exit 1; }
 done
diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py
index 4631816c43ef48839df1863a0a86c3ab00924d3f..6074cc1d3a85e13e3e8d336d81e22104f9d8e7cf 100755
--- a/demo/model_zoo/resnet/classify.py
+++ b/demo/model_zoo/resnet/classify.py
@@ -156,7 +156,7 @@ class ImageClassifier():
             # For oversampling, average predictions across crops.
             # If not, the shape of output[name]: (1, class_number),
             # the mean is also applicable.
-            res[name] = output[name].mean(0)
+            res[name] = output[name]['value'].mean(0)
 
         return res
 
diff --git a/demo/quick_start/cluster/cluster_train.sh b/demo/quick_start/cluster/cluster_train.sh
index aac9b89b14b98ac8e2db7def19e5f06c01682493..a7b1f01064b29cf6abc4cd6b706ee466a6d6da36 100755
--- a/demo/quick_start/cluster/cluster_train.sh
+++ b/demo/quick_start/cluster/cluster_train.sh
@@ -25,6 +25,7 @@ log_file="$bin_dir/train.log"
 pushd "$home_dir"
 cfg=trainer_config.lr.py
 paddle train \
+  --start_pserver=false \
   --config=$cfg \
   --save_dir=${model_dir} \
   --trainer_count=4 \
diff --git a/demo/quick_start/predict.sh b/demo/quick_start/predict.sh
index f02e5038e92790c7f1ddcd84a09c6d9a02f84ac4..e47c2dd01fb5c919203964e298018e6dc2bd366e 100755
--- a/demo/quick_start/predict.sh
+++ b/demo/quick_start/predict.sh
@@ -26,5 +26,7 @@ paddle train \
     --init_model_path=$model \
     --config_args=is_predict=1 \
     --predict_output_dir=. \
+2>&1 | tee 'predict.log'
+paddle usage -l 'predict.log' -e $? -n "quick_start_predict_${cfg}" >/dev/null 2>&1
 
 mv rank-00000 result.txt
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index e3595fce7519297058e1eeb66487692267ddcfcc..01697fed48054be8ad98a01d4cbb5029e6a1ead0 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -31,3 +31,4 @@ paddle train \
   --show_parameter_stats_period=100 \
   --test_all_data_in_one_period=1 \
   2>&1 | tee 'train.log'
+paddle usage -l "train.log" -e $? -n "quick_start_${cfg}" >/dev/null 2>&1
diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py
new file mode 100755
index 0000000000000000000000000000000000000000..3afa7a1e9db5fefb1bbf5aaa174b8168afae4058
--- /dev/null
+++ b/demo/recommendation/evaluate.py
@@ -0,0 +1,49 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    """Return the (cost, pass id) strings of the lowest-cost test record.
+
+    Both items are the raw strings captured from the log text.
+
+    :param log_filename: path of the training log to scan.
+    :raises ValueError: if the log contains no matching test record.
+    """
+    with open(log_filename, 'r') as f:
+        text = f.read()
+    # re.S lets '.*?' span line breaks between the cost and the pass id.
+    pattern = re.compile(r'Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                         re.S)
+    results = pattern.findall(text)
+    if not results:
+        raise ValueError('no test cost record found in %s' % log_filename)
+    # min() returns the first of equal costs, like sorted(...)[0] would.
+    return min(results, key=lambda result: float(result[0]))
+
+
+if len(sys.argv) < 2:
+    sys.exit('usage: %s <train_log>' % sys.argv[0])
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+predict_error = math.sqrt(float(log[0])) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    log[1], log[0], predict_error)
+
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
diff --git a/demo/recommendation/run.sh b/demo/recommendation/run.sh
index e341d1cc7a3267bef9db916719b2e4b1981e31bc..22aef556082ba429e9ca7c6dd3ec72699b9dbcf4 100755
--- a/demo/recommendation/run.sh
+++ b/demo/recommendation/run.sh
@@ -22,3 +22,4 @@ paddle train \
     --log_period=100 \
     --dot_period=1 \
     --num_passes=50  2>&1 | tee 'log.txt'
+paddle usage -l log.txt -e $? -n "recommendation" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..036cad4b0a32357bb42580ef577a1eba558be8fe
--- /dev/null
+++ b/demo/semantic_role_labeling/api_train_v2.py
@@ -0,0 +1,190 @@
+import sys
+import math
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+
+
+def db_lstm():
+    word_dict, verb_dict, label_dict = conll05.get_dict()
+    word_dict_len = len(word_dict)
+    label_dict_len = len(label_dict)
+    pred_len = len(verb_dict)
+
+    mark_dict_len = 2
+    word_dim = 32
+    mark_dim = 5
+    hidden_dim = 512
+    depth = 8
+
+    #8 features
+    def d_type(size):
+        return paddle.data_type.integer_value_sequence(size)
+
+    word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
+    predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
+
+    ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
+    ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
+    ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
+    ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
+    ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
+    mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
+
+    target = paddle.layer.data(name='target', type=d_type(label_dict_len))
+
+    default_std = 1 / math.sqrt(hidden_dim) / 3.0
+
+    emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.)
+    std_0 = paddle.attr.Param(initial_std=0.)
+    std_default = paddle.attr.Param(initial_std=default_std)
+
+    predicate_embedding = paddle.layer.embedding(
+        size=word_dim,
+        input=predicate,
+        param_attr=paddle.attr.Param(
+            name='vemb', initial_std=default_std))
+    mark_embedding = paddle.layer.embedding(
+        size=mark_dim, input=mark, param_attr=std_0)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [
+        paddle.layer.embedding(
+            size=word_dim, input=x, param_attr=emb_para) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)
+    emb_layers.append(mark_embedding)
+
+    hidden_0 = paddle.layer.mixed(
+        size=hidden_dim,
+        bias_attr=std_default,
+        input=[
+            paddle.layer.full_matrix_projection(
+                input=emb, param_attr=std_default) for emb in emb_layers
+        ])
+
+    mix_hidden_lr = 1e-3
+    lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
+    hidden_para_attr = paddle.attr.Param(
+        initial_std=default_std, learning_rate=mix_hidden_lr)
+
+    lstm_0 = paddle.layer.lstmemory(
+        input=hidden_0,
+        act=paddle.activation.Relu(),
+        gate_act=paddle.activation.Sigmoid(),
+        state_act=paddle.activation.Sigmoid(),
+        bias_attr=std_0,
+        param_attr=lstm_para_attr)
+
+    #stack L-LSTM and R-LSTM with direct edges
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = paddle.layer.mixed(
+            size=hidden_dim,
+            bias_attr=std_default,
+            input=[
+                paddle.layer.full_matrix_projection(
+                    input=input_tmp[0], param_attr=hidden_para_attr),
+                paddle.layer.full_matrix_projection(
+                    input=input_tmp[1], param_attr=lstm_para_attr)
+            ])
+
+        lstm = paddle.layer.lstmemory(
+            input=mix_hidden,
+            act=paddle.activation.Relu(),
+            gate_act=paddle.activation.Sigmoid(),
+            state_act=paddle.activation.Sigmoid(),
+            reverse=((i % 2) == 1),
+            bias_attr=std_0,
+            param_attr=lstm_para_attr)
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = paddle.layer.mixed(
+        size=label_dict_len,
+        bias_attr=std_default,
+        input=[
+            paddle.layer.full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            paddle.layer.full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ], )
+
+    crf_cost = paddle.layer.crf(size=label_dict_len,
+                                input=feature_out,
+                                label=target,
+                                param_attr=paddle.attr.Param(
+                                    name='crfw',
+                                    initial_std=default_std,
+                                    learning_rate=mix_hidden_lr))
+
+    crf_dec = paddle.layer.crf_decoding(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=paddle.attr.Param(name='crfw'))
+
+    return crf_cost, crf_dec
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip header.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # define network topology
+    crf_cost, crf_dec = db_lstm()
+
+    # create parameters
+    parameters = paddle.parameters.create([crf_cost, crf_dec])
+
+    # create optimizer
+    optimizer = paddle.optimizer.Momentum(
+        momentum=0,
+        learning_rate=2e-2,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(
+            average_window=0.5, max_average_window=10000), )
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+
+    trainer = paddle.trainer.SGD(cost=crf_cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+    parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
+
+    trn_reader = paddle.batch(
+        paddle.reader.shuffle(
+            conll05.test(), buf_size=8192), batch_size=10)
+
+    feeding = {
+        'word_data': 0,
+        'ctx_n2_data': 1,
+        'ctx_n1_data': 2,
+        'ctx_0_data': 3,
+        'ctx_p1_data': 4,
+        'ctx_p2_data': 5,
+        'verb_data': 6,
+        'mark_data': 7,
+        'target': 8
+    }
+
+    trainer.train(
+        reader=trn_reader,
+        event_handler=event_handler,
+        num_passes=10000,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/semantic_role_labeling/test.sh b/demo/semantic_role_labeling/test.sh
index 11d9d6a19c1b17ad1b7540ee7a03017f85dd821e..095bbff2ea42627a13d8ebab436f5a05abc09743 100755
--- a/demo/semantic_role_labeling/test.sh
+++ b/demo/semantic_role_labeling/test.sh
@@ -38,3 +38,4 @@ paddle train \
   --config_args=is_test=1 \
   --test_all_data_in_one_period=1 \
 2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "semantic_role_labeling_test" >/dev/null 2>&1
diff --git a/demo/semantic_role_labeling/train.sh b/demo/semantic_role_labeling/train.sh
index 9354e72f46dc4dfc46138a04c330933d404c6cb8..eee14010d7b04a1b824f39090fa82fc532085e0d 100755
--- a/demo/semantic_role_labeling/train.sh
+++ b/demo/semantic_role_labeling/train.sh
@@ -27,3 +27,4 @@ paddle train \
   --load_missing_parameter_strategy=rand \
   --test_all_data_in_one_period=1 \
   2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "semantic_role_labeling_train" >/dev/null 2>&1
diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py
index 00f72cecacb454a0dd1184fa2098be4543007de7..4b7f5d0e504aef3884a04cbed8c16503a4079772 100755
--- a/demo/sentiment/dataprovider.py
+++ b/demo/sentiment/dataprovider.py
@@ -32,4 +32,6 @@ def process(settings, file_name):
             word_slot = [
                 settings.word_dict[w] for w in words if w in settings.word_dict
             ]
+            if not word_slot:
+                continue
             yield word_slot, label
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
index 8ec490f64691924013200a3d0038d39aa834b038..64c78e0d6b9297e7a321a4f070517593b0bfe332 100755
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -138,7 +138,11 @@ def main():
 
     batch = []
     for line in sys.stdin:
-        batch.append([predict.get_index(line)])
+        words = predict.get_index(line)
+        if words:
+            batch.append([words])
+        else:
+            print('All the words in [%s] are not in the dictionary.' % line)
         if len(batch) == batch_size:
             predict.batch_predict(batch)
             batch = []
diff --git a/demo/sentiment/test.sh b/demo/sentiment/test.sh
index 8af827c3388c8df88a872bd87d121a4f9631c3ff..85c4f3ccfc3ede23fcf701769b9701ecbf57c789 100755
--- a/demo/sentiment/test.sh
+++ b/demo/sentiment/test.sh
@@ -37,3 +37,4 @@ paddle train --config=$net_conf \
              --trainer_count=4 \
              --config_args=is_test=1 \
              2>&1 | tee 'test.log'
+paddle usage -l test.log -e $? -n "sentiment_test" >/dev/null 2>&1
diff --git a/demo/sentiment/train.sh b/demo/sentiment/train.sh
index 5ce8bf4b997d962b9b61593cec0954d76c4874bc..14620f733bf03444e5ba3b3b792dfbed6146ecde 100755
--- a/demo/sentiment/train.sh
+++ b/demo/sentiment/train.sh
@@ -27,3 +27,4 @@ paddle train --config=$config \
              --show_parameter_stats_period=100 \
              --test_all_data_in_one_period=1 \
              2>&1 | tee 'train.log'
+paddle usage -l train.log -e $? -n "sentiment_train" >/dev/null 2>&1
diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd7243cbe69977dcabc9ecf1d060e62f313b8cfd
--- /dev/null
+++ b/demo/sentiment/train_v2.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import paddle.trainer_config_helpers.attrs as attrs
+from paddle.trainer_config_helpers.poolings import MaxPooling
+import paddle.v2 as paddle
+
+
+def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128):
+    """Return the cost of a two-branch text CNN over class_dim classes."""
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+    conv_3 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=3, hidden_size=hid_dim)
+    conv_4 = paddle.networks.sequence_conv_pool(
+        input=emb, context_len=4, hidden_size=hid_dim)
+    output = paddle.layer.fc(input=[conv_3, conv_4],
+                             size=class_dim,
+                             act=paddle.activation.Softmax())
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
+    return paddle.layer.classification_cost(input=output, label=lbl)
+
+
+def stacked_lstm_net(input_dim,
+                     class_dim=2,
+                     emb_dim=128,
+                     hid_dim=512,
+                     stacked_num=3):
+    """
+    A Wrapper for sentiment classification task.
+    This network stacks LSTM layers whose direction alternates from one
+    layer to the next (three layers by default). The configuration follows
+    the paper at the following url, but uses fewer layers.
+        http://www.aclweb.org/anthology/P15-1109
+
+    input_dim: here is word dictionary dimension.
+    class_dim: number of categories.
+    emb_dim: dimension of word embedding.
+    hid_dim: dimension of hidden layer.
+    stacked_num: number of stacked lstm-hidden layer.
+                 It must be odd (enforced by the assert below).
+    Returns the classification cost layer.
+    """
+    assert stacked_num % 2 == 1
+
+    layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5)
+    fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3)
+    lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.)
+    para_attr = [fc_para_attr, lstm_para_attr]
+    bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.)
+    relu = paddle.activation.Relu()
+    linear = paddle.activation.Linear()
+
+    data = paddle.layer.data("word",
+                             paddle.data_type.integer_value_sequence(input_dim))
+    emb = paddle.layer.embedding(input=data, size=emb_dim)
+
+    fc1 = paddle.layer.fc(input=emb,
+                          size=hid_dim,
+                          act=linear,
+                          bias_attr=bias_attr)
+    lstm1 = paddle.layer.lstmemory(
+        input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
+
+    inputs = [fc1, lstm1]
+    for i in range(2, stacked_num + 1):
+        fc = paddle.layer.fc(input=inputs,
+                             size=hid_dim,
+                             act=linear,
+                             param_attr=para_attr,
+                             bias_attr=bias_attr)
+        lstm = paddle.layer.lstmemory(
+            input=fc,
+            reverse=(i % 2) == 0,
+            act=relu,
+            bias_attr=bias_attr,
+            layer_attr=layer_attr)
+        inputs = [fc, lstm]
+
+    fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling())
+    lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling())
+    output = paddle.layer.fc(input=[fc_last, lstm_last],
+                             size=class_dim,
+                             act=paddle.activation.Softmax(),
+                             bias_attr=bias_attr,
+                             param_attr=para_attr)
+
+    lbl = paddle.layer.data("label", paddle.data_type.integer_value(class_dim))
+    cost = paddle.layer.classification_cost(input=output, label=lbl)
+    return cost
+
+
+if __name__ == '__main__':
+    # init
+    paddle.init(use_gpu=False, trainer_count=4)
+
+    # network config
+    print 'load dictionary...'
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    # Please choose the way to build the network
+    # by uncommenting the corresponding line.
+    cost = convolution_net(dict_dim, class_dim=class_dim)
+    # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
+
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+
+    # create optimizer
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=2e-3,
+        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
+        model_average=paddle.optimizer.ModelAverage(average_window=0.5))
+
+    # End batch and end pass event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "\nPass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+            else:
+                sys.stdout.write('.')
+                sys.stdout.flush()
+        if isinstance(event, paddle.event.EndPass):
+            result = trainer.test(
+                reader=paddle.batch(
+                    lambda: paddle.dataset.imdb.test(word_dict),
+                    batch_size=128),
+                feeding={'word': 0,
+                         'label': 1})
+            print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
+
+    # create trainer
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=adam_optimizer)
+
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
+            batch_size=100),
+        event_handler=event_handler,
+        feeding={'word': 0,
+                 'label': 1},
+        num_passes=10)
diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..6efd254e7a48703a69c9f09dd35d41ba7ac5689a
--- /dev/null
+++ b/demo/seqToseq/api_train_v2.py
@@ -0,0 +1,140 @@
+import paddle.v2 as paddle
+
+
+def seqToseq_net(source_dict_dim, target_dict_dim):
+    ### Network Architecture
+    word_vector_dim = 512  # dimension of word vector
+    decoder_size = 512  # dimension of hidden unit in GRU Decoder network
+    encoder_size = 512  # dimension of hidden unit in GRU Encoder network
+
+    #### Encoder
+    src_word_id = paddle.layer.data(
+        name='source_language_word',
+        type=paddle.data_type.integer_value_sequence(source_dict_dim))
+    src_embedding = paddle.layer.embedding(
+        input=src_word_id,
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
+    src_forward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size)
+    src_backward = paddle.networks.simple_gru(
+        input=src_embedding, size=encoder_size, reverse=True)
+    encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
+
+    #### Decoder
+    with paddle.layer.mixed(size=decoder_size) as encoded_proj:
+        encoded_proj += paddle.layer.full_matrix_projection(
+            input=encoded_vector)
+
+    backward_first = paddle.layer.first_seq(input=src_backward)
+
+    with paddle.layer.mixed(
+            size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
+        decoder_boot += paddle.layer.full_matrix_projection(
+            input=backward_first)
+
+    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
+
+        decoder_mem = paddle.layer.memory(
+            name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
+
+        context = paddle.networks.simple_attention(
+            encoded_sequence=enc_vec,
+            encoded_proj=enc_proj,
+            decoder_state=decoder_mem)
+
+        with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
+            decoder_inputs += paddle.layer.full_matrix_projection(input=context)
+            decoder_inputs += paddle.layer.full_matrix_projection(
+                input=current_word)
+
+        gru_step = paddle.layer.gru_step(
+            name='gru_decoder',
+            input=decoder_inputs,
+            output_mem=decoder_mem,
+            size=decoder_size)
+
+        with paddle.layer.mixed(
+                size=target_dict_dim,
+                bias_attr=True,
+                act=paddle.activation.Softmax()) as out:
+            out += paddle.layer.full_matrix_projection(input=gru_step)
+        return out
+
+    decoder_group_name = "decoder_group"
+    group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
+    group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
+    group_inputs = [group_input1, group_input2]
+
+    trg_embedding = paddle.layer.embedding(
+        input=paddle.layer.data(
+            name='target_language_word',
+            type=paddle.data_type.integer_value_sequence(target_dict_dim)),
+        size=word_vector_dim,
+        param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
+    group_inputs.append(trg_embedding)
+
+    # For a decoder equipped with an attention mechanism, in training,
+    # the target embedding (the ground truth) is the data input,
+    # while the encoded source sequence is accessed as an unbounded memory.
+    # Here, the StaticInput defines a read-only memory
+    # for the recurrent_group.
+    decoder = paddle.layer.recurrent_group(
+        name=decoder_group_name,
+        step=gru_decoder_with_attention,
+        input=group_inputs)
+
+    lbl = paddle.layer.data(
+        name='target_language_next_word',
+        type=paddle.data_type.integer_value_sequence(target_dict_dim))
+    cost = paddle.layer.classification_cost(input=decoder, label=lbl)
+
+    return cost
+
+
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+
+    # source and target dict dim.
+    dict_size = 30000
+    source_dict_dim = target_dict_dim = dict_size
+
+    # define network topology
+    cost = seqToseq_net(source_dict_dim, target_dict_dim)
+    parameters = paddle.parameters.create(cost)
+
+    # define optimize method and trainer
+    optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer)
+
+    # define data reader
+    feeding = {
+        'source_language_word': 0,
+        'target_language_word': 1,
+        'target_language_next_word': 2
+    }
+
+    wmt14_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192),
+        batch_size=5)
+
+    # define event_handler callback
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 10 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+
+    # start to train
+    trainer.train(
+        reader=wmt14_reader,
+        event_handler=event_handler,
+        num_passes=10000,
+        feeding=feeding)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/seqToseq/paraphrase/train.sh b/demo/seqToseq/paraphrase/train.sh
index 33a42f6eff2b0414c466d5f78c89989a6a517eb9..9bb6dbdb1d4c5e35bfb31855e0331f0250a69a20 100755
--- a/demo/seqToseq/paraphrase/train.sh
+++ b/demo/seqToseq/paraphrase/train.sh
@@ -27,3 +27,4 @@ paddle train \
     --log_period=10 \
     --dot_period=5 \
     2>&1 | tee 'paraphrase/train.log'
+paddle usage -l 'paraphrase/train.log' -e $? -n "seqToseq_paraphrase_train" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/gen.sh b/demo/seqToseq/translation/gen.sh
index a700ae213473dfe7c5b77156de15775b8fe9a9f0..64b78f5e9654e7b206740f92e224e0164108c9f1 100755
--- a/demo/seqToseq/translation/gen.sh
+++ b/demo/seqToseq/translation/gen.sh
@@ -24,3 +24,4 @@ paddle train \
     --test_pass=12 \
     --trainer_count=1 \
     2>&1 | tee 'translation/gen.log'
+paddle usage -l 'translation/gen.log' -e $? -n "seqToseq_translation_gen" >/dev/null 2>&1
diff --git a/demo/seqToseq/translation/train.sh b/demo/seqToseq/translation/train.sh
index bdece693e5c407c89bc172c461bac7f9b20560d3..b0ec9854b118cbb9ed39d6bed0cdd845403926a4 100755
--- a/demo/seqToseq/translation/train.sh
+++ b/demo/seqToseq/translation/train.sh
@@ -25,3 +25,4 @@ paddle train \
 --log_period=10 \
 --dot_period=5 \
 2>&1 | tee 'translation/train.log'
+paddle usage -l 'translation/train.log' -e $? -n "seqToseq_translation_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/train.sh b/demo/sequence_tagging/train.sh
index 9a706b98d8686101ba21b513644bdd791062ec26..37e196c84200dc26ccb523076a81dbc393b1280f 100755
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
@@ -7,4 +7,6 @@ paddle train \
        --dot_period=10 \
        --log_period=1000 \
        --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train.log'
+paddle usage -l 'train.log' -e $? -n "sequence_tagging_train" >/dev/null 2>&1
diff --git a/demo/sequence_tagging/train_linear.sh b/demo/sequence_tagging/train_linear.sh
index 597b5afea9c63a8e209b69b6a40e74556e27ac31..ad6e2d8ee7f813c69f9dd250c6f7bbb4403a0ed5 100755
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
@@ -7,3 +7,5 @@ paddle train \
        --log_period=10000 \
        --test_period=0 \
-       --num_passes=10
+       --num_passes=10 \
+2>&1 | tee 'train_linear.log'
+paddle usage -l 'train_linear.log' -e $? -n "sequence_tagging_train_linear" >/dev/null 2>&1
diff --git a/demo/traffic_prediction/README b/demo/traffic_prediction/README
new file mode 100644
index 0000000000000000000000000000000000000000..4c95188583513c332b7d7cb0a32d59336208e1aa
--- /dev/null
+++ b/demo/traffic_prediction/README
@@ -0,0 +1,7 @@
+run by:
+cd ./data
+sh get_data.sh
+cd ..
+sh train.sh
+sh predict.sh
+
diff --git a/demo/traffic_prediction/data/get_data.sh b/demo/traffic_prediction/data/get_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..f2fa548d4709c0361334f117bfb49e18d83c32f4
--- /dev/null
+++ b/demo/traffic_prediction/data/get_data.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -x
+
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+cd "$DIR"  # quoted so a script path containing spaces still works
+
+#download the dataset
+echo "Downloading traffic data..."
+wget http://paddlepaddle.cdn.bcebos.com/demo/traffic/traffic_data.tar.gz
+
+#extract package
+echo "Unzipping..."
+tar -zxvf traffic_data.tar.gz
+
+echo "data/speeds.csv" > train.list
+echo "data/speeds.csv" > test.list
+echo "data/speeds.csv" > pred.list
+
+echo "Done."
diff --git a/demo/traffic_prediction/dataprovider.py b/demo/traffic_prediction/dataprovider.py
new file mode 100644
index 0000000000000000000000000000000000000000..c7883b6950c369ee67c39b80ce1cefbbf9350459
--- /dev/null
+++ b/demo/traffic_prediction/dataprovider.py
@@ -0,0 +1,82 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer.PyDataProvider2 import *
+import sys
+import numpy as np
+TERM_NUM = 24
+FORECASTING_NUM = 24
+LABEL_VALUE_NUM = 4
+
+
+def initHook(settings, file_list, **kwargs):
+    """
+    Init hook is invoked before data is processed. It sets settings.pool_size and settings.input_types.
+
+    :param settings: global object. It will be passed to the process routine.
+    :type settings: object
+    :param file_list: the meta file object, which is passed from trainer_config.py, but unused in this function.
+    :param kwargs: unused other arguments.
+    """
+    del kwargs  #unused 
+
+    settings.pool_size = sys.maxint
+    #Use a time series of the past as feature.
+    #Dense_vector's expression form is [float,float,...,float]
+    settings.input_types = [dense_vector(TERM_NUM)]
+    #There are next FORECASTING_NUM fragments you need predict.
+    #Every predicted condition at time point has four states.
+    for i in range(FORECASTING_NUM):
+        settings.input_types.append(integer_value(LABEL_VALUE_NUM))
+
+
+@provider(
+    init_hook=initHook, cache=CacheType.CACHE_PASS_IN_MEM, should_shuffle=True)
+def process(settings, file_name):
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for row_num, line in enumerate(f):
+            speeds = map(int, line.rstrip('\r\n').split(",")[1:])
+            # Get the max index.
+            end_time = len(speeds)
+            # Scanning and generating samples
+            for i in range(TERM_NUM, end_time - FORECASTING_NUM):
+                # For dense slot
+                pre_spd = map(float, speeds[i - TERM_NUM:i])
+
+                # Integer value need predicting, values start from 0, so every one minus 1.
+                fol_spd = [j - 1 for j in speeds[i:i + FORECASTING_NUM]]
+
+                # Predicting label is missing, abandon the sample.
+                if -1 in fol_spd:
+                    continue
+                yield [pre_spd] + fol_spd
+
+
+def predict_initHook(settings, file_list, **kwargs):
+    settings.pool_size = sys.maxint
+    settings.input_types = [dense_vector(TERM_NUM)]
+
+
+@provider(init_hook=predict_initHook, should_shuffle=False)
+def process_predict(settings, file_name):
+    """Yield, for each data row, its last TERM_NUM speeds as floats."""
+    with open(file_name) as f:
+        #abandon fields name
+        f.next()
+        for line in f:
+            # Skip the leading id column, as process() does.
+            speeds = map(int, line.rstrip('\r\n').split(",")[1:])
+            yield map(float, speeds[-TERM_NUM:])
diff --git a/demo/traffic_prediction/gen_result.py b/demo/traffic_prediction/gen_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..3da70b30315f863fd3582583e9a29540a09c1e7f
--- /dev/null
+++ b/demo/traffic_prediction/gen_result.py
@@ -0,0 +1,61 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+res = []  # res[k] holds the shifted predictions parsed from line k of ./rank-00000
+with open('./rank-00000') as f:
+    for line in f:
+        pred = map(int, line.strip('\r\n;').split(";"))  # ';'-separated integers
+        # Raw predictions range from 0 to 3; add 1 to each before writing them out.
+        res.append([i + 1 for i in pred])
+
+file_name = open('./data/pred.list').read().strip('\r\n')  # whole content is used as one file path
+
+FORECASTING_NUM = 24  # NOTE(review): not referenced anywhere else in this script
+header = [  # CSV header: 'id' followed by 24 timestamp column names
+    'id',
+    '201604200805',
+    '201604200810',
+    '201604200815',
+    '201604200820',
+    '201604200825',
+    '201604200830',
+    '201604200835',
+    '201604200840',
+    '201604200845',
+    '201604200850',
+    '201604200855',
+    '201604200900',
+    '201604200905',
+    '201604200910',
+    '201604200915',
+    '201604200920',
+    '201604200925',
+    '201604200930',
+    '201604200935',
+    '201604200940',
+    '201604200945',
+    '201604200950',
+    '201604200955',
+    '201604201000',
+]
+###################
+## To CSV format ##
+###################
+with open(file_name) as f:
+    f.next()  # skip the first line of the input file
+    print ','.join(header)
+    for row_num, line in enumerate(f):  # input row k is paired with res[k]
+        fields = line.rstrip('\r\n').split(',')
+        linkid = fields[0]  # first column is written out as the row id
+        print linkid + ',' + ','.join(map(str, res[row_num]))
diff --git a/demo/traffic_prediction/predict.sh b/demo/traffic_prediction/predict.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2dbd5e8805dd97d35c7d58917f8ec6b5033bda03
--- /dev/null
+++ b/demo/traffic_prediction/predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e  # exit as soon as any command fails
+
+cfg=trainer_config.py
+# Model path passed to --init_model_path; edit it to choose a different pass.
+model="output/pass-00000"
+paddle train \
+    --config=$cfg \
+    --use_gpu=false \
+    --job=test \
+    --init_model_path=$model \
+    --config_args=is_predict=1 \
+    --predict_output_dir=. 
+
+python gen_result.py > result.csv  # gen_result.py reads ./rank-00000 and prints CSV
+
+rm -rf rank-00000  # remove the raw prediction output once it has been converted
diff --git a/demo/traffic_prediction/train.sh b/demo/traffic_prediction/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..48dfc5604f80042598c5c779bd450a5808fdfb64
--- /dev/null
+++ b/demo/traffic_prediction/train.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e  # exit as soon as any command fails
+
+cfg=trainer_config.py  # config file handed to paddle train via --config
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=1000 \
+  --dot_period=10 \
+  --num_passes=10 \
+  --use_gpu=false \
+  --show_parameter_stats_period=3000 \
+  2>&1 | tee 'train.log'  # merge stderr into stdout and also copy it to train.log
diff --git a/demo/traffic_prediction/trainer_config.py b/demo/traffic_prediction/trainer_config.py
new file mode 100755
index 0000000000000000000000000000000000000000..52d678624aff7ca2264c3c20e320004217d14397
--- /dev/null
+++ b/demo/traffic_prediction/trainer_config.py
@@ -0,0 +1,52 @@
+# Copyright (c) 2016 PaddlePaddle Authors, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+
+################################### DATA Configuration #############################################
+is_predict = get_config_arg('is_predict', bool, False)  # predict.sh passes --config_args=is_predict=1
+trn = './data/train.list' if not is_predict else None
+tst = './data/test.list' if not is_predict else './data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(
+    train_list=trn, test_list=tst, module="dataprovider", obj=process)
+################################### Parameter Configuration ########################################
+TERM_NUM = 24  # size of the 'link_encode' input layer
+FORECASTING_NUM = 24  # number of output heads built by the loop below
+emb_size = 16
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=1e-3,
+    learning_method=RMSPropOptimizer())
+################################### Algorithm Configuration ########################################
+
+output_label = []
+
+link_encode = data_layer(name='link_encode', size=TERM_NUM)
+for i in xrange(FORECASTING_NUM):
+    # Every head uses the same parameter name '_link_vec.w' for its first fc layer.
+    link_param = ParamAttr(
+        name='_link_vec.w', initial_max=1.0, initial_min=-1.0)
+    link_vec = fc_layer(input=link_encode, size=emb_size, param_attr=link_param)
+    score = fc_layer(input=link_vec, size=4, act=SoftmaxActivation())
+    if is_predict:
+        maxid = maxid_layer(score)
+        output_label.append(maxid)
+    else:
+        # Multi-task training: one size-4 label layer and one cost per head, named by minute offset (5, 10, ...).
+        label = data_layer(name='label_%dmin' % ((i + 1) * 5), size=4)
+        cls = classification_cost(
+            input=score, name="cost_%dmin" % ((i + 1) * 5), label=label)
+        output_label.append(cls)
+outputs(output_label)
diff --git a/demo/word2vec/train_v2.py b/demo/word2vec/train_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d952b446f9db432062fc3305a6b65b0ad66dd47
--- /dev/null
+++ b/demo/word2vec/train_v2.py
@@ -0,0 +1,80 @@
+import math
+
+import paddle.v2 as paddle
+
+dictsize = 1953  # NOTE(review): unused in this file; main() takes dict_size from the built dictionary
+embsize = 32  # output size of each word embedding (see wordemb)
+hiddensize = 256  # size of the hidden fc layer in main()
+N = 5  # passed to the imikolov train/test readers in main()
+
+
+def wordemb(inlayer):  # returns a table projection of `inlayer` with output size `embsize`
+    wordemb = paddle.layer.table_projection(
+        input=inlayer,
+        size=embsize,
+        param_attr=paddle.attr.Param(
+            name="_proj",  # same parameter name on every call
+            initial_std=0.001,
+            learning_rate=1,
+            l2_rate=0, ))
+    return wordemb
+
+
+def main():  # builds the network from four context words to the fifth, then trains it on imikolov
+    paddle.init(use_gpu=False, trainer_count=1)
+    word_dict = paddle.dataset.imikolov.build_dict()
+    dict_size = len(word_dict)
+    firstword = paddle.layer.data(
+        name="firstw", type=paddle.data_type.integer_value(dict_size))
+    secondword = paddle.layer.data(
+        name="secondw", type=paddle.data_type.integer_value(dict_size))
+    thirdword = paddle.layer.data(
+        name="thirdw", type=paddle.data_type.integer_value(dict_size))
+    fourthword = paddle.layer.data(
+        name="fourthw", type=paddle.data_type.integer_value(dict_size))
+    nextword = paddle.layer.data(  # used below as the label of the classification cost
+        name="fifthw", type=paddle.data_type.integer_value(dict_size))
+
+    Efirst = wordemb(firstword)
+    Esecond = wordemb(secondword)
+    Ethird = wordemb(thirdword)
+    Efourth = wordemb(fourthword)
+
+    contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
+    hidden1 = paddle.layer.fc(input=contextemb,
+                              size=hiddensize,
+                              act=paddle.activation.Sigmoid(),
+                              layer_attr=paddle.attr.Extra(drop_rate=0.5),
+                              bias_attr=paddle.attr.Param(learning_rate=2),
+                              param_attr=paddle.attr.Param(
+                                  initial_std=1. / math.sqrt(embsize * 8),
+                                  learning_rate=1))
+    predictword = paddle.layer.fc(input=hidden1,
+                                  size=dict_size,
+                                  bias_attr=paddle.attr.Param(learning_rate=2),
+                                  act=paddle.activation.Softmax())
+
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:  # test and report only when batch_id is a multiple of 100
+                result = trainer.test(  # `trainer` is assigned further down in main(), before train() is called
+                    paddle.batch(
+                        paddle.dataset.imikolov.test(word_dict, N), 32))
+                print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics,
+                    result.metrics)
+
+    cost = paddle.layer.classification_cost(input=predictword, label=nextword)
+    parameters = paddle.parameters.create(cost)
+    adam_optimizer = paddle.optimizer.Adam(
+        learning_rate=3e-3,
+        regularization=paddle.optimizer.L2Regularization(8e-4))
+    trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
+    trainer.train(
+        paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
+        num_passes=30,
+        event_handler=event_handler)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..fca981221e490686e468ae8d385d844d49767883 100644
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
@@ -1,37 +1,26 @@
-API中文手册
-============
+API
+===
 
-DataProvider API
-----------------
+模型配置 API
+------------
 
 ..  toctree::
     :maxdepth: 1
 
-    data_provider/dataprovider_cn.rst
-    data_provider/pydataprovider2_cn.rst
+    v2/model_configs.rst
 
-..  _api_trainer_config:
-
-Model Config API
-----------------
+数据 API
+--------
 
 ..  toctree::
     :maxdepth: 1
 
-    trainer_config_helpers/optimizers.rst
-    trainer_config_helpers/data_sources.rst
-    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
-    trainer_config_helpers/poolings.rst
-    trainer_config_helpers/networks.rst
-    trainer_config_helpers/evaluators.rst
-    trainer_config_helpers/attrs.rst
-
+    v2/data.rst
 
-Applications API
-----------------
+训练 API
+--------
 
-..  toctree::
-    :maxdepth: 1
+..	toctree::
+	:maxdepth: 1
 
-    predict/swig_py_paddle_cn.rst
+	v2/run_logic.rst
\ No newline at end of file
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 10c297a71d6988c002de868e804ed9ee2345fbd7..f0ad0fb2aee7345db1dd5f175a342598366f5e3c 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -1,37 +1,26 @@
 API
 ===
 
-DataProvider API
+Model Config API
 ----------------
 
 ..  toctree::
     :maxdepth: 1
 
-    data_provider/dataprovider_en.rst
-    data_provider/pydataprovider2_en.rst
-
-..  _api_trainer_config:
+    v2/model_configs.rst
 
-Model Config API
-----------------
+Data API
+--------
 
 ..  toctree::
     :maxdepth: 1
 
-    trainer_config_helpers/optimizers.rst
-    trainer_config_helpers/data_sources.rst
-    trainer_config_helpers/layers.rst
-    trainer_config_helpers/activations.rst 
-    trainer_config_helpers/poolings.rst
-    trainer_config_helpers/networks.rst
-    trainer_config_helpers/evaluators.rst
-    trainer_config_helpers/attrs.rst
+    v2/data.rst
 
+Train API
+---------
 
-Applications API
-----------------
-
-..  toctree::
-    :maxdepth: 1
+..	toctree::
+	:maxdepth: 1
 
-    predict/swig_py_paddle_en.rst
+	v2/run_logic.rst
\ No newline at end of file
diff --git a/doc/api/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst
similarity index 100%
rename from doc/api/data_provider/dataprovider_cn.rst
rename to doc/api/v1/data_provider/dataprovider_cn.rst
diff --git a/doc/api/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst
similarity index 100%
rename from doc/api/data_provider/dataprovider_en.rst
rename to doc/api/v1/data_provider/dataprovider_en.rst
diff --git a/doc/api/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst
similarity index 100%
rename from doc/api/data_provider/pydataprovider2_cn.rst
rename to doc/api/v1/data_provider/pydataprovider2_cn.rst
diff --git a/doc/api/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst
similarity index 100%
rename from doc/api/data_provider/pydataprovider2_en.rst
rename to doc/api/v1/data_provider/pydataprovider2_en.rst
diff --git a/doc/api/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py
similarity index 100%
rename from doc/api/data_provider/src/mnist_config.py
rename to doc/api/v1/data_provider/src/mnist_config.py
diff --git a/doc/api/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py
similarity index 100%
rename from doc/api/data_provider/src/mnist_provider.dict.py
rename to doc/api/v1/data_provider/src/mnist_provider.dict.py
diff --git a/doc/api/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt
similarity index 100%
rename from doc/api/data_provider/src/mnist_train.txt
rename to doc/api/v1/data_provider/src/mnist_train.txt
diff --git a/doc/api/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py
similarity index 100%
rename from doc/api/data_provider/src/sentimental_config.py
rename to doc/api/v1/data_provider/src/sentimental_config.py
diff --git a/doc/api/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py
similarity index 100%
rename from doc/api/data_provider/src/sentimental_provider.py
rename to doc/api/v1/data_provider/src/sentimental_provider.py
diff --git a/doc/api/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt
similarity index 100%
rename from doc/api/data_provider/src/sentimental_train.txt
rename to doc/api/v1/data_provider/src/sentimental_train.txt
diff --git a/doc/api/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list
similarity index 100%
rename from doc/api/data_provider/src/train.list
rename to doc/api/v1/data_provider/src/train.list
diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3718cd73a2003b8ef6c406a9bd51dc68e76402dc
--- /dev/null
+++ b/doc/api/v1/index_cn.rst
@@ -0,0 +1,37 @@
+API中文手册
+============
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_cn.rst
+    data_provider/pydataprovider2_cn.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_cn.rst
diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..10c297a71d6988c002de868e804ed9ee2345fbd7
--- /dev/null
+++ b/doc/api/v1/index_en.rst
@@ -0,0 +1,37 @@
+API
+===
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_en.rst
+    data_provider/pydataprovider2_en.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_en.rst
diff --git a/doc/api/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py
similarity index 100%
rename from doc/api/predict/src/predict_sample.py
rename to doc/api/v1/predict/src/predict_sample.py
diff --git a/doc/api/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst
similarity index 100%
rename from doc/api/predict/swig_py_paddle_cn.rst
rename to doc/api/v1/predict/swig_py_paddle_cn.rst
diff --git a/doc/api/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst
similarity index 100%
rename from doc/api/predict/swig_py_paddle_en.rst
rename to doc/api/v1/predict/swig_py_paddle_en.rst
diff --git a/doc/api/trainer_config_helpers/activations.rst b/doc/api/v1/trainer_config_helpers/activations.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/activations.rst
rename to doc/api/v1/trainer_config_helpers/activations.rst
diff --git a/doc/api/trainer_config_helpers/attrs.rst b/doc/api/v1/trainer_config_helpers/attrs.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/attrs.rst
rename to doc/api/v1/trainer_config_helpers/attrs.rst
diff --git a/doc/api/trainer_config_helpers/data_sources.rst b/doc/api/v1/trainer_config_helpers/data_sources.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/data_sources.rst
rename to doc/api/v1/trainer_config_helpers/data_sources.rst
diff --git a/doc/api/trainer_config_helpers/evaluators.rst b/doc/api/v1/trainer_config_helpers/evaluators.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/evaluators.rst
rename to doc/api/v1/trainer_config_helpers/evaluators.rst
diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst
similarity index 93%
rename from doc/api/trainer_config_helpers/layers.rst
rename to doc/api/v1/trainer_config_helpers/layers.rst
index 4e429650e545179eca2f947e4af660222ad7cda8..bbea823de4d870f8a4384b6a85ebb7e8182797fe 100644
--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/v1/trainer_config_helpers/layers.rst
@@ -139,24 +139,12 @@ lstmemory
     :members: lstmemory
     :noindex:
 
-lstm_step_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: lstm_step_layer
-    :noindex:
-
 grumemory
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers
     :members: grumemory
     :noindex:
 
-gru_step_layer
----------------
-..  automodule:: paddle.trainer_config_helpers.layers
-    :members: gru_step_layer
-    :noindex:
-
 Recurrent Layer Group
 =====================
 
@@ -172,6 +160,18 @@ recurrent_group
     :members: recurrent_group
     :noindex:
     
+lstm_step_layer
+---------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: lstm_step_layer
+    :noindex:
+
+gru_step_layer
+---------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: gru_step_layer
+    :noindex:
+
 beam_search
 ------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -279,6 +279,12 @@ concat_layer
     :members: concat_layer
     :noindex:
 
+seq_concat_layer
+----------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: seq_concat_layer
+    :noindex:
+
 Reshaping Layers
 ================
 
@@ -302,6 +308,18 @@ repeat_layer
     :members: repeat_layer
     :noindex:
 
+rotate_layer
+------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: rotate_layer
+    :noindex:
+
+seq_reshape_layer
+-----------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: seq_reshape_layer
+    :noindex:
+
 Math Layers
 ===========
 
@@ -382,6 +400,15 @@ sampling_id_layer
     :members: sampling_id_layer
     :noindex:
 
+Slicing and Joining Layers
+==========================
+
+pad_layer
+-----------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: pad_layer
+    :noindex:
+
 ..  _api_trainer_config_helpers_layers_cost_layers:
 
 Cost Layers
@@ -441,6 +468,12 @@ ctc_layer
     :members: ctc_layer
     :noindex:
 
+warp_ctc_layer
+--------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: warp_ctc_layer
+    :noindex:
+
 nce_layer
 -----------
 ..  automodule:: paddle.trainer_config_helpers.layers
diff --git a/doc/api/trainer_config_helpers/networks.rst b/doc/api/v1/trainer_config_helpers/networks.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/networks.rst
rename to doc/api/v1/trainer_config_helpers/networks.rst
diff --git a/doc/api/trainer_config_helpers/optimizers.rst b/doc/api/v1/trainer_config_helpers/optimizers.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/optimizers.rst
rename to doc/api/v1/trainer_config_helpers/optimizers.rst
diff --git a/doc/api/trainer_config_helpers/poolings.rst b/doc/api/v1/trainer_config_helpers/poolings.rst
similarity index 100%
rename from doc/api/trainer_config_helpers/poolings.rst
rename to doc/api/v1/trainer_config_helpers/poolings.rst
diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst
new file mode 100644
index 0000000000000000000000000000000000000000..1c0a202a8c04322de3e9533c11fb5c74abac6c62
--- /dev/null
+++ b/doc/api/v2/data.rst
@@ -0,0 +1,93 @@
+================
+Data Related API
+================
+
+
+#########
+DataTypes
+#########
+
+..  automodule:: paddle.v2.data_type
+    :members:
+
+##########
+DataFeeder
+##########
+
+..  automodule:: paddle.v2.data_feeder
+    :members:
+
+######
+Reader
+######
+
+..  automodule:: paddle.v2.reader
+    :members:
+
+..  automodule:: paddle.v2.reader.creator
+    :members:
+
+#########
+minibatch
+#########
+
+..  automodule:: paddle.v2.minibatch
+    :members:
+
+#######
+Dataset
+#######
+
+..  automodule:: paddle.v2.dataset
+    :members:
+
+
+mnist
++++++
+
+..  automodule:: paddle.v2.dataset.mnist
+    :members:
+
+
+cifar
++++++
+
+..  automodule:: paddle.v2.dataset.cifar
+    :members:
+
+conll05
++++++++
+
+..  automodule:: paddle.v2.dataset.conll05
+    :members:
+
+imdb
+++++
+
+..  automodule:: paddle.v2.dataset.imdb
+    :members:
+
+imikolov
+++++++++
+
+..  automodule:: paddle.v2.dataset.imikolov
+    :members:
+
+movielens
++++++++++
+
+..  automodule:: paddle.v2.dataset.movielens
+    :members:
+
+sentiment
++++++++++
+
+..  automodule:: paddle.v2.dataset.sentiment
+    :members:
+
+uci_housing
++++++++++++
+
+..  automodule:: paddle.v2.dataset.uci_housing
+    :members:
+
diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e9cd3d5bf7b0e9e59c231bcabdb163a740909de1
--- /dev/null
+++ b/doc/api/v2/model_configs.rst
@@ -0,0 +1,46 @@
+#########################
+Configuration Related API
+#########################
+
+======
+Layers
+======
+
+..  automodule:: paddle.v2.layer
+    :members:
+
+
+==========
+Attributes
+==========
+
+..	automodule:: paddle.v2.attr
+	:members:
+
+===========
+Activations
+===========
+
+..	automodule:: paddle.v2.activation
+	:members:
+
+========
+Poolings
+========
+
+..	automodule:: paddle.v2.pooling
+	:members:
+
+========
+Networks
+========
+
+..	automodule:: paddle.v2.networks
+	:members:
+
+==========
+Optimizers
+==========
+
+..	automodule:: paddle.v2.optimizer
+	:members:
diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst
new file mode 100644
index 0000000000000000000000000000000000000000..904d45966dfc16a474016ff48fd5a951988b0ab0
--- /dev/null
+++ b/doc/api/v2/run_logic.rst
@@ -0,0 +1,26 @@
+###########
+Trainer API
+###########
+
+==========
+Parameters
+==========
+
+..  automodule:: paddle.v2.parameters
+    :members:
+
+
+=======
+Trainer
+=======
+
+..	automodule:: paddle.v2.trainer
+	:members:
+
+
+=====
+Event
+=====
+
+..	automodule:: paddle.v2.event
+	:members:
diff --git a/doc/design/api.md b/doc/design/api.md
new file mode 100644
index 0000000000000000000000000000000000000000..8185d2af0ea264a2e7b4e28b9ed05279e4a22014
--- /dev/null
+++ b/doc/design/api.md
@@ -0,0 +1,262 @@
+# PaddlePaddle Design Doc
+
+## Ingredients
+
+Our design principle is to start from the essence: how can we
+allow users to express and solve their problems with neural networks?
+Some essential concepts that our API has to provide include:
+
+1. A *topology* is an expression of *layers*.
+
+1. A layer could be any kind of computation, including *cost*.
+
+1. Some layers have parameters, some don't. Most costs don't have
+   parameters.
+
+1. In some topologies, layers share parameters.  For
+   example,
+   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
+
+1. At programming time, users specify topologies and possible sharing
+   of parameters.  PaddlePaddle can figure out and create parameters
+   required (and possibly shared) by one or more topologies.
+
+
+## Starting from Examples
+
+As a summary
+of
+[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
+
+
+### Example 1. Sharing Parameters between Layers
+
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example.  For your convenience, I copy and paste the model's
+topology as follows:
+
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
+
+The following program trains the topology including the cost, and then
+uses the sub-network in the trained topology in inference:
+
+```python
+def f(in):
+    e = paddle.layer.embedding(in, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets), they share parameters because all
+# corresponding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+               paddle.layer.cross_entropy(fA, fQ),
+               paddle.layer.cross_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fQ, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The semantic-vector of testA: ", paddle.infer(fA, parameters, testA)
+```
+
+
+### Example 2. Sharing Parameters between "Models"
+
+We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
+this example.  In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
+
+<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
+
+```python
+def G(in):
+    # over-simplified example as G has only one layer:
+    return paddle.layer.fc(in, parameter_name="G")
+
+def D(in):
+    # again, over-simplified:
+    return paddle.layer.fc(in, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D.  Note
+# that d1 shares parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
+
+
+### Summarization
+
+
+The above two programs reveal some important design concerns:
+
+1. Users describe a topology as an expression of layers.  Every layer
+   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
+   specifying the parameter name, users can specify the sharing of
+   parameters between layers and even between topologies.
+
+1. `paddle.parameters.create` figures out parameters required by one
+   or more topologies from parameter names of layers.  It creates these
+   parameters and returns a `ParameterSet` object, which is in essence
+   a map from *parameter names* to *parameters*.
+
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   require both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
+
+   1. This prevents users from forgetting to call
+      `paddle.parameters.create`.
+   1. `paddle.train` needs to know which parameter set to update.
+   1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `paddle.infer`.
+
+1. By specifying the `immutable_parameters` parameter of
+   `paddle.train`, we can forbid the update of these parameters.
+
+
+## Reader
+
+Not all programming frameworks allow users to define I/O functions.
+An example is Google MapReduce, which can only read from text,
+SSTable, and RecordIO files.  Hadoop MapReduce allows users to define
+readers and writers by deriving from base classes `Reader` and
+`Writer`.  The former is less flexible but also less error-prone.  We
+decide to provide the flexibility to users to define their readers.
+
+
+There are some open questions here:
+
+1. **Should a reader return a Python dictionary?**
+
+1. **How to map multiple outputs from a reader to multiple data layers?**
+
+1. **How to easily compose some existing readers to read more data and
+   feed a topology with more data layers?**
+
+
+## Training
+
+The recommended way to train a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`.  Equivalently, we can do
+
+```python
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
+```
+
+### Updater
+
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`.  This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
+
+### Event Handler
+
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handles some events:
+
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
+
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
+
+An example as follows:
+
+```python
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
+```
+
+If we are writing a PaddlePaddle program in and for IPython/Jupyter,
+we can use matplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
+
+If a user wants to do distributed training on a cluster, s/he should
+call `paddle.dist_train` and provide access tokens to the cluster as
+a parameter.
+
+For example, if the user has a TLS certificate that allows him to
+access a Kubernetes cluster, s/he should be able to call
+
+```python
+paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
+                  reader=read,
+                  k8s_user="yi",
+                  k8s_token="kube_cluster_tls.pem",
+                  k8s_job="hello",
+                  num_parameter_servers=15)
+```
+
+The pseudo code of `paddle.dist_train` is as follows:
+
+```python
+def dist_train(topology, parameters, trainer, reader, ...):
+    if os.getenv("KUBERNETES_SERVICE_HOST") == None:
+        image_name = k8s_user + '/' + k8s_job
+        docker_build(image_name)
+        docker_push()
+        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
+    else:
+        rank = kube_list_containers_in_job_and_return_current_containers_rank()
+        if rank == 0:
+            master()
+        elif rank < 15:
+            parameter_server()
+        else:
+            trainer.train(model, reader=read)
+```
+
+Please be aware that if a process is running on the Kubernetes
+cluster, it will have some environment variables pre-defined.
+
+If `dist_train` doesn't see these environment variables, it knows
+that it's running on users' personal computer, and it should work as a
+*launcher*.  Otherwise, it knows that it's running on the cluster and
+need to figure out its role as either the master, or a trainer, or a
+parameter server.
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f21f7af520df5171798326818ecb97c3bcd14a12
--- /dev/null
+++ b/doc/design/reader/README.md
@@ -0,0 +1,202 @@
+# Python Data Reader Design Doc
+
+At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+
+and provide a function which converts a reader to a batch reader, as well as frequently used reader creators and reader decorators.
+
+## Data Reader Interface
+
+Indeed, a *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`):
+
+```
+iterable = data_reader()
+```
+
+Each element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Each item should be of a [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int).
+
+An example implementation for single item data reader creator:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+```
+
+An example implementation for multiple item data reader creator:
+```python
+def reader_creator_random_image_and_label(width, height, label):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+```
+
+## Batch Reader Interface
+
+A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+
+Here are valid outputs:
+```python
+# a mini batch of three data items. Each data item consists of three columns of data.
+[(1, 1, 1),
+(2, 2, 2),
+(3, 3, 3)]
+
+# a mini batch of three data items, each data item is a list (single column).
+[([1,1,1],),
+([2,2,2],),
+([3,3,3],)]
+```
+
+Please note that each item inside the list must be a tuple, below is an invalid output:
+```python
+ # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
+ # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
+[[1,1,1],
+[2,2,2],
+[3,3,3]]
+```
+
+It's easy to convert from reader to batch reader:
+```python
+mnist_train = paddle.dataset.mnist.train()
+mnist_train_batch_reader = paddle.batch(mnist_train, 128)
+```
+
+Also easy to create custom batch reader:
+```python
+def custom_batch_reader():
+    while True:
+        batch = []
+        for i in xrange(128):
+            batch.append((numpy.random.uniform(-1, 1, 28*28),)) # note that it's a tuple being appended.
+        yield batch
+
+mnist_random_image_batch_reader = custom_batch_reader
+```
+
+## Usage
+
+The batch reader, the mapping from the item(s) read to the data layers, the batch size and the total number of passes are passed into `paddle.train`:
+
+```python
+# two data layers are created:
+image_layer = paddle.layer.data("image", ...)
+label_layer = paddle.layer.data("label", ...)
+
+# ...
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
+```
+
+## Data Reader Decorator
+
+A *data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
+
+Since we have a strict interface for data readers (no parameters, return a single data item), data readers can be used flexibly via data reader decorators. Following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take time and training cannot proceed without data, it is generally a good idea to prefetch data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+def reader_creator_bool(t):
+    def reader():
+        while True:
+            yield t
+    return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
+# And we don't care about the second item at this time.
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given a shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffles them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
+```
+
+## Q & A
+
+### Why reader return only a single entry, but not a mini batch?
+
+Always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returns not a single entry but 3 entries, the training code will be more complex because it needs to handle cases like batch size 2).
+
+We provide function `paddle.batch` to turn (single entry) reader into batch reader.
+
+### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
+
+In most cases, having train take reader and batch_size as arguments would be sufficient. However, sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically.
+
+### Why use a dictionary but not a list to provide mapping?
+
+We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because the user can easily reuse an item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip an item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create custom data reader creator
+
+```python
+def image_reader_creator(image_path, label_path, n):
+    def reader():
+        f = open(image_path)
+        l = open(label_path)
+        images = numpy.fromfile(
+            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+        images = images / 255.0 * 2.0 - 1.0
+        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+        for i in xrange(n):
+            yield images[i, :], labels[i] # a single entry of data is created each time
+        f.close()
+        l.close()
+    return reader
+
+# image_reader_creator creates a reader
+reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
+```
+
+### How is `paddle.train` implemented
+
+An example implementation of paddle.train could be:
+
+```python
+def train(batch_reader, mapping, batch_size, total_pass):
+    for pass_idx in range(total_pass):
+        for mini_batch in batch_reader(): # this loop will never end in online learning.
+            do_forward_backward(mini_batch, mapping)
+```
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index 7d425a05d46131d84ba895d0fefc3a592a9a36e1..6d5367177da2af6276698f94f86664a5b506dca2 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -286,22 +286,3 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..      code-block:: bash
 
         paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
-
-12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
----------------------------------------------------
-
-目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
-:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
-
-..  code-block:: bash
-
-    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
-
-来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
-
-..  code-block:: bash
-
-    git submodule init
-    git submodule update
-
-来获得所有第三方模块。
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index aaa07d49d3148266db27670a98c2b27db4dc0a8f..d9d54bff3096cb3520409971dbd1b2e179ac8be1 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
+* [4. Build on Centos](#centos)
+
 
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -11,32 +13,22 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
 ```bash
 git clone https://github.com/PaddlePaddle/Paddle paddle
 cd paddle
-git submodule update --init --recursive
-```
-
-If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
-
-If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
 ```
-git submodule update --remote
-```
-
 ## <span id="requirements">Requirements</span>
 
 To compile the source code, your computer must be equipped with the following dependencies.
 
-- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1)
-- **CMake**: version >= 2.8
+- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler
+- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
 - **BLAS**: MKL, OpenBlas or ATLAS
-- **Protocol Buffers**: version >= 2.4, **Note: 3.x is not supported**
-- **Python**: only python 2.7 is supported currently
+- **Python**: only Python 2.7 is supported
 
 **Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
 For CUDA 8.0, GCC versions later than 5.3 are not supported!
 
 ### Options
 
-PaddlePaddle supports some build options. To enable it, first you need to install the related libraries. 
+PaddlePaddle supports some build options. 
 
 <html>
 <table> 
@@ -47,12 +39,21 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 </tr>
 </thead>
 <tbody>
-<tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
-<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
-<tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
-<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
-<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
+<tr><td class="left">WITH_GPU</td><td class="left">Compile PaddlePaddle with NVIDIA GPU</td></tr>
+<tr><td class="left">WITH_AVX</td><td class="left">Compile PaddlePaddle with AVX intrinsics</td></tr>
+<tr><td class="left">WITH_DSO</td><td class="left">Compile PaddlePaddle with dynamic linked CUDA</td></tr>
+<tr><td class="left">WITH_TESTING</td><td class="left">Compile PaddlePaddle with unit testing</td></tr>
+<tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile PaddlePaddle with inference api</td></tr>
+<tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile PaddlePaddle with style check</td></tr>
+<tr><td class="left">WITH_PYTHON</td><td class="left">Compile PaddlePaddle with python interpreter</td></tr>
+<tr><td class="left">WITH_DOUBLE</td><td class="left">Compile PaddlePaddle with double precision</td></tr>
+<tr><td class="left">WITH_RDMA</td><td class="left">Compile PaddlePaddle with RDMA support</td></tr>
+<tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
+<tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
+<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
+<tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
+<tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
 </tbody>
 </table>
 </html>
@@ -64,18 +65,16 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 
 As a simple example, consider the following:  
 
-1. **Python Dependencies(optional)**
+1. **BLAS Dependencies(optional)**
   
-    To compile PaddlePaddle with python predict API, make sure swig installed and set `-DWITH_SWIG_PY=ON` as follows:
+    CMake will search for BLAS libraries on the system. If none is found, OpenBLAS will be downloaded, built and installed automatically.
+    To use a preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
-    # install swig on ubuntu
-    sudo apt-get install swig
-    # install swig on Mac OS X
-    brew install swig
-
-    # active swig in cmake
-    cmake .. -DWITH_SWIG_PY=ON
+    # specify MKL
+    cmake .. -DMKL_ROOT=<mkl_path>
+    # or specify OpenBLAS
+    cmake .. -DOPENBLAS_ROOT=<openblas_path>
     ```
 
 2. **Doc Dependencies(optional)**
@@ -99,24 +98,21 @@ As a simple example, consider the following:
 
 ### Install Dependencies
 
-- **CPU Dependencies**
+- **Paddle Dependencies**
 
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake swig build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
-    # optional
-    sudo apt-get install libgoogle-glog-dev
-    sudo apt-get install libgflags-dev
-    sudo apt-get install libgtest-dev
-    sudo pip install wheel
-    pushd /usr/src/gtest
-    cmake .
-    make
-    sudo cp *.a /usr/lib
-    popd
+    sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake
+    sudo apt-get install -y python python-pip python-numpy libpython-dev bison
+    sudo pip install 'protobuf==3.1.0.post1'
+
+    # install cmake 3.4
+    curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+        cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
+        cd .. && rm -rf cmake-3.4.1
     ```
-  
+
 - **GPU Dependencies (optional)**
 
     To build GPU version, you will need the following installed:
@@ -149,51 +145,78 @@ As usual, the best option is to create build folder under paddle project directo
 
 ```bash
 mkdir build && cd build
-cmake ..
+``` 
+
+Finally, you can build and install PaddlePaddle:
+
+```bash
+# you can add build option here, such as:    
+cmake .. -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
+# install PaddlePaddle Python modules.
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+## <span id="centos">Build on Centos 7</span>
 
-CMake first check PaddlePaddle's dependencies in system default path. After installing some optional
-libraries, corresponding build option will be set automatically (for instance, glog, gtest and gflags).
-If still not found, you can manually set it based on CMake error information from your screen.
+### Install Dependencies
 
-As a simple example, consider the following:
+- **CPU Dependencies**
 
-- **Only CPU with swig**
+    ```bash
+    # necessary
+    sudo yum update
+    sudo yum install -y epel-release
+    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
+    sudo pip install wheel numpy
+    sudo pip install 'protobuf>=3.0.0'
+    ```
+  
+- **GPU Dependencies (optional)**
 
-  ```bash
-  cmake  .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON
-  ```
-- **GPU with swig**
+    To build GPU version, you will need the following installed:
 
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON
-  ```
+        1. a CUDA-capable GPU
+        2. A supported version of Linux with a gcc compiler and toolchain
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    After downloading cuDNN library, issue the following commands:
+
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
 
-- **GPU with doc and swig**
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+
+### Build and Install
+
+As usual, the best option is to create build folder under paddle project directory.
 
-  ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
-  ``` 
+```bash
+mkdir build && cd build
+``` 
 
-Finally, you can build PaddlePaddle:
+Finally, you can build and install PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:    
-cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install> -DWITH_SWIG_PY=ON
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=<path to install>/bin:$PATH
-```
-
-If you set `WITH_SWIG_PY=ON`, related python dependencies also need to be installed.
-Otherwise, PaddlePaddle will automatically install python dependencies
-at first time when user run paddle commands, such as `paddle version`, `paddle train`.
-It may require sudo privileges:
-
-```bash
-# you can run
+# install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
-# or just run 
-sudo paddle version
 ```
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
index 3a52c8723bbccd70dd89e8913092d92813925f90..be0c1ffa451b2901ec06621dd4d886f800b4562e 100644
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -40,4 +40,4 @@ PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。
 
     cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
 
-注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
+注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 35234e0eb3ece3cb20d62841c1d75e60b485b9ea..6b132d2a4d31ab85347bd41d0243ffee858ac909 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -12,7 +12,7 @@ PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Dock
 PaddlePaddle提供的Docker镜像版本
 --------------------------------
 
-我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddle-dev/paddle` ，tag分别为
+我们提供了12个 `Docker image <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ，他们的image name都是 :code:`paddledev/paddle` ，tag分别为
 
 +-----------------+------------------+------------------------+-----------------------+
 |                 |   normal         |           devel        |          demo         |
@@ -45,7 +45,7 @@ PaddlePaddle提供的Docker镜像版本
 
     if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
 
-如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddle-dev/paddle:cpu-devel-latest` 来引用这个image。
+如果输出 :code:`Support AVX`，则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image，则可以使用 :code:`paddledev/paddle:cpu-devel-latest` 来引用这个image。
 
 PaddlePaddle提供的镜像并不包含任何命令运行，想要运行PaddlePaddle，您需要进入镜像运行PaddlePaddle
 程序或者自定义一个含有启动脚本的image。具体请参考注意事项中的 :code:`使用ssh访问PaddlePaddle镜像`
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 34279a29b2e4c84aa5039f2e5ab2c6ed9a06da2f..5a1056e859a0c977c9cd365ff1e4ffe58596f41f 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -16,80 +16,71 @@ Developers can work on PaddlePaddle using Docker.  This allows
 developers to work on different platforms -- Linux, Mac OS X, and
 Windows -- in a consistent way.
 
-The general development workflow with Docker and Bazel is as follows:
-
-1. Get the source code of Paddle:
+1. Build the Development Environment as a Docker Image
 
    .. code-block:: bash
 
-      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+      git clone --recursive https://github.com/PaddlePaddle/Paddle
+      cd Paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
 
-   
-   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
 
-   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
-   empty, please use the following command to get the submodule.
+   Note that by default :code:`docker build` wouldn't import source
+   tree into the image and build it.  If we want to do that, we need
+   to set a build arg:
 
    .. code-block:: bash
 
-      git submodule update --init --recursive
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
+
 
+2. Run the Development Environment
 
-2. Build a development Docker image :code:`paddle:dev` from the source
-   code.  This image contains all the development tools and
-   dependencies of PaddlePaddle.
+   Once we got the image :code:`paddle:dev`, we can use it to develop
+   Paddle by mounting the local source code tree into a container that
+   runs the image:
 
    .. code-block:: bash
 
-      cd paddle
-      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+      docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
 
-   Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such case, we can specify an apt-source mirror server that is geologically nearer to us. In the following example, we specified an apt-source server that responds fast in China.You can specify the UBUNTU MIRROR with :code:`--build-arg UBUNTU_MIRROR` like the example below.
+   This runs a container of the development environment Docker image
+   with the local source tree mounted to :code:`/paddle` of the
+   container.
 
-   .. code-block:: bash
+   Note that the default entry-point of :code:`paddle:dev` is
+   :code:`sshd`, and the above :code:`docker run` command actually starts
+   an SSHD server listening on port 2202.  This allows us to log into
+   this container with:
 
-      docker build \
-       --build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
-       -t paddle:dev \
-       -f paddle/scripts/docker/Dockerfile .
+   .. code-block:: bash
 
+      ssh root@localhost -p 2202
 
-3. Run the image as a container and mounting local source code
-   directory into the container.  This allows us to change the code on
-   the host and build it within the container.
+   Usually, I run above commands on my Mac.  I can also run them on a
+   GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
 
    .. code-block:: bash
 
-      docker run       \
-       -d              \
-       --name paddle   \
-       -p 2022:22      \
-       -v $PWD:/paddle \
-       paddle:dev
+      my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
 
-   where :code:`-d` makes the container running in background,
-   :code:`--name paddle` allows us to run a nginx container to serve
-   documents in this container, :code:`-p 2022:22` allows us to SSH
-   into this container, :code:`-v $PWD:/paddle` shares the source code
-   on the host with the container.
+3. Build and Install Using the Development Environment
 
-4. SSH into the container:
+   Once I am in the container, I can use
+   :code:`paddle/scripts/docker/build.sh` to build, install, and test
+   Paddle:
 
    .. code-block:: bash
 
-      ssh root@localhost -p 2022
+      /paddle/paddle/scripts/docker/build.sh
 
-5. We can edit the source code in the container or on this host.  Then
-   we can build using cmake
+   This builds everything about Paddle in :code:`/paddle/build`.  And
+   we can run unit tests there:
 
    .. code-block:: bash
 
-      cd /paddle # where paddle source code has been mounted into the container
-      mkdir -p build
-      cd build
-      cmake -DWITH_TESTING=ON ..
-      make -j `nproc`
-      CTEST_OUTPUT_ON_FAILURE=1 ctest
+      cd /paddle/build
+      ctest
 
 
 CPU-only and GPU Images
@@ -162,7 +153,6 @@ source code:
    cd ~
    git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
-   git submodule update --init --recursive
    docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
    docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
 
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 943b1d4bb84646d9f60de7790be166a83d10b1e0..4b328fc9d38bc5dfec35d5e0f0d46136aeeb41bc 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -32,7 +32,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
         
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
 
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
@@ -54,7 +54,7 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
         last = last_seq(input=layer,
                         agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
+  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_cn.md b/doc/howto/deep_model/rnn/rnn_cn.md
deleted file mode 100644
index 5ec05b2cab9ba85f9f6e9644375ee14f647a413c..0000000000000000000000000000000000000000
--- a/doc/howto/deep_model/rnn/rnn_cn.md
+++ /dev/null
@@ -1,226 +0,0 @@
-RNN 配置
-=================
-
-本教程将指导你如何在 PaddlePaddle 中配置循环神经网络（RNN）。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
-
--   准备用来学习循环神经网络的序列数据。
--   配置循环神经网络架构。
--   使用学习完成的循环神经网络模型生成序列。
-
-我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo / seqToseq`找到。
-
-准备序列数据
----------------------
-
-PaddlePaddle 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。 它们都是序列，它们的大小是`src_dict`，`trg_dict`和`trg_dict`：
-
-``` sourceCode
-settings.input_types = [
-  integer_value_sequence(len(settings.src_dict)),
-  integer_value_sequence(len(settings.trg_dict)),
-  integer_value_sequence(len(settings.trg_dict))]
-```
-
-在`process`函数中，每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-
-``` sourceCode
-yield src_ids, trg_ids, trg_ids_next
-```
-
-有关如何编写数据提供程序的更多细节描述，请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`。
-
-配置循环神经网络架构
------------------------------------------------
-
-### 简单门控循环神经网络(Gated Recurrent Neural Network)
-
-循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
-
-![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg)
-
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* = 1 执行以下操作。
-
-*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>),*y*<sub>*t*</sub> = *f*<sub>*y*</sub>(*x*<sub>*t*</sub>)
-
-其中 *f*<sub>*x*</sub>(.) 称为**单步函数**（即单时间步执行的函数，step function），而 *f*<sub>*y*</sub>(.) 称为**输出函数**。在 vanilla 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用`grumemory`和`lstmemory`，因为它们的计算效率比`recurrent_group`更高。
-
-对于 vanilla RNN，在每个时间步长，**单步函数**为：
-
-*x*<sub>*t* + 1</sub> = *W*<sub>*x*</sub>*x*<sub>*t*</sub> + *W*<sub>*i*</sub>*I*<sub>*t*</sub> + *b*
-
-其中 *x*<sub>*t*</sub> 是RNN状态，并且 *I*<sub>*t*</sub> 是输入，*W*<sub>*x*</sub> 和 *W*<sub>*i*</sub> 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x*<sub>*t*</sub>作为输出。
-
-`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数**，**输出函数**和循环神经网络的输入。注意，这个函数的`step`参数需要实现`step function`（单步函数）和`output function`（输出函数）：
-
-
-``` sourceCode
-def simple_rnn(input,
-               size=None,
-               name=None,
-               reverse=False,
-               rnn_bias_attr=None,
-               act=None,
-               rnn_layer_attr=None):
-    def __rnn_step__(ipt):
-       out_mem = memory(name=name, size=size)
-       rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                      full_matrix_projection(out_mem)],
-                             name = name,
-                             bias_attr = rnn_bias_attr,
-                             act = act,
-                             layer_attr = rnn_layer_attr,
-                             size = size)
-       return rnn_out
-    return recurrent_group(name='%s_recurrent_group' % name,
-                           step=__rnn_step__,
-                           reverse=reverse,
-                           input=input)
-```
-
-PaddlePaddle 使用“Memory”（记忆模块）实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态，例如*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>)。 一个Memory包含**输出**和**输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**，其输出被用作Memory的初始值。 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` (*x*<sub>*t* + 1</sub>)的输出被用作`out_mem`Memory的**输出**。
-
-Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
-
-我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。
-
-### Sequence to Sequence Model with Attention
-
-我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
-
-![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png)
-
-在这个模型中，源序列 *S* = {*s*<sub>1</sub>, …, *s*<sub>*T*</sub>} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H*<sub>*S*</sub> = {*H*<sub>1</sub>, …, *H*<sub>*T*</sub>} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y*<sub>*t*</sub>时, 这个门控循环神经网络生成一系列权重 *W*<sub>*S*</sub><sup>*t*</sup> = {*W*<sub>1</sub><sup>*t*</sup>, …, *W*<sub>*T*</sub><sup>*t*</sup>}, 用于计算编码向量的加权和。加权和用来生成*y*<sub>*t*</sub>。
-
-模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。
-
-我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例，并将其投射到 `decoder_size` 维空间完成：
-
-``` sourceCode
-# 定义源语句的数据层
-src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-# 计算每个词的词向量
-src_embedding = embedding_layer(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_source_language_embedding'))
-# 应用前向循环神经网络
-src_forward = grumemory(input=src_embedding, size=encoder_size)
-# 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-src_backward = grumemory(input=src_embedding,
-                          size=encoder_size,
-                          reverse=True)
-# 将循环神经网络的前向和反向部分混合在一起
-encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-# 投射编码向量到 decoder_size
-encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                           size = decoder_size)
-
-# 计算反向RNN的第一个实例
-backward_first = first_seq(input=src_backward)
-
-# 投射反向RNN的第一个实例到 decoder size
-decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
-```
-
-解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-trg_embedding = embedding_layer(
-    input=data_layer(name='target_language_word',
-                     size=target_dict_dim),
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_target_language_embedding'))
-group_inputs.append(trg_embedding)
-
-# 对于配备有注意力机制的解码器，在训练中，
-# 目标向量（groudtruth）是数据输入，
-# 而源序列的编码向量可以被无边界的memory访问
-# StaticInput 意味着不同时间步的输入都是相同的值，
-# 否则它以一个序列输入，不同时间步的输入是不同的。
-# 所有输入序列应该有相同的长度。
-decoder = recurrent_group(name=decoder_group_name,
-                          step=gru_decoder_with_attention,
-                          input=group_inputs)
-```
-
-单步函数的实现如下所示。首先，它定义解码网络的**Memory**。然后定义 attention，门控循环单元单步函数和输出函数：
-
-``` sourceCode
-def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-    # 定义解码器的Memory
-    # Memory的输出定义在 gru_step 内
-    # 注意 gru_step 应该与它的Memory名字相同
-    decoder_mem = memory(name='gru_decoder',
-                         size=decoder_size,
-                         boot_layer=decoder_boot)
-    # 计算 attention 加权编码向量
-    context = simple_attention(encoded_sequence=enc_vec,
-                               encoded_proj=enc_proj,
-                               decoder_state=decoder_mem)
-    # 混合当前词向量和attention加权编码向量
-    decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                           full_matrix_projection(current_word)],
-                                 size = decoder_size * 3)
-    # 定义门控循环单元循环神经网络单步函数
-    gru_step = gru_step_layer(name='gru_decoder',
-                              input=decoder_inputs,
-                              output_mem=decoder_mem,
-                              size=decoder_size)
-    # 定义输出函数
-    out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                      size=target_dict_dim,
-                      bias_attr=True,
-                      act=SoftmaxActivation())
-    return out
-```
-
-生成序列
------------------
-
-训练模型后，我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意，`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。
-
--   使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。
--   使用 `beam_search` 函数。这个函数需要设置：
-    -   `bos_id`: 开始标记。每个句子都以开始标记开头。
-    -   `eos_id`: 结束标记。每个句子都以结束标记结尾。
-    -   `beam_size`: beam search 算法中的beam大小。
-    -   `max_length`: 生成序列的最大长度。
--   使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置：
-    -   `id_input`: 数据的整数ID，用于标识生成的文件中的相应输出。
-    -   `dict_file`: 用于将词ID转换为词的字典文件。
-    -   `result_file`: 生成结果文件的路径。
-
-代码如下：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-# 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
-# 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
-# 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-trg_embedding = GeneratedInput(
-    size=target_dict_dim,
-    embedding_name='_target_language_embedding',
-    embedding_size=word_vector_dim)
-group_inputs.append(trg_embedding)
-beam_gen = beam_search(name=decoder_group_name,
-                       step=gru_decoder_with_attention,
-                       input=group_inputs,
-                       bos_id=0, # Beginnning token.
-                       eos_id=1, # End of sentence token.
-                       beam_size=beam_size,
-                       max_length=max_length)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=data_layer(name="sent_id", size=1),
-                          dict_file=trg_dict_path,
-                          result_file=gen_trans_file)
-outputs(beam_gen)
-```
-
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。
-
-完整的配置文件在`demo/seqToseq/seqToseq_net.py`。
diff --git a/doc/howto/deep_model/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
similarity index 86%
rename from doc/howto/deep_model/rnn_config_cn.rst
rename to doc/howto/deep_model/rnn/rnn_config_cn.rst
index e6d8c1133a5e8a481c9bf5340c4641343804dcbe..ac2bd0775f4ab2e0a0c37462e2c23001123b152b 100644
--- a/doc/howto/deep_model/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -1,4 +1,4 @@
-RNN 配置
+RNN配置
 ========
 
 本教程将指导你如何在 PaddlePaddle
@@ -20,7 +20,7 @@ PaddlePaddle
 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
 它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
 
-.. code:: sourcecode
+.. code:: python
 
     settings.input_types = [
       integer_value_sequence(len(settings.src_dict)),
@@ -29,12 +29,11 @@ PaddlePaddle
 
 在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
 
-.. code:: sourcecode
+.. code:: python
 
     yield src_ids, trg_ids, trg_ids_next
 
-有关如何编写数据提供程序的更多细节描述，请参考
-`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在
+有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
 ``demo/seqToseq/dataprovider.py``\ 。
 
 配置循环神经网络架构
@@ -45,18 +44,17 @@ PaddlePaddle
 
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
 
-.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-   :alt: image
+.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+      :align: center
 
-   image
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
 
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t*
-= 1 执行以下操作。
+.. math::
 
-*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ )
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
 
-其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ （即单时间步执行的函数，step
-function），而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
 sequence
@@ -67,16 +65,17 @@ vanilla
 
 对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：
 
-*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b*
+.. math::
 
-其中 *x*\ \ *t*\  是RNN状态，并且 *I*\ \ *t*\  是输入，\ *W*\ \ *x*\  和
-*W*\ \ *i*\  分别是RNN状态和输入的变换矩阵。\ *b*
-是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
 
 ``recurrent_group``\ 是构建循环神经网络的最重要的工具。
 它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：
 
-.. code:: sourcecode
+.. code:: python
 
     def simple_rnn(input,
                    size=None,
@@ -102,7 +101,7 @@ vanilla
 
 PaddlePaddle
 使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
-Memory是在单步函数中循环使用的状态，例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
 一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
 layer(引导层)**\ ，其输出被用作Memory的初始值。
 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
@@ -120,30 +119,25 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
 
-.. figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
-   :alt: image
-
-   image
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center
 
-在这个模型中，源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ }
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
-*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为
-*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时,
-这个门控循环神经网络生成一系列权重
-*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ },
-用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解码每一个 :math:`y_t` 时，
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
 
 模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比
 ``recurrent_group``
-更快。我们已经实现了大多数常用的循环神经网络架构，可以参考
-`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__
-了解更多细节。
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
 
 我们还将编码向量投射到 ``decoder_size``
 维空间。这通过获得反向循环网络的第一个实例，并将其投射到
 ``decoder_size`` 维空间完成：
 
-.. code:: sourcecode
+.. code:: python
 
     # 定义源语句的数据层
     src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
@@ -174,7 +168,7 @@ Sequence to Sequence Model with Attention
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -198,7 +192,7 @@ Sequence to Sequence Model with Attention
 单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
 attention，门控循环单元单步函数和输出函数：
 
-.. code:: sourcecode
+.. code:: python
 
     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         # 定义解码器的Memory
@@ -253,7 +247,7 @@ attention，门控循环单元单步函数和输出函数：
 
 代码如下：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -279,9 +273,6 @@ attention，门控循环单元单步函数和输出函数：
                               result_file=gen_trans_file)
     outputs(beam_gen)
 
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅
-`Semantic Role Labeling
-Demo <../../demo/semantic_role_labeling/index.html>`__
-了解更多详细信息。
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
 
 完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
index e0a63f5a14c7b2e8953aa21739668ee2a9ebeff1..ee1b3213eaed3bfd94e449997dff9848b8fb4228 100644
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -33,7 +33,6 @@ cd Paddle
 git checkout -b develop  # 创建 develop 分支
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
 git pull upstream develop  # 更新 upstream
-git submodule update --init --recursive
 ```
 
 然后你可以通过做一个本地开发分支开始开发
diff --git a/doc/howto/dev/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
index e578f6fce8b94180da7d5de041a0e17b1d59f6ea..9b0d3e83c0dc264650eda73e6801c60a75439b4a 100644
--- a/doc/howto/dev/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -38,7 +38,6 @@ cd Paddle
 git checkout -b develop  # create develop branch.
 git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # add upstream to baidu/Paddle
 git pull upstream develop  # update to upstream
-git submodule update --init --recursive
 ```
 
 Then you can start to develop by making a local developement branch
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9489a921c70ad6ee5709f46445554f5d9640162c
--- /dev/null
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -0,0 +1,389 @@
+================
+实现新的网络层
+================
+
+这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。
+
+1. 推导该层前向和后向传递的方程。
+2. 实现该层的C++类。
+3. 增加梯度检测的单元测试，以保证梯度的正确计算。
+4. 封装该层的Python接口。
+
+推导方程
+================
+
+首先我们需要推导该网络层的\ *前向传播*\ 和\ *后向传播*\ 的方程。前向传播给定输入，计算输出。后向传播给定输出的梯度，计算输入和参数的梯度。
+
+下图是一个全连接层的示意图。在全连接层中，每个输出节点都连接到所有的输入节点上。
+
+..  image:: FullyConnected.jpg
+    :align: center
+    :scale: 60 %
+
+一个网络层的前向传播部分把输入转化为相应的输出。
+全连接层以一个维度为 :math:`D_i` 的稠密向量 :math:`x` 作为输入，使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量，并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。
+
+.. math::
+
+   y = f(W^T x + b)
+
+其中 :math:`f(.)` 是一个非线性的\ *激活函数*\ ，例如 sigmoid、tanh 以及 ReLU。
+
+变换矩阵 :math:`W` 和偏置向量 :math:`b` 是该网络层的\ *参数*\ 。一个网络层的参数是在\ *反向传播*\ 时被训练的。反向传播根据输出的梯度，分别计算每个参数的梯度，以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。
+
+假设损失函数是 :math:`c(y)` ，那么
+
+.. math::
+
+   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
+
+假设 :math:`z = W^T x + b` ，那么
+
+.. math::
+
+   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}
+
+PaddlePaddle的base layer类可以自动计算上面的导数。
+
+因此，对全连接层来说，我们需要计算：
+
+.. math::
+
+   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
+
+其中 :math:`\mathbf 1` 是一个全1的向量， :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值， :math:`z_j` 是向量 :math:`z` 的第j个值， :math:`x_i` 是向量 :math:`x` 的第i个值。
+
+最后我们使用链式法则计算 :math:`\frac{\partial c(y)}{\partial x}` 以及 :math:`\frac{\partial c(y)}{\partial W}` 。计算的细节将在下面的小节给出。
+
+实现C++类
+===================
+
+一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于 :code:`paddle/gserver/layers/FullyConnectedLayer.h` 及 :code:`paddle/gserver/layers/FullyConnectedLayer.cpp` 。这里我们展示一份简化过的代码。
+
+这个类需要继承 :code:`paddle::Layer` 这个基类，并且需要重写基类中的以下几个虚函数：
+
+- 类的构造函数和析构函数。
+- :code:`init` 函数。用于初始化参数和设置。
+- :code:`forward` 。实现网络层的前向传播。
+- :code:`backward` 。实现网络层的后向传播。
+- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新，则不需要重写该函数。（大多数网络层不需要支持远程稀疏更新）
+
+
+头文件如下：
+
+.. code-block:: c++
+
+    namespace paddle {
+    /**
+     * 全连接层的每个输出都连接到上一层的所有的神经元上。
+     * 它的输入与经过学习的参数做内积并加上偏置（可选）。
+     *
+     * 配置文件接口是fc_layer。
+     */
+
+    class FullyConnectedLayer : public Layer {
+    protected:
+      WeightList weights_;
+      std::unique_ptr<Weight> biases_;
+
+    public:
+      explicit FullyConnectedLayer(const LayerConfig& config)
+          : Layer(config) {}
+      ~FullyConnectedLayer() {}
+
+      bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+      Weight& getWeight(int idx) { return *weights_[idx]; }
+
+      void prefetch();
+      void forward(PassType passType);
+      void backward(const UpdateCallback& callback = nullptr);
+    };
+    }  // namespace paddle
+
+头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象，它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。
+
+- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。每个权重对应一个输入。
+- :code:`biases_` 是存有偏置向量的权重。
+
+全连接层没有网络层配置的超参数。如果一个网络层需要配置的话，通常的做法是将配置存于 :code:`LayerConfig& config` 中，并在类构建函数中把它放入一个类成员变量里。
+
+下面的代码片段实现了 :code:`init` 函数。
+
+- 首先，所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。
+- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。
+- 最后，初始化偏置向量。
+
+
+.. code-block:: c++
+
+    bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+      /* 初始化父类 */
+      Layer::init(layerMap, parameterMap);
+
+      /* 初始化权重表 */
+      CHECK(inputLayers_.size() == parameters_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        // 获得参数尺寸
+        size_t height = inputLayers_[i]->getSize();
+        size_t width = getSize();
+
+        // 新建一个权重
+        if (parameters_[i]->isSparse()) {
+          CHECK_LE(parameters_[i]->getSize(), width * height);
+        } else {
+          CHECK_EQ(parameters_[i]->getSize(), width * height);
+        }
+        Weight* w = new Weight(height, width, parameters_[i]);
+
+        // 将新建的权重加入权重表
+        weights_.emplace_back(w);
+      }
+
+      /* 初始化biases_ */
+      if (biasParameter_.get() != NULL) {
+        biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+      }
+
+      return true;
+    }
+
+实现前向传播的部分有下面几个步骤。
+
+- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
+- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小，所以这一步是必要的。 :code:`reserveOutput`  会相应地改变输出的尺寸。为了保证效率，如果需要扩大矩阵，我们会重新分配内存；如果需要缩减矩阵，我们会继续使用现有的内存块。
+- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b` 。 :code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/math/Matrix.h` 和 :code:`paddle/math/BaseMatrix.h` 。
+- 最终，使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::forward(PassType passType) {
+      Layer::forward(passType);
+
+      /* 若有必要，为output_申请内存 */
+      int batchSize = getInput(0).getBatchSize();
+      int size = getSize();
+
+      {
+        // 设置输出的尺寸
+        reserveOutput(batchSize, size);
+      }
+
+      MatrixPtr outV = getOutputValue();
+
+      // 对每个输入乘上变换矩阵
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto input = getInput(i);
+        CHECK(input.value) << "The input of 'fc' layer must be matrix";
+        i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
+               : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+      }
+
+      /* 加上偏置向量 */
+      if (biases_.get() != NULL) {
+        outV->addBias(*(biases_->getW()), 1);
+      }
+
+      /* 激活 */ {
+        forwardActivation();
+      }
+    }
+
+实现后向传播的部分有下面几个步骤。
+
+- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度，调用该函数后，梯度会就地（不使用额外空间）乘上输出的梯度。
+- 计算偏置的梯度。注意，我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后，\ **必须**\ 要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。
+- 最后，计算转换矩阵和输入的梯度，并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度，从而可以做一些与计算重叠的工作（例如，网络通信）。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+      /* 对激活求导 */ {
+        backwardActivation();
+      }
+
+      if (biases_ && biases_->getWGrad()) {
+        biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+        biases_->getParameterPtr()->incUpdate(callback);
+      }
+
+      bool syncFlag = hl_get_sync_flag();
+
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        /* 计算当前层权重的梯度 */
+        if (weights_[i]->getWGrad()) {
+          MatrixPtr input_T = getInputValue(i)->getTranspose();
+          MatrixPtr oGrad = getOutputGrad();
+          {
+            weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+          }
+        }
+
+
+        /* 计算输入层的偏差 */
+        MatrixPtr preGrad = getInputGrad(i);
+        if (NULL != preGrad) {
+          MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+          preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+        }
+
+        {
+          weights_[i]->getParameterPtr()->incUpdate(callback);
+        }
+      }
+    }
+
+:code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时，完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时，该批次的输入中仅有一个子集是非零的。因此，该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。
+
+大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::prefetch() {
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto* sparseParam =
+            dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+        if (sparseParam) {
+          MatrixPtr input = getInputValue(i);
+          sparseParam->addRows(input);
+        }
+      }
+    }
+
+最后，使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符， :code:`FullyConnectedLayer` 是该层的类名。
+
+.. code-block:: c++
+
+    namespace paddle {
+    REGISTER_LAYER(fc, FullyConnectedLayer);
+    }
+
+若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下，其会自动被加入编译列表。
+
+
+写梯度检查单元测试
+===============================
+
+写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ，然后观察到输出的变化为 :math:`\Delta y` ，那么，梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后，再用这个梯度去和 :code:`backward` 函数得到的梯度去对比，以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算，并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
+
+所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
+
++ 生成网络层配置。网络层配置包含以下几项：
+   - 偏置参数的大小。（例子中是4096）
+   - 层的类型。（例子中是fc）
+   - 层的大小。（例子中是4096）
+   - 激活的类型。（例子中是softmax）
+   - dropout的比例。（例子中是0.1）
++ 配置网络层的输入。在这个例子里，我们仅有一个输入。
+   - 输入的类型（ :code:`INPUT_DATA` ），可以是以下几种：
+       - :code:`INPUT_DATA` ：稠密向量。
+       - :code:`INPUT_LABEL` ：整数。
+       - :code:`INPUT_DATA_TARGET` ：稠密向量，但不用于计算梯度。
+       - :code:`INPUT_SEQUENCE_DATA` ：含有序列信息的稠密向量。
+       - :code:`INPUT_HASSUB_SEQUENCE_DATA` ：含有序列信息和子序列信息的稠密向量。
+       - :code:`INPUT_SEQUENCE_LABEL` ：含有序列信息的整数。
+       - :code:`INPUT_SPARSE_NON_VALUE_DATA` ：0-1稀疏数据。
+       - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` ：浮点稀疏数据。
+   - 输入的名字。（例子中是 :code:`layer_0` ）
+   - 输入的大小。（例子中是8192）
+   - 非零数字的个数，仅对稀疏数据有效。
+   - 稀疏数据的格式，仅对稀疏数据有效。
++ 对每个输入，都需要调用一次 :code:`config.layerConfig.add_inputs();` 。
++ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。
+   - 层和输入的配置。（例子中是 :code:`config` ）
+   - 网络层的类型。（例子中是 :code:`fc` ）
+   - 梯度检查的输入数据的批次大小。（例子中是100）
+   - 输入是否是转置的。大多数层需要设置为 :code:`false` 。（例子中是 :code:`false` ）
+   - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如，softmax激活的输出的和总是1。在这种情况下，我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和（非常数）来计算梯度。（例子中是 :code:`true` ，因为全连接层的激活可以是softmax）
+
+.. code-block:: c++
+
+    void testFcLayer(string format, size_t nnz) {
+      // Create layer configuration.
+      TestConfig config;
+      config.biasSize = 4096;
+      config.layerConfig.set_type("fc");
+      config.layerConfig.set_size(4096);
+      config.layerConfig.set_active_type("softmax");
+      config.layerConfig.set_drop_rate(0.1);
+      // Setup inputs.
+      config.inputDefs.push_back(
+          {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+        config.layerConfig.add_inputs();
+      LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+                << config.inputDefs[0].sparse.format;
+      for (auto useGpu : {false, true}) {
+        testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                      /* weight */ true);
+      }
+    }
+
+如果你要为了测试而增加新的文件，例如 :code:`paddle/gserver/tests/test_FCGrad.cpp` ，你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 :code:`ON` 。
+
+.. code-block:: cmake
+
+    add_unittest_without_exec(test_FCGrad
+        test_FCGrad.cpp
+        LayerGradUtil.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_FCGrad
+        COMMAND test_FCGrad)
+
+
+实现python封装
+========================
+
+python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步：
+
+- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。
+- 实现构造函数 :code:`__init__` 。
+	- 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作，这些名字必须要写对。
+	- 之后，计算变换矩阵的大小和格式（是否稀疏）。
+
+.. code-block:: python
+
+    @config_layer('fc')
+    class FCLayer(LayerBase):
+        def __init__(
+                self,
+                name,
+                size,
+                inputs,
+                bias=True,
+                **xargs):
+            super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+            for input_index in xrange(len(self.inputs)):
+                input_layer = self.get_input_layer(input_index)
+                psize = self.config.size * input_layer.size
+                dims = [input_layer.size, self.config.size]
+                format = self.inputs[input_index].format
+                sparse = format == "csr" or format == "csc"
+                if sparse:
+                    psize = self.inputs[input_index].nnz
+                self.create_input_parameter(input_index, psize, dims, sparse, format)
+            self.create_bias_parameter(bias, self.config.size)
+
+在网络配置中，网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括：
+
+- :code:`name` 是网络层实例的名字标识符。
+- :code:`type` 是网络层的类型，通过网络层的标识符来指定。
+- :code:`size` 是网络层输出的大小。
+- :code:`bias` 表明这个层的一个实例是否需要偏置。
+- :code:`inputs` 说明这个层的输入，输入是由一个list中的网络层实例的名字组成的。
+
+.. code-block:: python
+
+    Layer(
+        name = "fc1",
+        type = "fc",
+        size = 64,
+        bias = True,
+        inputs = [Input("pool3")]
+    )
+
+我们建议你为你的Python封装实现一个“助手”，使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 0513f068f39ad0d931b03d066a0083a1a8a33b79..46481f5ead33dc6a26507e021fd9ae0f8316e940 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -209,7 +209,6 @@ The implementation of the backward part has the following steps.
       if (biases_ && biases_->getWGrad()) {
         biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
 
-        /* Increasing the number of gradient */
         biases_->getParameterPtr()->incUpdate(callback);
       }
 
@@ -297,7 +296,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
 + each inputs needs to call :code:`config.layerConfig.add_inputs();` once.
 + call :code:`testLayerGrad` to perform gradient checks. It has the following arguments.
    - layer and input configurations. (:code:`config` in our example)
-   - type of the input. (:code:`fc` in our example)
+   - type of the layer. (:code:`fc` in our example)
    - batch size of the gradient check. (100 in our example)
    - whether the input is transpose. Most layers need to set it to :code:`false`. (:code:`false` in our example)
    - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is utilized to compute the gradients. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
@@ -310,7 +309,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
       config.biasSize = 4096;
       config.layerConfig.set_type("fc");
       config.layerConfig.set_size(4096);
-      config.layerConfig.set_active_type("sigmoid");
+      config.layerConfig.set_active_type("softmax");
       config.layerConfig.set_drop_rate(0.1);
       // Setup inputs.
       config.inputDefs.push_back(
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 6a14ce8ae75c3dd372184ea6ea9f6034a3dbf919..5b84eea491f874459ed2071e4c942657cdc9b18b 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -7,10 +7,12 @@
 ..  toctree::
   :maxdepth: 1
 
+  usage/cmd_parameter/index_cn.rst
   usage/concepts/use_concepts_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/cluster/k8s/k8s_cn.md
-  usage/cluster/k8s/k8s_distributed_cn.md
+  usage/k8s/k8s_basis_cn.md
+  usage/k8s/k8s_cn.md
+  usage/k8s/k8s_distributed_cn.md
 
 开发标准
 --------
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 983dc743eb453a0210bc5fb3c7e4525fa838d428..1fbfcd260b912078f00ed5b720ed607db725c4e2 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,8 +7,10 @@ Usage
 ..  toctree::
   :maxdepth: 1
 
-  usage/cmd_parameter/index_en.md
+  usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
+  usage/k8s/k8s_en.md
+  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index acdcfa1c0047ced85c0a9c53d691edc0b4489336..274452fbf0c595ad7b4dbeffe85ad9038f12b458 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -6,7 +6,7 @@
 
 在本文中，我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。
 
-在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) ）的用户参考。
+在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统（如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) ）的用户参考。
 
 ## 前提条件
 
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 30963dcd927250651f3ed0b39949f541cc28ed4a..c60876721cbf5565d6e48c8061811aacada748cd 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -2,7 +2,7 @@
 
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
-[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s).
+[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH.  They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
 
 ## Prerequisite
 
diff --git a/doc/howto/usage/cluster/k8s/job.yaml b/doc/howto/usage/cluster/k8s/job.yaml
deleted file mode 100644
index 488aad0bede4f940b25c7be04259f209c3de9f52..0000000000000000000000000000000000000000
--- a/doc/howto/usage/cluster/k8s/job.yaml
+++ /dev/null
@@ -1,43 +0,0 @@
-apiVersion: batch/v1
-kind: Job
-metadata:
-  name: paddle-cluster-job
-spec:
-  parallelism: 3
-  completions: 3
-  template:
-    metadata:
-      name: paddle-cluster-job
-    spec:
-      volumes:
-      - name: jobpath
-        hostPath: 
-          path: /home/work/paddle_output              
-      containers:
-      - name: trainer
-        image: registry.baidu.com/public/paddle:mypaddle
-        command: ["bin/bash",  "-c", "/root/start.sh"]        
-        env:
-        - name: JOB_NAME
-          value: paddle-cluster-job
-        - name: JOB_PATH
-          value: /home/jobpath     
-        - name: JOB_NAMESPACE
-          value: default         
-        - name: TRAIN_CONFIG_DIR
-          value: recommendation
-        - name: CONF_PADDLE_NIC
-          value: eth0  
-        - name: CONF_PADDLE_PORT
-          value: "7164"
-        - name: CONF_PADDLE_PORTS_NUM
-          value: "2"     
-        - name: CONF_PADDLE_PORTS_NUM_SPARSE
-          value: "2"  
-        - name: CONF_PADDLE_GRADIENT_NUM
-          value: "3"                                                               
-        volumeMounts:
-        - name: jobpath
-          mountPath: /home/jobpath       
-      restartPolicy: Never
-    
diff --git a/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
deleted file mode 100644
index a8c64550b1fa7f41de1eaa9a037c65cddc0cd30e..0000000000000000000000000000000000000000
Binary files a/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png and /dev/null differ
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2e2a2fcc54a09f4f41e4ebbc317e1409591ddd9c
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -0,0 +1,404 @@
+# 参数概述
+
+虽然Paddle看起来包含了众多参数，但是大部分参数是为开发者提供的，或者已经在集群提交环境中自动设置，因此用户并不需要关心它们。在此，根据这些参数的使用场合，我们将它们划分为不同的类别。例如，`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中，而有些参数需要在集群多机训练中使用等。
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th scope="col" class="left"></th>
+<th scope="col" class="left">参数</th>
+<th scope="col" class="left">本地训练</th>
+<th scope="col" class="left">集群训练</th>
+<th scope="col" class="left">本地测试</th>
+<th scope="col" class="left">集群测试</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left" rowspan="9">通用</td>
+<td class="left">job</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">use_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">local</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config_args</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">num_passes</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">trainer_count</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">version</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">show_layer_stat</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan="14">训练</td><td class="left">dot_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_parameter_stats_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">init_model_path</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">load_missing_parameter_strategy</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period_by_batches</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">use_old_updater</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">enable_grad_share</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">grad_share_block_num</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_error_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">save_only_one</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">start_pass</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">训练/测试</td><td class="left">save_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">训练过程中测试</td><td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">average_test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "5">测试</td><td class="left">model_list</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_wait</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_pass</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">predict_output_dir</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">distribute_test</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">Auc/正负对验证(PnpairValidation)</td><td class="left">predict_file</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">parallel_nn</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">allow_only_one_model_on_one_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cuda_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_conv_workspace_limit_in_mb</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "4">递归神经网络(RNN)</td>
+<td class="left">beam_size</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rnn_use_batch</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">prev_batch_state</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">diy_beam_search_prob_so</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">data_server_port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">pservers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">nics</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rdma_tcp</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">small_messages</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">loadsave_parameters_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">log_period_server</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">pserver_num_threads</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_send_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_recv_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">num_gradient_servers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "3">异步随机梯度下降(Async SGD)</td><td class="left">async_count</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_min</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_default</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "8">性能调优(Performance Tuning)</td><td class="left">log_barrier_abstract</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_lowest_nodes</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_show_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_batches</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_ratio</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_unbalance_degree</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_check_sparse_distribution_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">数据提供器(Data Provider)</td><td class="left">memory_threshold_on_load_data</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">随机数</td><td class="left">seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">thread_local_rand_use_global_seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">单元测试</td><td class="left">checkgrad_eps</td>
+<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">矩阵/向量</td><td class="left">enable_parallel_vector</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+</tbody>
+
+</table>
+</html>
diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md
index 013edbc9047817d7f6b82c4d5188412bd2ce41d6..e5546f0ddc78a9f8bdc306a19c2fe9a415463e5a 100644
--- a/doc/howto/usage/cmd_parameter/arguments_en.md
+++ b/doc/howto/usage/cmd_parameter/arguments_en.md
@@ -127,11 +127,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
 </tr>
 
-<tr>
-<td class="left">allow_inefficient_sparse_update</td>
-<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
-</tr>
-
 <tr>
 <td class="left">start_pass</td>
 <td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3b573a324d541b024600a254d5266e517db229c5
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -0,0 +1,332 @@
+# 细节描述
+
+## 通用
+
+* `--job`
+  - 工作模式，包括: **train, test, checkgrad**，其中checkgrad主要为开发者使用，使用者不需要关心。
+  - 类型: string (默认: train)
+
+* `--config`
+  - 用于指定网络配置文件。
+  - 类型: string (默认: null).
+
+* `--use_gpu`
+  - 训练过程是否使用GPU，设置为true使用GPU模式，否则使用CPU模式。
+  - 类型: bool (默认: 1).
+
+* `--local`
+  - 训练过程是否为本地模式，设置为true使用本地训练或者使用集群上的一个节点，否则使用多机训练。
+  - 类型: bool (默认: 1).
+
+* `--trainer_count`
+  - 指定一台机器上使用的线程数。例如，trainer_count = 4, 意思是在GPU模式下使用4个GPU，或者在CPU模式下使用4个线程。每个线程（或GPU）分配到当前数据块样本数的四分之一。也就是说，如果在训练配置中设置batch_size为512，每个线程分配到128个样本用于训练。
+  - 类型: int32 (默认: 1).
+
+* `--num_passes`
+  - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时，意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。
+  - 类型: int32 (默认: 100).
+
+* `--config_args`
+  - 传递给配置文件的参数。格式: key1=value1,key2=value2.
+  - 类型: string (默认: null).
+
+* `--version`
+  - 是否打印版本信息。
+  - 类型: bool (默认: 0).
+
+* `--show_layer_stat`
+  - 是否显示**每个批次数据**中每层的数值统计.
+  - 类型: bool (默认: 0).
+
+## 训练
+
+* `--log_period`
+  - 每log_period个批次打印日志进度.
+  - 类型: int32 (默认: 100).
+
+* `--dot_period`
+  - 每dot_period个批次输出符号'.'.
+  - 类型: int32 (默认: 1).
+
+* `--saving_period`
+  - 每saving_period轮保存训练参数.
+  - 类型: int32 (默认: 1).
+
+* `--save_dir`
+  - 保存模型参数的目录，需要明确指定，但不需要提前创建。
+  - 类型: string (默认: null).
+
+* `--start_pass`
+  - 从start_pass轮开始训练，会加载上一轮的参数。
+  - 类型: int32 (默认: 0).
+
+* `--show_parameter_stats_period`
+  - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。
+  - 类型: int32 (默认: 0).
+
+* `--save_only_one`
+  - 只保存最后一轮的参数，而之前的参数将会被删除。
+  - 类型: bool (默认: 0).
+
+* `--load_missing_parameter_strategy`
+  - 当模型参数不存在时，指定加载的方式。目前支持fail/rand/zero三种操作.
+    - `fail`: 程序直接退出.
+    - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数.
+    - `zero`: 所有参数置为零.
+  - 类型: string (默认: fail).
+
+* `--init_model_path`
+   - 初始化模型的路径。如果设置该参数，start\_pass将不起作用。同样也可以在测试模式中指定模型路径。
+   - 类型: string (默认: null).
+
+* `--saving_period_by_batches`
+   - 在一轮中每saving_period_by_batches个批次保存一次参数。
+   - 类型: int32 (默认: 0).
+
+* `--log_error_clipping`
+  - 当在网络层配置中设置**error_clipping_threshold**时，该参数指示是否打印错误截断日志。如果为true，**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--log_clipping`
+  - 当在训练配置中设置**gradient_clipping_threshold**时，该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--use_old_updater`
+  - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater，主要为开发者使用，使用者通常无需关心.
+  - 类型: bool (默认: 0).
+
+* `--enable_grad_share`
+  - 启用梯度参数的阈值，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 100 \* 1024 \* 1024).
+
+* `--grad_share_block_num`
+  - 梯度参数的分块数目，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 64).
+
+## 测试
+
+* `--test_pass`
+  - 加载test_pass轮的模型用于测试.
+  - 类型: int32 (默认: -1).
+
+* `--test_period`
+  - 如果为0，每轮结束时对所有测试数据进行测试；如果不为0，每test_period个批次对所有测试数据进行测试.
+  - 类型: int32 (默认: 0).
+
+* `--test_wait`
+  - 指示当指定轮的测试模型不存在时，是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试，可以使用该参数.
+  - 类型: bool (默认: 0).
+
+* `--model_list`
+  - 测试时指定的存储模型列表的文件.
+  - 类型: string (默认: "", null).
+
+* `--predict_output_dir`
+  - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定，默认为null，意思是不保存结果。在测试阶段，如果你想要保存某些层的特征图，请指定该目录。需要注意的是，网络层的输出是经过激活函数之后的值.
+  - 类型: string (默认: "", null).
+
+* `--average_test_period`
+  - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除，默认为0，意思是不使用平均参数执行测试.
+  - 类型: int32 (默认: 0).
+
+* `--distribute_test`
+  - 在分布式环境中测试，将多台机器的测试结果合并.
+  - 类型: bool (默认: 0).
+
+* `--predict_file`
+  - 保存预测结果的文件名。该参数默认为null，意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层，每轮都会保存预测结果.
+  - 类型: string (默认: "", null).
+
+## GPU
+
+* `--gpu_id`
+  - 指示使用哪个GPU核.
+  - 类型: int32 (默认: 0).
+
+* `--allow_only_one_model_on_one_gpu`
+  - 如果为true，一个GPU设备上不允许配置多个模型.
+  - 类型: bool (默认: 1).
+
+* `--parallel_nn`
+  - 指示是否使用多线程来计算一个神经网络。如果为false，设置gpu_id指定使用哪个GPU核（训练配置中的设备属性将会无效）。如果为true，GPU核在训练配置中指定（gpu_id无效）.
+  - 类型: bool (默认: 0).
+
+* `--cudnn_dir`
+  - 选择路径来动态加载NVIDIA CuDNN库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cuda_dir`
+  - 选择路径来动态加载NVIDIA CUDA库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cudnn_conv_workspace_limit_in_mb`
+  - 指定cuDNN的最大工作空间容限，单位是MB，默认为4096MB=4GB. 
+  - 类型: int32 (默认: 4096MB=4GB)
+
+## 自然语言处理(NLP): RNN/LSTM/GRU
+* `--rnn_use_batch`
+  - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法.
+  - 类型: bool (默认: 0).
+
+* `--prev_batch_state`
+  - 标识是否为连续的batch计算.
+  - 类型: bool (默认: 0).
+
+* `--beam_size`
+  - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上，都会产生当前层状态的所有继承结果，按启发式损失的大小递增排序。然而，每层上只能保存固定数目个最好的状态，该数目是提前定义好的，称之为集束大小.
+  - 类型: int32 (默认: 1).
+
+* `--diy_beam_search_prob_so`
+  - 用户可以自定义beam search的方法，编译成动态库，供PaddlePaddle加载。 该参数用于指定动态库路径.
+  - 类型: string (默认: "", null).
+
+## 度量学习(Metric Learning)
+* `--external`
+   - 指示是否使用外部机器进行度量学习.
+   - 类型: bool (默认: 0).
+
+* `--data_server_port`
+  - 数据服务器(data server)的监听端口，主要用在度量学习中.
+  - 类型: int32 (默认: 21134).
+
+## 数据支持(DataProvider)
+
+* `--memory_threshold_on_load_data`
+  - 内存容限阈值，当超过该阈值时，停止加载数据.
+  - 类型: double (默认: 1.0).
+
+## 单元测试
+
+* `--checkgrad_eps`
+  - 使用checkgrad模式时的参数变化大小.
+  - 类型: double (默认: 1e-05).
+
+## 参数服务器和分布式通信
+
+* `--start_pserver`
+  - 指示是否开启参数服务器(parameter server).
+  - 类型: bool (默认: 0).
+
+* `--pservers`
+  - 参数服务器的IP地址，以逗号间隔.
+  - 类型: string (默认: "127.0.0.1").
+
+* `--port`
+  - 参数服务器的监听端口.
+  - 类型: int32 (默认: 20134).
+
+* `--ports_num`
+  - 发送参数的端口号，根据默认端口号递增.
+  - 类型: int32 (默认: 1).
+
+* `--trainer_id`
+  - 在分布式训练中，每个训练节点必须指定一个唯一的id号，从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数.
+  - 类型: int32 (默认: 0).
+
+* `--num_gradient_servers`
+  - 梯度服务器的数量，该参数在集群提交环境中自动设置.
+  - 类型: int32 (默认: 1).
+
+* `--small_messages`
+  - 如果消息数据太小，建议将该参数设为true，启动快速应答，无延迟.
+  - 类型: bool (默认: 0).
+
+* `--sock_send_buf_size`
+  - 限制套接字发送缓冲区的大小。如果仔细设置的话，可以有效减小网络的阻塞.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--sock_recv_buf_size`
+  - 限制套接字接收缓冲区的大小.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - 参数服务器的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--parameter_block_size_for_sparse`
+  - 参数服务器稀疏更新的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--log_period_server`
+  - 在参数服务器终端每log_period_server个批次打印日志进度.
+  - 类型: int32 (默认: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - 在参数服务器上加载和保存参数，只有当设置了sparse_remote_update参数时才有效.
+  - 类型: bool (默认: 0).
+
+* `--pserver_num_threads`
+  - 同步执行操作的线程数.
+  - 类型: int32 (默认: 1).
+
+* `--ports_num_for_sparse`
+  - 发送参数的端口号，根据默认值递增(port + ports_num)，用于稀疏训练中.
+  - 类型: int32 (默认: 0).
+
+* `--nics`
+  - 参数服务器的网络设备名称，已经在集群提交环境中完成设置.
+  - 类型: string (默认: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - 使用rdma还是tcp传输协议，该参数已经在集群提交环境中完成设置.
+  - 类型: string (默认: "tcp").
+
+## 异步随机梯度下降(Async SGD)
+* `--async_count`
+  - 定义异步训练的长度，如果为0，则使用同步训练.
+  - 类型: int32 (默认: 0).
+
+* `--async_lagged_ratio_min`
+  - 控制`config_.async_lagged_grad_discard_ratio()`的最小值.
+  - 类型: double (默认: 1.0).
+
+* `--async_lagged_ratio_default`
+  - 如果在网络配置中未设置async_lagged_grad_discard_ratio，则使用该参数作为默认值.
+  - 类型: double (默认: 1.5).
+
+## 性能调优(Performance Tuning)
+
+* `--log_barrier_abstract`
+  - 如果为true，则显示阻隔性能的摘要信息.
+  - 类型: bool (默认: 1).
+
+* `--log_barrier_show_log`
+  - 如果为true，则总会显示阻隔摘要信息，即使间隔很小.
+  - 类型: bool (默认: 0).
+
+* `--log_barrier_lowest_nodes`
+  - 最少显示多少个节点.
+  - 类型: int32 (默认: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - 指示是否检查所有参数服务器上的稀疏参数的分布是否均匀.
+  - 类型: bool (默认: 0).
+
+* `--show_check_sparse_distribution_log`
+  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
+  - 类型: bool (默认: 0).
+
+* `--check_sparse_distribution_batches`
+  - 每运行多少个批次执行一次稀疏参数分布的检查.
+  - 类型: int32 (默认: 100).
+
+* `--check_sparse_distribution_ratio`
+  - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio *  check_sparse_distribution_batches次，程序停止.
+  - 类型: double (默认: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - 不同参数服务器上数据大小的最大值与最小值的比率.
+  - 类型: double (默认: 2).
+
+## 矩阵/向量/随机数
+* `--enable_parallel_vector`
+  - 启动并行向量的阈值.
+  - 类型: int32 (默认: 0).
+
+* `--seed`
+  - 随机数的种子。如果为0，则使用srand(time).
+  - 类型: int32 (默认: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - 是否将全局种子应用于本地线程的随机数.
+  - 类型: bool (默认: 0).
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 27b2faf1d8a9367ff9498a76d363791ab7fbe61c..33b7ec0d51a96ee126197e7aa819fdae0d3dc353 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -73,7 +73,7 @@
   - type: bool (default: 0).
 
 * `--load_missing_parameter_strategy`
-  - Specify the loading operation when model file is missing. Now support fail/rand/zere three operations.
+  - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
     - `fail`: program will exit.
     - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
     - `zero`: all parameters are zero.
@@ -118,11 +118,11 @@
   - type: int32 (default: 0).
 
 * `--test_wait`
-  - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
+  - Whether to wait for parameter per pass if not exist. It can be used when users launch another process to perform testing during the training process.
   - type: bool (default: 0).
 
 * `--model_list`
-  - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
+  - File that saves the model list when testing. 
   - type: string (default: "", null).
 
 * `--predict_output_dir`
@@ -212,7 +212,7 @@
   - type: bool (default: 0).
 
 * `--pservers`
-  - Comma separated IP addresses of pservers. It is set automatically in cluster submitting environment.
+  - Comma separated IP addresses of pservers.
   - type: string (default: "127.0.0.1").
 
 * `--port`
@@ -310,10 +310,6 @@
   - show log details for sparse parameter distribution in pserver.
   - type: bool (default: 0).
 
-* `--allow_inefficient_sparse_update`
-  - Whether to allow inefficient sparse update.
-  - type: bool (default: 0).
-
 * `--check_sparse_distribution_batches`
   - Running sparse parameter distribution check every so many batches.
   - type: int32 (default: 100).
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4c8729821110b9aec99351fc0a83a1ba75a8a2bb
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+设置命令行参数
+===============
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+  arguments_cn.md
+  detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md
deleted file mode 100644
index 2a96e7e976c43fd69befccd78753cee431ef61bc..0000000000000000000000000000000000000000
--- a/doc/howto/usage/cmd_parameter/index_en.md
+++ /dev/null
@@ -1,8 +0,0 @@
-```eval_rst
-..  _cmd_line_index:
-```
-# Set Command-line Parameters
-
-* [Use Case](use_case_en.md)
-* [Arguments](arguments_en.md)
-* [Detailed Descriptions](detail_introduction_en.md)
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/usage/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0e3c72d27aca063f1b6f1c23e55718dba373c40a
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+  arguments_en.md
+  detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/usage/cmd_parameter/use_case_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..db8c39d950771726346ff9c9481990abc13036cf
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/use_case_cn.md
@@ -0,0 +1,182 @@
+# 使用案例
+
+## 本地训练
+
+本地训练的实验，诸如图像分类，自然语言处理等，通常都会使用下面这些命令行参数。
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(默认为1)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(默认为1)
+  --test_period=M \                      #(默认为0) 
+  --num_passes=N \                       #(默认为100)
+  --log_period=K \                       #(默认为100)
+  --dot_period=1000 \                    #(默认为1)
+  #[--show_parameter_stats_period=100] \ #(默认为0)
+  #[--saving_period_by_batches=200] \    #(默认为0)
+```
+根据你的任务，可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。
+
+### 1) 将命令参数传给网络配置
+
+`config_args`是一个很有用的参数，用于将参数传递给网络配置。
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+`get_config_arg`可用于在网络配置中解析这些参数，如下所示：
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: `--config_args`中指定的名字
+- type: 值类型，包括bool, int, str, float等
+- default_value: 默认值
+
+### 2) 使用模型初始化网络
+
+增加如下参数：
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## 本地测试
+
+方法一：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --init_model_path=model_path \
+```
+- 使用init\_model\_path指定测试的模型
+- 只能测试单个模型
+
+方法二：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --model_list=model.list \
+```
+- 使用model_list指定测试的模型列表
+- 可以测试多个模型，文件model.list如下所示：
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+方法三：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+这种方式必须使用Paddle存储的模型路径格式，如：`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如，M=12，N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。
+
+## 稀疏训练
+
+当输入是维度很高的稀疏数据时，通常使用稀疏训练来加速计算过程。例如，输入数据的字典维数是1百万，但是每个样本仅包含几个词。在Paddle中，稀疏矩阵的乘积应用于前向传播过程，而稀疏更新在反向传播之后的权重更新时进行。
+
+### 1) 本地训练
+
+用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+### 2) 集群训练
+
+在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+```
+--ports_num_for_sparse=1    #(默认为0)
+```
+
+## parallel_nn
+用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说，你可以将网络配置成某些层使用GPU计算，而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算，这样可以减小GPU内存，或者采用并行计算来加速某些层的更新。
+
+如果你想使用这些特性，你需要在网络配置中指定设备的ID号(表示为deviceId)，并且加上下面的命令行参数:
+
+```
+--parallel_nn=true
+```
+### 案例一：GPU和CPU混合使用
+请看下面的例子：
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外，其他所有层都会使用GPU计算，每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此，fc1和fc2层在GPU上计算。
+
+- device=-1: fc3层使用CPU计算。
+
+- trainer_count:
+  - trainer_count=1: 如果未设置gpu\_id，那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
+
+  - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如，trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
+
+### 案例二：在不同设备上指定层
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+在本例中，我们假设一台机器上有4个GPU。
+
+- trainer_count=1:
+  - 使用0号GPU计算fc2层。
+  - 使用1号GPU计算fc3层。
+  - 使用CPU计算fc4层。
+
+- trainer_count=2:
+  - 使用0号和1号GPU计算fc2层。
+  - 使用2号和3号GPU计算fc3层。
+  - 使用CPU两线程计算fc4层。
+
+- trainer_count=4:
+  - 运行失败（注意到我们已经假设机器上有4个GPU），因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
+
+**当`device!=-1`时设备ID号的分配：**
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             在层中指定
+gpu_id:               默认为0
+threadId:             线程ID号，范围: 0,1,..., trainer_count-1
+numDevices_:          机器的设备(GPU)数目
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/usage/cmd_parameter/use_case_en.md
index 4d7bb33f36fe258ee24796eedc9296065923e58f..e287f0c4b9617cbc6504596512bf408c56dc10f9 100644
--- a/doc/howto/usage/cmd_parameter/use_case_en.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
@@ -134,14 +134,14 @@ fc2=fc_layer(...)
 fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
 
 ```
-- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer l1 and l2 are computed on the GPU.
+- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.
 
-- device=-1: use the CPU for layer l3.
+- device=-1: use the CPU for layer fc3.
 
 - trainer_count:
-  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers l1 and l2. Otherwise use the GPU with gpu\_id.
+  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.
 
-  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer l1 and l2.
+  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.
 
 ### Case 2: Specify Layers in Different Devices
 
@@ -157,14 +157,14 @@ fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
 In this case, we assume that there are 4 GPUs in one machine.
 
 - trainer_count=1:
-  - Use GPU 0 to compute layer l2.
-  - Use GPU 1 to compute layer l3.
-  - Use CPU to compute layer l4.
+  - Use GPU 0 to compute layer fc2.
+  - Use GPU 1 to compute layer fc3.
+  - Use CPU to compute layer fc4.
 
 - trainer_count=2:
-  - Use GPU 0 and 1 to compute layer l2.
-  - Use GPU 2 and 3 to compute layer l3.
-  - Use CPU to compute l4 in two threads.
+  - Use GPU 0 and 1 to compute layer fc2.
+  - Use GPU 2 and 3 to compute layer fc3.
+  - Use CPU to compute fc4 in two threads.
 
 - trainer_count=4:
   - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..ce72b0803818d5bf0c18753c421848cf2fc1b668
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -0,0 +1,689 @@
+
+# Distributed PaddlePaddle Training on AWS with Kubernetes
+
+We will show you step by step on how to run distributed PaddlePaddle training on AWS cluster with Kubernetes. Let's start from core concepts.
+
+## Distributed PaddlePaddle Training Core Concepts
+
+### Distributed Training Job
+
+A distributed training job is represented by a [Kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
+
+Each Kubernetes job is described by a job config file, which specifies the information like the number of [pods](https://kubernetes.io/docs/user-guide/pods/#what-is-a-pod) in the job and environment variables.
+
+In a distributed training job, we would:
+
+1. prepare partitioned training data and configuration file on a distributed file system (in this tutorial we use Amazon Elastic File System), and
+1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
+
+### Parameter Servers and Trainers
+
+There are two roles in a PaddlePaddle cluster: *parameter server (pserver)* and *trainer*. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
+
+<center>![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)</center>
+
+In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be scheduled onto another node with a different ip address. However, now we are using static ip. This will be improved.
+
+Parameter server and trainer are packaged into the same docker image. They will run once the pod is scheduled by the kubernetes job.
+
+### Trainer ID
+
+Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
+
+### Training
+
+The entry-point of a container is a shell script. It can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
+
+We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
+
+1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
+1. Copy the training data from EFS persistent volume into container.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. Trainer with `trainer_id` 0 will automatically write results onto EFS volume.
+
+
+## PaddlePaddle on AWS with Kubernetes
+
+### Choose AWS Service Region
+This tutorial requires several AWS services work in the same region. Before we create anything in AWS, please check the following link
+https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/
+Choose a region which has the following services available: EC2, EFS, CloudFormation, KMS, VPC, S3.
+In this tutorial, we use "Oregon(us-west-2)" as example.
+
+### Create AWS Account and IAM Account
+
+Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
+
+To sign up an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
+To create IAM users and user groups under an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
+
+Please be aware that this tutorial needs the following privileges for the user in IAM:
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+- AWSKeyManagementServicePowerUser
+
+
+### Download kube-aws and kubectl
+
+#### kube-aws
+
+[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
+##### Verify kube-aws integrity
+Note: if you are using a non-official release (e.g., an RC release) of kube-aws, you can skip this step.
+Import the CoreOS Application Signing Public Key:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+Validate the key fingerprint:
+
+```
+gpg2 --fingerprint FC8A365E
+```
+The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1
+
+Validate the tarball's GPG signature:
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+##### Install kube-aws
+Extract the binary:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+Add kube-aws to your path:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
+
+Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
+
+```
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
+
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
+```
+
+Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
+
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
+
+### Configure AWS Credentials
+
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
+
+And then configure your AWS account information:
+
+```
+aws configure
+```
+
+
+Fill in the required fields:
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secret Access Key: YOUR_SECRET_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+
+`YOUR_ACCESS_KEY_ID` and `YOUR_SECRET_ACCESS_KEY` are the IAM key and secret from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account).
+
+Verify that your credentials work by describing any instances you may already have running on your account:
+
+```
+aws ec2 describe-instances
+```
+
+### Define Cluster Parameters
+
+#### EC2 key pair
+
+The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
+
+Follow [EC2 Keypair User Guide](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair.
+
+After creating a key pair, you will use the key pair name to configure the cluster.
+
+Key pairs are only available to EC2 instances in the same region. We are using us-west-2 in our tutorial, so make sure to create key pairs in that region (Oregon).
+
+Your browser will download a `key-name.pem` file which is the key to access the EC2 instances. We will use it later.
+
+
+#### KMS key
+
+Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
+
+You can create a KMS key with the aws command line tool:
+
+```
+aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+We will need to use the value of `Arn` later.
+
+And then let's add several inline policies in your IAM user permission.
+
+Go to [IAM Console](https://console.aws.amazon.com/iam/home?region=us-west-2#/home). Click on button `Users`, click user that we just created, and then click on `Add inline policy` button, and select `Custom Policy`.
+
+Paste into following inline policies:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
+            ]
+        },
+        {
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+`Version` : Its value has to be exactly "2012-10-17".
+`AWS_ACCOUNT_ID`: You can get it from following command line:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
+
+`MY_CLUSTER_NAME`: Pick a MY_CLUSTER_NAME that you like, you will use it later as well. 
+Please note, stack name must satisfy regular expression pattern: [a-zA-Z][-a-zA-Z0-9*]*, which means the stack name must start with a letter and must not contain "_" (a "-" is allowed after the first character), or kube-aws will throw error in later steps.
+
+#### External DNS name
+
+When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
+
+The DNS name should have a CNAME pointing to the cluster DNS name or an A record pointing to the cluster IP address.
+
+We will need to use DNS name later in tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate cluster IP with that DNS name for your local machine. And add name service (route53) in aws to associate the IP to paddle for cluster. We will find the cluster IP in later steps.
+
+#### S3 bucket
+
+You need to create an S3 bucket before starting up the Kubernetes cluster.
+
+There are some bugs in aws cli in creating S3 bucket, so let's use the [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2).
+
+Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-2 (Oregon).
+
+
+#### Initialize Assets
+
+Create a directory on your local machine to hold the generated assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
+
+```
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
+--region=us-west-2 \
+--availability-zone=us-west-2a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+Here `us-west-2a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
+
+Please check if `us-west-2a` is supported by `aws ec2 --region us-west-2 describe-availability-zones`, if not switch to other supported availability zone. (e.g., `us-west-2b`, or `us-west-2c`)
+
+
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+
+By default `kube-aws` will only create one worker node. Let's edit `cluster.yaml` and change `workerCount` from 1 to 3.
+
+
+#### Render contents of the asset directory
+
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+
+```
+kube-aws render credentials --generate-ca
+```
+
+The next command generates the default set of cluster assets in your asset directory.
+
+```
+kube-aws render stack
+```
+Assets (templates and credentials) that are used to create, update and interact with your Kubernetes cluster will be created under your current folder.
+
+
+### Kubernetes Cluster Start Up
+
+#### Create the instances defined in the CloudFormation template
+
+Now let's create your cluster (choose any `PREFIX` for the command below):
+
+```
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
+```
+
+`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
+
+
+#### Configure DNS
+
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
+
+```
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+```
+
+If you own a DNS name, set the A record to any of the IP addresses found in the [Find IP address](#find-ip-address) step below. __Or__ you can set up a CNAME that points to `Controller DNS Name` (`paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com`)
+
+##### Find IP address
+
+Use command `dig` to check the load balancer hostname to get the ip address.
+
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com
+
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. IN A
+
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-2.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
+
+In the above output, both ip `54.241.164.52`, `54.67.102.112` will work.
+
+*If you own a DNS name*, set the A record to any of the above ip. Then you can skip to the step "Access the cluster".
+
+*If you do not own a DNS name*:
+##### Update local DNS association
+Edit `/etc/hosts` to associate above ip with the DNS name.
+##### Add Route53 private name service in VPC
+ - Open [Route53 Console](https://console.aws.amazon.com/route53/home)
+ - Create hosted zone with following config
+   - Domain name: "paddle"
+   - Type: "Private hosted zone for amazon VPC"
+   - VPC ID: `<Your VPC ID>`
+
+   ![route53 zone setting](src/route53_create_zone.png)
+ - Add A record
+    - Click on the zone "paddle" just created
+    - Click the button "Create record set"
+        - Name : leave blank
+        - type: "A"
+        - Value: `<kube-controller ec2 private ip>`
+
+        ![route53 create recordset](src/route53_create_recordset.png)
+ - Verify name service
+    - Connect to any instance created by kube-aws via ssh
+    - Run command "host paddle", see if the ip returned is the private ip of kube-controller
+
+#### Access the cluster
+
+Once the API server is running, you should see:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes 
+NAME                                       STATUS    AGE
+ip-10-0-0-134.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-238.us-west-2.compute.internal   Ready     6m
+ip-10-0-0-50.us-west-2.compute.internal    Ready     6m
+ip-10-0-0-55.us-west-2.compute.internal    Ready     6m
+```
+
+
+### Setup Elastic File System for Cluster
+
+Training data is usually served on a distributed filesystem, we use Elastic File System (EFS) on AWS.
+
+1. Create security group for EFS in [security group console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId)
+  1. Look up security group id for `paddle-cluster-sg-worker` (`sg-055ee37d` in the image below)
+  <center>![](src/worker_security_group.png)</center>
+  2. Add security group `paddle-efs` with `ALL TCP` inbound rule and custom source as group id of `paddle-cluster-sg-worker`. And VPC of `paddle-cluster-vpc`. Make sure availability zone is same as the one you used in [Initialize Assets](#initialize-assets).
+  <center>![](src/add_security_group.png)</center>
+
+2. Create the Elastic File System in [EFS console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2#/wizard/1) with `paddle-cluster-vpc` VPC. Make sure subnet is `paddle-cluster-Subnet0` and security group is `paddle-efs`.
+<center>![](src/create_efs.png)</center>
+
+
+### Start PaddlePaddle Training Demo on AWS
+
+#### Configure Kubernetes Volume that Points to EFS
+
+First we need to create a [PersistentVolume](https://kubernetes.io/docs/user-guide/persistent-volumes/) to provision EFS volume.
+
+Save following snippet as `pv.yaml`
+```
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: efsvol
+spec:
+  capacity:
+    storage: 100Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: EFS_DNS_NAME
+    path: "/"
+```
+
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`
+
+Run following command to create a persistent volume:
+```
+kubectl --kubeconfig=kubeconfig create -f pv.yaml
+```
+
+Next let's create a [PersistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/) to claim the persistent volume.
+
+Save following snippet as `pvc.yaml`.
+```
+kind: PersistentVolumeClaim
+apiVersion: v1
+metadata:
+  name: efsvol
+spec:
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 50Gi
+```
+
+Run following command to create a persistent volume claim:
+```
+kubectl --kubeconfig=kubeconfig create -f pvc.yaml
+```
+
+#### Prepare Training Data
+
+We will now launch a kubernetes job that downloads, saves and evenly splits training data into 3 shards on the persistent volume that we just created.
+
+Save following snippet as `paddle-data-job.yaml`
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      containers:
+      - name: paddle-data
+        image: paddledev/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/efs"
+          name: efs
+        env:
+        - name: OUT_DIR
+          value: /efs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: efs
+          persistentVolumeClaim:
+            claimName: efsvol
+      restartPolicy: Never
+```
+
+Run following command to launch the job:
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-data-job.yaml
+```
+
+Job may take 7 min to finish, use following command to check job status. Do not proceed until `SUCCESSFUL` for `paddle-data` job is `1`
+```
+$ kubectl --kubeconfig=kubeconfig get jobs
+NAME          DESIRED   SUCCESSFUL   AGE
+paddle-data   1         1            6m
+```
+
+Data preparation is done by docker image `paddledev/paddle-tutorial:k8s_data`, see [here](src/k8s_data/README.md) for how to build this docker image and source code.
+
+#### Start Training
+
+Now we are ready to start paddle training job. Save following snippet as `paddle-cluster-job.yaml`
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: efs
+        persistentVolumeClaim:
+          claimName: efsvol
+      containers:
+      - name: trainer
+        image: paddledev/paddle-tutorial:k8s_train
+        command: ["/bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: quick_start
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        - name: TRAINER_COUNT
+          value: "3"
+        volumeMounts:
+        - mountPath: "/home/jobpath"
+          name: efs
+        ports:
+        - name: jobport0
+          hostPort: 7164
+          containerPort: 7164
+        - name: jobport1
+          hostPort: 7165
+          containerPort: 7165
+        - name: jobport2
+          hostPort: 7166
+          containerPort: 7166
+        - name: jobport3
+          hostPort: 7167
+          containerPort: 7167
+      restartPolicy: Never
+```
+
+`parallelism: 3, completions: 3` means this job will simultaneously start 3 PaddlePaddle pods, and this job will be finished when there are 3 finished pods.
+
+`env` field represents container's environment variables, we specify PaddlePaddle parameters by environment variables.
+
+`ports` indicates that TCP ports 7164 - 7167 are exposed for communication between `pserver` and `trainer`. Ports run contiguously from `CONF_PADDLE_PORT` (7164) to `CONF_PADDLE_PORT + CONF_PADDLE_PORTS_NUM + CONF_PADDLE_PORTS_NUM_SPARSE - 1` (7167). We use multiple ports for dense and sparse parameter updates to improve latency.
+
+Run following command to launch the job.
+```
+kubectl --kubeconfig=kubeconfig create -f paddle-cluster-job.yaml
+```
+
+Inspect individual pods
+
+```
+$ kubectl --kubeconfig=kubeconfig get pods
+NAME                       READY     STATUS    RESTARTS   AGE
+paddle-cluster-job-cm469   1/1       Running   0          9m
+paddle-cluster-job-fnt03   1/1       Running   0          9m
+paddle-cluster-job-jx4xr   1/1       Running   0          9m
+```
+
+Inspect individual console output
+```
+kubectl --kubeconfig=kubeconfig logs -f POD_NAME
+```
+
+`POD_NAME`: name of any pod (e.g., `paddle-cluster-job-cm469`).
+
+Run `kubectl --kubeconfig=kubeconfig describe job paddle-cluster-job` to check training job status. It will complete in around 20 minutes.
+
+The details for starting `pserver` and `trainer` are hidden inside docker image `paddledev/paddle-tutorial:k8s_train`, see [here](src/k8s_train/README.md) for how to build the docker image and source code.
+
+#### Inspect Training Output
+
+Training output (model snapshot and logs) will be saved in EFS. We can ssh into worker EC2 instance, mount EFS and check training output.
+
+1. ssh Into Worker EC2 instance
+```
+chmod 400 key-name.pem
+ssh -i key-name.pem core@INSTANCE_IP
+```
+
+`INSTANCE_IP`: public IP address of EC2 kubernetes worker node. Go to [EC2 console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#Instances:sort=instanceId) and check `public IP` of any `paddle-cluster-kube-aws-worker` instance.
+
+2. Mount EFS
+```
+mkdir efs
+sudo mount -t nfs4 -o nfsvers=4.1,rsize=1048576,wsize=1048576,hard,timeo=600,retrans=2 EFS_DNS_NAME:/ efs
+```
+
+`EFS_DNS_NAME`: DNS name as shown in description of `paddle-efs` that we created. Looks similar to `fs-2cbf7385.efs.us-west-2.amazonaws.com`.
+
+Now folder `efs` will have structure similar to:
+```
+-- paddle-cluster-job
+    |-- ...
+    |-- output
+    |   |-- node_0
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_1
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- node_2
+    |   |   |-- server.log
+    |   |   `-- train.log
+    |   |-- pass-00000
+    |   |   |-- ___fc_layer_0__.w0
+    |   |   |-- ___fc_layer_0__.wbias
+    |   |   |-- done
+    |   |   |-- path.txt
+    |   |   `-- trainer_config.lr.py
+    |   |-- pass-00001...
+```
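+`server.log` contains log for `pserver`. `train.log` contains log for `trainer`. Model description and snapshot is stored in `pass-0000*`. Note that the `node_0`, `node_1` and `node_2` directories represent PaddlePaddle nodes (named by trainer ID), not Kubernetes nodes.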
+
+### Kubernetes Cluster Tear Down
+
+#### Delete EFS
+
+Go to [EFS Console](https://us-west-2.console.aws.amazon.com/efs/home?region=us-west-2) and delete the EFS volume that we created.
+
+#### Delete security group
+
+Go to [Security Group Console](https://us-west-2.console.aws.amazon.com/ec2/v2/home?region=us-west-2#SecurityGroups:sort=groupId) and delete security group `paddle-efs`.
+
+
+#### Delete S3 Bucket
+
+Go to [S3 Console](https://console.aws.amazon.com/s3/home?region=us-west-2#) and delete the S3 bucket that we created.
+
+#### Destroy Cluster
+
+```
+kube-aws destroy
+```
+
+The command will return immediately, but it might take 5 min to tear down the whole cluster.
+
+You can go to [CloudFormation Console](https://us-west-2.console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks?filter=active) to check destroy process.
diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..6278dacb17a378da660b2f5434247efd41c995fc
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_basis_cn.md
@@ -0,0 +1,75 @@
+# Kubernetes 简介
+
+[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
+
+- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
+
+- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
+
+- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods，job启动后会创建这些pod并开始执行一个程序，等待这个程序执行成功并返回0则成功退出，如果执行失败，也可以配置不同的重试机制。
+
+- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
+
+- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+
+- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合，将外部的存储服务在Kubernetes中描述成为统一的资源形式，便于存储资源管理和Pod引用。
+
+# 部署Kubernetes集群
+
+Kubernetes提供了多种集群部署的方案，本文档内不重复介绍。这里给出几种常见的部署方法：
+
+- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器，便于本地验证和测试。
+- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统，不同主机(Bare-Metal, AWS, GCE)条件下，快速部署集群。
+- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。
+- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。
+
+可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。
+
+# 选择存储方案
+
+容器不会保留在运行时生成的数据，job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务，需要有一个外部的存储服务来保存训练所需数据和训练输出。
+常见的可选存储服务包括：
+
+- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单，可以用于小量数据的验证。不提供分布式存储，高可用，冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。
+- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统，可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。
+- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统，支持rbd，POSIX API接口(ceph fs)和对象存储API，参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。
+- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。
+
+# 配置kubectl
+
+## 安装kubectl
+```
+# OS X
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
+
+# Linux
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
+
+# Windows
+curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
+```
+
+## 配置kubectl访问你的kubernetes集群
+
+编辑`~/.kube/config`这个配置文件，修改`Master-IP`的地址。如果使用SSL认证，则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问（比如通过8080端口），也可以去掉这些证书的配置。
+```
+apiVersion: v1
+clusters:
+- cluster:
+    certificate-authority: /path/to/ca.crt
+    server: https://[Master-IP]:443
+  name: minikube
+contexts:
+- context:
+    cluster: minikube
+    user: minikube
+  name: minikube
+current-context: minikube
+kind: Config
+preferences: {}
+users:
+- name: minikube
+  user:
+    client-certificate: /path/to/apiserver.crt
+    client-key: /path/to/apiserver.key
+```
diff --git a/doc/howto/usage/cluster/k8s/k8s_cn.md b/doc/howto/usage/k8s/k8s_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_cn.md
rename to doc/howto/usage/k8s/k8s_cn.md
index 2575701053ca12cc3af45682af6cd682a88bb987..ab07cb9cd5b135ddea82b3360720537f1dc5a801 100644
--- a/doc/howto/usage/cluster/k8s/k8s_cn.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 单机训练
+# Kubernetes单机训练
 
 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
diff --git a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
similarity index 51%
rename from doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/k8s/k8s_distributed_cn.md
index 53d0b4676c6a3a2dc8c58e231756638cc0b67765..2a7a6c8c17882a6f2c95e933e051c4b8f1a8eeee 100644
--- a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -1,180 +1,97 @@
-# Kubernetes 分布式训练
+# Kubernetes分布式训练
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
-## Kubernetes 基本概念
-
-[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统，其提供应用部署、维护、 扩展机制等功能，利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行，且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws)，[Azure](http://kubernetes.io/docs/getting-started-guides/azure/)，[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前，需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识，下面先简要介绍一下本文用到的几个Kubernetes概念。
-
-- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点，这个节点可以是物理机或者虚拟机，Kubernetes集群就是由node节点与master节点组成的。
-
-- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器，pod是Kubernetes的最小调度单元，一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET，PID，IPC，UTS等Linux namespace。由于容器之间共享NET namespace，所以它们使用同一个IP地址，可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。
-
-- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 是Kubernetes上运行的作业，一次作业称为一个job，通常每个job包括一个或者多个pods。
-
-- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷，是pod内的容器都可以访问的共享目录，也是容器与node之间共享文件的方式，因为容器内的文件都是暂时存在的，当容器因为各种原因被销毁时，其内部的文件也会随之消失。通过volume，就可以将这些文件持久化存储。Kubernetes支持多种volume，例如hostPath(宿主机目录)，gcePersistentDisk，awsElasticBlockStore等。
-
-- [*Namespaces*](http://kubernetes.io/docs/user-guide/volumes/) 命名空间，在kubernetes中创建的所有资源对象(例如上文的pod，job)等都属于一个命名空间，在同一个命名空间中，资源对象的名字是唯一的，不同空间的资源名可以重复，命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。
+有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群，可以参考[k8s_basis](./k8s_basis_cn.md)。
 
 ## 整体方案
 
-### 部署Kubernetes集群
-
-首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
-
-![paddle on kubernetes结构图](k8s-paddle-arch.png)
-
-上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
-
-### 使用 Job
-
-我们使用Kubernetes中的job这个概念来代表一次分布式训练。Job表示一次性作业，在作业完成后，Kubernetes会销毁job产生的容器并且释放相关资源。
-
-在Kubernetes中，可以通过编写一个YAML文件，来描述这个job，在这个文件中，主要包含了一些配置信息，例如PaddlePaddle的节点个数，`paddle pserver`开放的端口个数与端口号，使用的网卡设备等，这些信息通过环境变量的形式传递给容器内的程序使用。
+在训练之前，用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统，需要使用其指定的方式挂载并导入数据)，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
-在一次分布式训练中，用户确定好本次训练需要的PaddlePaddle节点个数，将切分好的训练数据与配置文件上传到MFS共享目录中。然后编写这次训练的job YAML文件，提交给Kubernetes集群创建并开始作业。
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
-### 创建PaddlePaddle节点
+上图描述了一个3节点的分布式训练场景，在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
-当Kubernetes master收到请求，解析完YAML文件后，会创建出多个pod(个数为PaddlePaddle节点数)，Kubernetes会把这些pod调度到集群的node上运行。一个pod就代表一个PaddlePaddle节点，当pod被成功分配到一台物理/虚拟机上后，Kubernetes会启动pod内的容器，这个容器会根据YAML文件中的环境变量，启动`paddle pserver`与`paddle train`进程。
+根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，按照下面步骤即可：
 
-### 启动训练
-
-在容器启动后，会通过脚本来启动这次分布式训练，我们知道`paddle train`进程启动时需要知道其他节点的IP地址以及本节点的trainer_id，由于PaddlePaddle本身不提供类似服务发现的功能，所以在本文的启动脚本中，每个节点会根据job name向Kubernetes apiserver查询这个job对应的所有pod信息(Kubernetes默认会在每个容器的环境变量中写入apiserver的地址)。
-
-根据这些pod信息，就可以通过某种方式，为每个pod分配一个唯一的trainer_id。本文把所有pod的IP地址进行排序，将顺序作为每个PaddlePaddle节点的trainer_id。启动脚本的工作流程大致如下：
-
-  1. 查询Kubernetes apiserver获取pod信息，根据IP分配trainer_id
-  1. 从MFS共享目录中拷贝训练文件到容器内
-  1. 根据环境变量，解析出`paddle pserver`与`paddle train`的启动参数，启动进程
-  1. 训练时，PaddlePaddle会自动将结果保存在trainer_id为0的节点上，将输出路径设置为MFS目录，保存输出的文件
-
-
-## 搭建过程
-
-根据前文的描述，要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练，主要分为以下几个步骤：
-
-1. 制作PaddlePaddle镜像
-1. 将训练文件与切分好的数据上传到共享存储
-1. 编写本次训练的YAML文件，创建一个Kubernetes job
-1. 训练结束后查看输出结果
+1. [制作PaddlePaddle镜像](#制作镜像)
+1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件)
+1. [编写本次训练的YAML文件，创建一个Kubernetes job](#创建Job)
+1. [训练结束后查看输出结果](#查看输出)
 
 下面就根据这几个步骤分别介绍。
 
-
 ### 制作镜像
 
 PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境，用这个镜像创建的容器需要有以下两个功能：
 
 - 拷贝训练文件到容器内
-
 - 生成`paddle pserver`与`paddle train`进程的启动参数，并且启动训练
 
-因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。镜像的*Dockerfile*如下：
-
-```Dockerfile
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER zjsxzong89@gmail.com
+因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能，所以我们可以在这个基础上，添加启动脚本，制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/k8s/src/k8s_train/Dockerfile)。
 
-COPY start.sh /root/
-COPY start_paddle.py /root/
-CMD ["bash"," -c","/root/start.sh"]
-```
-
-[start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start.sh)文件拷贝训练文件到容器内，然后执行[start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start_paddle.py)脚本启动训练，前文提到的获取其他节点IP地址，分配`trainer_id`等都在`start_paddle.py`脚本中完成。
-
-`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
-
-```python
-parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
-    args, train_args_list = parser.parse_known_args()
-    train_args = refine_unknown_args(train_args_list)
-    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
-    podlist = getPodList()
-```
-
-然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器运行都运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
-
-```python
-    podlist = getPodList()
-    # need to wait until all pods are running
-    while not isPodAllRunning(podlist):
-        time.sleep(10)
-        podlist = getPodList()
-    idMap = getIdMap(podlist)
+```bash
+$ cd doc/howto/usage/k8s/src/k8s_train
+$ docker build -t [YOUR_REPO]/paddle:mypaddle .
 ```
 
-在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+然后将构建成功的镜像上传到镜像仓库。
 
-```python
-def getIdMap(podlist):
-    '''
-    generate tainer_id by ip
-    '''
-    ips = []
-    for pod in podlist["items"]:
-        ips.append(pod["status"]["podIP"])
-    ips.sort()
-    idMap = {}
-    for i in range(len(ips)):
-        idMap[ips[i]] = i
-    return idMap
+```bash
+docker push  [YOUR_REPO]/paddle:mypaddle
 ```
 
-在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址，读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
 
-在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+### 准备训练数据
 
-```python
-    program = 'paddle train'
-    args = " --nics=" + PADDLE_NIC
-    args += " --port=" + str(PADDLE_PORT)
-    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
-    args += " --comment=" + "paddle_process_by_paddle"
-    ip_string = ""
-    for ip in idMap.keys():
-        ip_string += (ip + ",")
-    ip_string = ip_string.rstrip(",")
-    args += " --pservers=" + ip_string
-    args_ext = ""
-    for key, value in train_args_dict.items():
-        args_ext += (' --' + key + '=' + value)
-    localIP = socket.gethostbyname(socket.gethostname())
-    trainerId = idMap[localIP]
-    args += " " + args_ext + " --trainer_id=" + \
-        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
-```
+这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据，也可以通过修改[k8s_data](./src/k8s_data/README.md)的内容来定制image。
 
-使用 `docker build` 构建镜像：
+在启动Job之前，需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下.
 
-```bash
-docker build -t your_repo/paddle:mypaddle .
-```
-
-然后将构建成功的镜像上传到镜像仓库。
-
-```bash
-docker push  your_repo/paddle:mypaddle
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddledev/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /mnt/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+        - name: nfs
+          persistentVolumeClaim:
+            claimName: mfs
+      restartPolicy: Never
 ```
 
-### 上传训练文件
-
-本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容，我们将训练文件与数据放在一个job name命名的目录中，上传到MFS共享存储。完成后MFS上的文件内容大致如下：
-
-```bash
-[root@paddle-kubernetes-node0 mfs]# tree -d
+完成后volume中的文件内容大致如下：
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
 .
-└── paddle-cluster-job
-    ├── data
-    │   ├── 0
-    │   │
-    │   ├── 1
-    │   │
-    │   └── 2
-    ├── output
-    └── recommendation
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
 ```
 
 目录中paddle-cluster-job是本次训练对应的job name，本次训练要求有3个PaddlePaddle节点，在paddle-cluster-job/data目录中存放切分好的数据，文件夹0，1，2分别代表3个节点的trainer_id。recommendation文件夹内存放训练文件，output文件夹存放训练结果与日志。
@@ -203,7 +120,7 @@ spec:
           path: /home/work/mfs
       containers:
       - name: trainer
-        image: your_repo/paddle:mypaddle
+        image: [YOUR_REPO]/paddle:mypaddle
         command: ["bin/bash",  "-c", "/root/start.sh"]
         env:
         - name: JOB_NAME
@@ -234,15 +151,18 @@ spec:
 
 `env`字段表示容器的环境变量，我们将`paddle`运行的一些参数通过这种方式传递到容器内。
 
-`JOB_PATH`表示共享存储挂载的路径，`JOB_NAME`表示job名字，`TRAIN_CONFIG_DIR`表示本次训练文件所在目录，这三个变量组合就可以找到本次训练需要的文件路径。
-
-`CONF_PADDLE_NIC`表示`paddle pserver`进程需要的`--nics`参数，即网卡名
+环境变量 | 说明
+--- | ---
+JOB_PATH | 共享存储挂载的路径
+JOB_NAME | Job的名字
+TRAIN_CONFIG_DIR | 本次训练文件所在目录，与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数，即网卡名
+CONF_PADDLE_PORT | `paddle pserver`的`--port`参数
+CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量，即`--ports_num`参数
+CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量，即`--ports_num_for_sparse`参数
+CONF_PADDLE_GRADIENT_NUM | 训练节点数量，即`--num_gradient_servers`参数
 
-`CONF_PADDLE_PORT`表示`paddle pserver`的`--port`参数，`CONF_PADDLE_PORTS_NUM`则表示稠密更新的端口数量，也就是`--ports_num`参数。
-
-`CONF_PADDLE_PORTS_NUM_SPARSE`表示稀疏更新的端口数量，也就是`--ports_num_for_sparse`参数。
-
-`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量，即`--num_gradient_servers`参数
+这些参数的具体描述，读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
 
 编写完YAML文件后，可以使用Kubernetes的命令行工具创建job。
 
@@ -285,8 +205,8 @@ I1116 09:10:17.123121    50 Util.cpp:155] commandline:
     --ports_num=2 --comment=paddle_process_by_paddle
     --pservers=192.168.129.66,192.168.223.143,192.168.129.71
     --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0 
-    --log_period=50 --dot_period=10 --saving_period=1 
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
     --local=0 --trainer_id=0
     --save_dir=/home/jobpath/paddle-cluster-job/output
 I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
@@ -306,3 +226,90 @@ I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:
 I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
 I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
 ```
+
+
+## 一些细节的补充
+
+### 使用环境变量
+
+使用容器方式运行训练任务的Kubernetes Job，通常会使用环境变量配置Job的配置信息。`start_paddle.py`提供了一个启动脚本，将环境变量转换成paddle的命令行参数：
+```
+API = "/api/v1/namespaces/"
+JOBSELECTOR = "labelSelector=job-name="
+JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
+JOB_PATH_OUTPUT = JOB_PATH + "/output"
+JOBNAME = os.getenv("JOB_NAME")
+NAMESPACE = os.getenv("JOB_NAMESPACE")
+PADDLE_NIC = os.getenv("CONF_PADDLE_NIC")
+PADDLE_PORT = os.getenv("CONF_PADDLE_PORT")
+PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
+PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
+PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
+```
+
+### Pod间通信
+`start_paddle.py`脚本开始时，会先进行参数的初始化与解析。
+
+```python
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态（容器都已运行）时，再通过函数`getIdMap(podlist)`获取trainer_id。
+
+```python
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+* *注意*: `getPodList()`会获取当前namespace下的所有pod，如果已经有pod运行，可能会导致出错。这种集群节点管理方式会在将来使用[StatefulSets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。
+
+在函数`getIdMap(podlist)`内部，我们通过读取`podlist`中每个pod的IP地址，将IP排序生成的序号作为trainer_id。
+
+```python
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+在得到`idMap`后，通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。
+
+### 启动任务
+
+在函数`startPaddle`中，最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析，解析环境变量得到`PADDLE_NIC`，`PADDLE_PORT`，`PADDLE_PORTS_NUM`等参数，然后通过自身的IP地址在`idMap`中获取`trainerId`。
+
+```python
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/k8s/k8s_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c3ab05b708e7a924577c26496b8c55126e76c62
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_en.md
@@ -0,0 +1,201 @@
+# Paddle On Kubernetes
+
+>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+  
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Go to the `/root/paddle/demo/quick_start/data` directory and run `get_data.sh` to download the training data.
+Then go to the `/root/paddle/demo/quick_start` directory and run `preprocess.sh` to pre-process the training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data, modify `/root/paddle/demo/quick_start/train.sh` so that its contents are as follows (one extra `cd` command is added):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+>We will use a Kubernetes job for the training process. The following steps show how to run the training with Kubernetes.
+
+### Create Yaml Files
+
+The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start Paddle Job
+
+Using the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc/howto/usage/cluster/k8s/Dockerfile b/doc/howto/usage/k8s/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/k8s/Dockerfile
rename to doc/howto/usage/k8s/src/Dockerfile
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..bd34f46c9b0ada7027fd53e553e7d033255d25fc
Binary files /dev/null and b/doc/howto/usage/k8s/src/add_security_group.png differ
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5f1526033d1daf401700989af1d25919bcb7675
Binary files /dev/null and b/doc/howto/usage/k8s/src/create_efs.png differ
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/k8s/src/efs_mount.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576
Binary files /dev/null and b/doc/howto/usage/k8s/src/efs_mount.png differ
diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..2183a232ad402b76f82a67234a5c93e13ce97ac3
Binary files /dev/null and b/doc/howto/usage/k8s/src/k8s-paddle-arch.png differ
diff --git a/doc/howto/usage/k8s/src/k8s_data/Dockerfile b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..6d3a12ae393aa594b8e6e9a5f726109426937284
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/Dockerfile
@@ -0,0 +1,10 @@
+# Data-preparation image: bundles the quick_start demo and get_data.sh on top of Alpine.
+FROM alpine
+
+# coreutils is installed because get_data.sh uses GNU options of split (-d, --number).
+RUN apk update && apk upgrade && apk add coreutils
+# COPY is used for plain local files; ADD's archive/URL handling is not needed here.
+COPY quick_start /quick_start
+COPY get_data.sh /bin/
+RUN chmod +x /bin/get_data.sh
+ENTRYPOINT ["/bin/get_data.sh"]
diff --git a/doc/howto/usage/k8s/src/k8s_data/README.md b/doc/howto/usage/k8s/src/k8s_data/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..83cef7affd0ac4d3a1ca08ea5b046fa81e1bc630
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/README.md
@@ -0,0 +1,6 @@
+To build PaddlePaddle data preparation image in tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run following commands:
+
+```
+cp -r ../../../../../../demo/quick_start .
+docker build . -t prepare-data-image-name
+```
diff --git a/doc/howto/usage/k8s/src/k8s_data/get_data.sh b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d187ba5ac8d03f69dfdefd4f63610ed7921575be
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_data/get_data.sh
@@ -0,0 +1,26 @@
+#!/bin/sh
+
+out_dir=$OUT_DIR
+split_count=$SPLIT_COUNT
+
+set -e
+
+mkdir -p $out_dir
+cp -r /quick_start $out_dir/
+
+mkdir -p $out_dir/0/data
+cd $out_dir/0/data
+wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
+tar zxvf preprocessed_data.tar.gz
+rm preprocessed_data.tar.gz
+
+split -d --number=l/$split_count -a 5 train.txt train.
+mv train.00000 train.txt
+
+cd $out_dir
+end=$(expr $split_count - 1)
+for i in $(seq 1 $end); do
+    mkdir -p $i/data
+    cp -r 0/data/* $i/data
+    mv $i/data/train.`printf %05d $i` $i/data/train.txt
+done;
diff --git a/doc/howto/usage/k8s/src/k8s_train/Dockerfile b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..c0fca1f9a945921e6e8899fee2db8845e66136a1
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_train/Dockerfile
@@ -0,0 +1,9 @@
+# Training image: the official CPU PaddlePaddle image plus the Kubernetes start-up scripts.
+FROM paddledev/paddle:cpu-latest
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+RUN chmod +x /root/start.sh
+# Exec-form arguments are passed verbatim, so the flag must be exactly "-c";
+# with a leading space bash would treat " -c" as a script file name.
+CMD ["bash", "-c", "/root/start.sh"]
diff --git a/doc/howto/usage/k8s/src/k8s_train/README.md b/doc/howto/usage/k8s/src/k8s_train/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..96bf65497ffa23e90c4c9350504f86367b48daf2
--- /dev/null
+++ b/doc/howto/usage/k8s/src/k8s_train/README.md
@@ -0,0 +1,5 @@
+To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:
+
+```
+docker build . -t train-image-name
+```
diff --git a/doc/howto/usage/cluster/k8s/start.sh b/doc/howto/usage/k8s/src/k8s_train/start.sh
similarity index 55%
rename from doc/howto/usage/cluster/k8s/start.sh
rename to doc/howto/usage/k8s/src/k8s_train/start.sh
index b3a1334174a20b018d35de3b01b149fc5b10d49d..12dfe1e6386885a6989d3887f21c6922f137a9ae 100755
--- a/doc/howto/usage/cluster/k8s/start.sh
+++ b/doc/howto/usage/k8s/src/k8s_train/start.sh
@@ -1,19 +1,19 @@
 #!/bin/sh
+
 set -eu
 
 jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
 cd /root
-cp -rf $jobconfig .
-cd $TRAIN_CONFIG_DIR
-
+cp -rf $jobconfig/* .
 
 python /root/start_paddle.py \
   --dot_period=10 \
-  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
+  --ports_num=$CONF_PADDLE_PORTS_NUM \
+  --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
   --log_period=50 \
   --num_passes=10 \
-  --trainer_count=4 \
+  --trainer_count=$TRAINER_COUNT \
   --saving_period=1 \
   --local=0 \
-  --config=./trainer_config.py \
+  --config=trainer_config.lr.py \
   --use_gpu=0
diff --git a/doc/howto/usage/cluster/k8s/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
similarity index 83%
rename from doc/howto/usage/cluster/k8s/start_paddle.py
rename to doc/howto/usage/k8s/src/k8s_train/start_paddle.py
index df00d82919faa2acecc79c28e3d773ba3de9672a..935c12bb67e1fe08bc135a7a2220fcd43c548482 100755
--- a/doc/howto/usage/cluster/k8s/start_paddle.py
+++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py
@@ -23,7 +23,6 @@ import argparse
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
 JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
-JOB_PATH_DATA = JOB_PATH + "/data"
 JOB_PATH_OUTPUT = JOB_PATH + "/output"
 JOBNAME = os.getenv("JOB_NAME")
 NAMESPACE = os.getenv("JOB_NAMESPACE")
@@ -33,6 +32,8 @@ PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
 PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
 PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
 
+tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
+
 
 def refine_unknown_args(cmd_args):
     '''
@@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
     for pod in podlist["items"]:
         if pod["status"]["phase"] == "Running":
             running += 1
+    print "waiting for pods running, require:", require, "running:", running
     if require == running:
         return True
     return False
@@ -79,8 +81,17 @@ def getPodList():
 
     pod = API + NAMESPACE + "/pods?"
     job = JOBNAME
-    return requests.get(apiserver + pod + JOBSELECTOR + job,
-                        verify=False).json()
+    if os.path.isfile(tokenpath):
+        # Send the token file's contents as a bearer token; `with` closes the file promptly.
+        with open(tokenpath, mode='r') as tokenfile:
+            token = tokenfile.read()
+        headers = {"Authorization": "Bearer " + token}
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            headers=headers,
+                            verify=False).json()
+    else:
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            verify=False).json()
 
 
 def getIdMap(podlist):
@@ -121,9 +132,10 @@ def startPaddle(idMap={}, train_args_dict=None):
     logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
     if not os.path.exists(JOB_PATH_OUTPUT):
         os.makedirs(JOB_PATH_OUTPUT)
-    os.mkdir(logDir)
-    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
-        "/" + str(trainerId) + " ./data"
+    if not os.path.exists(logDir):
+        os.mkdir(logDir)
+    copyCommand = 'cp -rf ' + JOB_PATH + \
+        "/" + str(trainerId) + "/data/*" + " ./data/"
     os.system(copyCommand)
     startPserver = 'nohup paddle pserver' + \
         " --port=" + str(PADDLE_PORT) + \
@@ -136,9 +148,9 @@ def startPaddle(idMap={}, train_args_dict=None):
     print startPserver
     os.system(startPserver)
     # wait until pservers completely start
-    time.sleep(10)
-    startTrainer = program + args + " > " + \
-        logDir + "/train.log 2>&1 < /dev/null"
+    time.sleep(20)
+    startTrainer = program + args + " 2>&1 | tee " + \
+        logDir + "/train.log"
     print startTrainer
     os.system(startTrainer)
 
@@ -152,7 +164,7 @@ if __name__ == '__main__':
     podlist = getPodList()
     # need to wait until all pods are running
     while not isPodAllRunning(podlist):
-        time.sleep(10)
+        time.sleep(20)
         podlist = getPodList()
     idMap = getIdMap(podlist)
     startPaddle(idMap, train_args_dict)
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/k8s/src/managed_policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a
Binary files /dev/null and b/doc/howto/usage/k8s/src/managed_policy.png differ
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/k8s/src/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/howto/usage/k8s/src/pserver_and_trainer.png differ
diff --git a/doc/howto/usage/k8s/src/route53_create_recordset.png b/doc/howto/usage/k8s/src/route53_create_recordset.png
new file mode 100644
index 0000000000000000000000000000000000000000..34e476c7beac30fcdde13fccc4cc8d08b4be3d35
Binary files /dev/null and b/doc/howto/usage/k8s/src/route53_create_recordset.png differ
diff --git a/doc/howto/usage/k8s/src/route53_create_zone.png b/doc/howto/usage/k8s/src/route53_create_zone.png
new file mode 100644
index 0000000000000000000000000000000000000000..25b7ddb831c5cba97f4b2edddd27da3234d621af
Binary files /dev/null and b/doc/howto/usage/k8s/src/route53_create_zone.png differ
diff --git a/doc/howto/usage/k8s/src/worker_security_group.png b/doc/howto/usage/k8s/src/worker_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..57eb0265a34ad4223b69600d2a3dd355482e0bf5
Binary files /dev/null and b/doc/howto/usage/k8s/src/worker_security_group.png differ
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 418d718fbd9c61bff3acb9c2dab0638c0b650bab..6dc48704bc230bd1a573c4b4b2e7c07791e48ced 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -15,13 +15,19 @@ import sys
 import os, subprocess
 import shlex
 from recommonmark import parser, transform
+try:
+   import py_paddle
+   import paddle
+   import paddle.v2
+except ImportError:
+   print("Must install paddle python package before generating documentation")
+   sys.exit(1)
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, '@PROJ_ROOT@/python')
 templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index e96c25cb75bee20d2e2949423d80ddab1d3450a1..b477f0120c4fa0544012080b7cfb8572d3c44b04 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -15,14 +15,20 @@ import sys
 import os, subprocess
 import shlex
 from recommonmark import parser, transform
+try:
+   import py_paddle
+   import paddle
+   import paddle.v2
+except ImportError:
+   print("Must install paddle python package before generating documentation")
+   sys.exit(1)
+
 
 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-sys.path.insert(0, '@PROJ_ROOT@/python')
-
 templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe800308d8d7a03619ec8e13fd8dc4aa7a8ed8be
--- /dev/null
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -0,0 +1,138 @@
+# 中文词向量模型的使用 #
+----------
+本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。
+
+在此感谢 @lipeng 提出的代码需求，并给出的相关模型格式的定义。
+
+## 介绍 ###
+### 中文字典 ###
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206325个词和3个特殊标记：
+  - `<s>`: 分词序列的开始
+  - `<e>`: 分词序列的结束
+  - `<unk>`: 未知词
+
+### 中文词向量的预训练模型 ###
+遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法，模型采用 n-gram 语言模型，结构如下图：6元上下文作为输入层->全连接层->softmax层 。对应于字典，我们预训练得到4种不同维度的词向量，分别为：32维、64维、128维和256维。
+<center>![](./neural-n-gram-model.png)</center>
+<center>Figure 1. neural-n-gram-model</center>
+
+### 下载和数据抽取 ###
+运行以下的命令下载和获取我们的字典和预训练模型：
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    ./pre_DictAndModel.sh
+
+## 中文短语改写的例子 ##
+以下示范如何使用预训练的中文字典和词向量进行短语改写。
+
+### 数据的准备和预处理 ###
+首先，运行以下的命令下载数据集。该数据集（utf8编码）包含20个训练样例，5个测试样例和2个生成式样例。
+
+    cd $PADDLE_ROOT/demo/seqToseq/data
+    ./paraphrase_data.sh
+
+第二步，将数据处理成规范格式，在训练数集上训练生成词向量字典（数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`）:
+
+    cd $PADDLE_ROOT/demo/seqToseq/
+    python preprocess.py -i data/paraphrase [--mergeDict]
+
+- 其中，如果使用`--mergeDict`选项，源语言短语和目标语言短语的字典将被合并（源语言和目标语言共享相同的编码字典）。本实例中，源语言和目标语言都是相同的语言，因此可以使用该选项。
+
+
+### 使用用户指定的词向量字典 ###
+使用如下命令，从预训练模型中，根据用户指定的字典，抽取对应的词向量构成新的词表:
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL --usrDict USRDICT -d DIM
+
+- `--preModel PREMODEL`: 预训练词向量字典模型的路径
+- `--preDict PREDICT`:  预训练模型使用的字典的路径
+- `--usrModel USRMODEL`: 抽取出的新词表的保存路径
+- `--usrDict USRDICT`: 用户指定新的字典的路径，用于构成新的词表
+- `-d DIM`: 参数（词向量）的维度
+
+此处，你也可以简单的运行以下的命令：
+
+    cd $PADDLE_ROOT/demo/seqToseq/data/
+    ./paraphrase_model.sh
+
+运行成功以后，你将会看到以下的模型结构：
+
+    paraphrase_model
+    |--- _source_language_embedding
+    |--- _target_language_embedding
+
+### 在PaddlePaddle平台训练模型 ###
+首先，配置模型文件，配置如下（可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置）:
+
+    from seqToseq_net import *
+    is_generating = False
+
+    ################## Data Definition #####################
+    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
+                                 job_mode = job_mode)
+
+    ############## Algorithm Configuration ##################
+    settings(
+          learning_method = AdamOptimizer(),
+          batch_size = 50,
+          learning_rate = 5e-4)
+
+    ################# Network configure #####################
+    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
+
+这个配置与`demo/seqToseq/translation/train.conf` 基本相同
+
+然后，使用以下命令进行模型训练:
+
+    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
+    ./train.sh
+
+其中，`train.sh` 与`demo/seqToseq/translation/train.sh` 基本相同，只有2个配置不一样:
+
+- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_model`
+- `--load_missing_parameter_strategy`：如果参数模型文件缺失，除词向量模型外的参数将使用正态分布随机初始化
+
+如果用户想要了解详细的数据集的格式、模型的结构和训练过程，请查看 [Text generation Tutorial](../text_generation/index_cn.md).
+
+## 可选功能 ##
+###  观测词向量
+PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
+
+- `-i INPUT`: 输入的（二进制）词向量模型名称
+- `-o OUTPUT`: 输出的文本模型名称
+- `-d DIM`: （词向量）参数维度
+
+运行完以上命令，用户可以在输出的文本模型中看到:
+
+    0,4,32156096
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+
+- 其中，第一行是`PaddlePaddle` 输出文件的格式说明，包含3个属性：
+  - `PaddlePaddle`的版本号，本例中为0
+  - 浮点数占用的字节数，本例中为4
+  - 总计的参数个数，本例中为32,156,096
+- 其余行是（词向量）参数行（假设词向量维度为32）
+  - 每行打印32个参数以','分隔
+  - 共有32,156,096/32 = 1,004,877行，也就是说，模型共包含1,004,877个被向量化的词
+
+### 词向量模型的修正
+`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --t2b -i INPUT -o OUTPUT
+
+- `-i INPUT`: 输入的文本词向量模型名称
+- `-o OUTPUT`: 输出的二进制词向量模型名称
+
+请注意，输入的文本格式如下:
+
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+- 输入文本中没有头部（格式说明）行
+- （输入文本）每行存储一个词，以逗号','分隔
diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png
index 001ed6cc19e8911f9b10f63211c9658160b3a06e..0eafd7cb49b545f412f8e775804bcd0b22c42454 100644
Binary files a/doc/tutorials/gan/gan.png and b/doc/tutorials/gan/gan.png differ
diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md
index 99c8d730117a469c89abb218eeacf66103c0cbed..ac9ed37b2264778869f92c0910b1cb946fb4427f 100644
--- a/doc/tutorials/gan/index_en.md
+++ b/doc/tutorials/gan/index_en.md
@@ -4,9 +4,7 @@ This demo implements GAN training described in the original [GAN paper](https://
 
 The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images. 
 
-<p align="center">
-    <img src="./gan.png" width="500" height="300"> 
-</p>
+<center>![](./gan.png)</center>
 <p align="center">
     Figure 1. GAN-Model-Structure
     <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
@@ -111,9 +109,7 @@ $python gan_trainer.py -d uniform --useGpu 1
 ```
 The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. 
 
-<p align="center">
-    <img src="./uniform_sample.png" width="300" height="300"> 
-</p>
+<center>![](./uniform_sample.png)</center>
 <p align="center">
     Figure 2. Uniform Sample
 </p>
@@ -135,9 +131,7 @@ To train the GAN model on mnist data, one can use the following command:
 $python gan_trainer.py -d mnist --useGpu 1
 ```
 The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
-<p align="center">
-    <img src="./mnist_sample.png" width="300" height="300"> 
-</p>
+<center>![](./mnist_sample.png)</center>
 <p align="center">
     Figure 3. MNIST Sample
 </p>
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png
index 4a96c45cae82673f5a1df986f2643a8026da7937..e716c48e782019a757bed0cb443f2ed97386cbe2 100644
Binary files a/doc/tutorials/gan/uniform_sample.png and b/doc/tutorials/gan/uniform_sample.png differ
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
index 97014d537655d21871295699381c5dd2106d0b56..6a27004d58d24cc466d930322be8cdbb2f434c74 100644
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
@@ -2,6 +2,7 @@
 
 * [快速入门](quick_start/index_cn.rst)
 * [个性化推荐](rec/ml_regression_cn.rst)
+* [图像分类](image_classification/index_cn.md)
 * [情感分析](sentiment_analysis/index_cn.md)
 * [语义角色标注](semantic_role_labeling/index_cn.md)
 * [机器翻译](text_generation/index_cn.md)
@@ -9,3 +10,4 @@
 ## 常用模型
 
 * [ResNet模型](imagenet_model/resnet_model_cn.md)
+* [词向量模型](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
index cce9d3a176a5e5c87e97c16362ec8a202e8eb80a..77331a703b6f0fdf92921ebcc476325b7327e976 100644
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -7,6 +7,7 @@ There are several examples and demos here.
 * [Sentiment Analysis](sentiment_analysis/index_en.md)
 * [Semantic Role Labeling](semantic_role_labeling/index_en.md)
 * [Text Generation](text_generation/index_en.md)
+* [Image Auto-Generation](gan/index_en.md)
 
 ## Model Zoo
 * [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
index 70dec2eb2a8c397bc56b1e6f52a624a3a6877905..ca110431cf921ae0480d3fb2b17c58f90a84cc0e 100644
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -156,14 +156,14 @@ define_py_data_sources2(train_list='data/train.list',
                         obj="process",
                         args={"dictionary": word_dict})
 ```
-You can refer to the following link for more detailed examples and data formats: <a href = "../../api/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
+You can refer to the following link for more detailed examples and data formats: <a href = "../../api/v1/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
 
 ## Network Architecture
 We will describe four kinds of network architectures in this section.
 <center> ![](./src/PipelineNetwork_en.jpg) </center>
 
 First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
-For more detailed documentation, you could refer to: <a href = "../../api/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
+For more detailed documentation, you could refer to: <a href = "../../api/v1/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
 
 ### Logistic Regression
 The architecture is illustrated in the following picture:
@@ -366,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification pro
 <br>
 
 ## Optimization Algorithm
-<a href = "../../api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
+<a href = "../../api/v1/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
 
 ```python
 settings(batch_size=128,
@@ -407,7 +407,7 @@ paddle train \
 --init_model_path=./output/pass-0000x
 ```
 
-We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to <a href = "../../api/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
+We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to <a href = "../../api/v1/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial, or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
 
 inference script (predict.sh)：
 
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 2daea052b01adc87f42e15cdcfec92301b7edae9..503024cff338dac42a6a8a32463472dc6b6451d9 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
+add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)
diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp
index 0cafbd896e2d88aee4406bd0305878ce489bc18d..d49b189e253f7a0792fe3f1fe7c8fdbb7071acd4 100644
--- a/paddle/api/Arguments.cpp
+++ b/paddle/api/Arguments.cpp
@@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
   return args;
 }
 
+// Returns a new heap-allocated Arguments holding a copy of the paddle::Argument at `ptr`.
+Arguments* Arguments::createByPaddleArgument(const void* ptr) {
+  auto args = new Arguments();
+  args->m->outputs.push_back(*static_cast<const paddle::Argument*>(ptr));
+  return args;
+}
+
 Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return Matrix::createByPaddleMatrixPtr(&a.value);
@@ -137,6 +144,8 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
   a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
 }
 
+float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
+
 int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
   auto& a = m->getArg(idx);
   return a.getBatchSize();
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index da6dad10cd807654f9ddd03beeb29cef69fc8de0..6e8fcd114df580a00858d95f0af0d1ec0bd9b4a2 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -1,3 +1,21 @@
+# generate_python_api(<target_name>): run SWIG on api/Paddle.swig and wrap its outputs in target <target_name>.
+function(generate_python_api target_name)
+    add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
+        COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
+        COMMAND ${CMAKE_COMMAND} -E rename ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
+                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
+                ${external_project_dependencies}
+        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+        COMMENT "Generate Python API from swig" VERBATIM)
+    add_custom_target(${target_name} ALL DEPENDS
+                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx ${PROJ_ROOT}/paddle/Paddle_wrap.h
+                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                ${external_project_dependencies})
+endfunction()
+
 set(API_SOURCES
     Arguments.cpp
     ConfigParser.cpp
@@ -42,7 +60,7 @@ file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
 
 # TODO(yuyang18) : make wheel name calculated by cmake
 add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py  bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
     COMMAND rm -rf py_paddle.egg-info build
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle
@@ -76,5 +94,19 @@ add_dependencies(python_api_wheel python_swig_sources
   paddle_cuda)
 
 if(WITH_TESTING)
+    IF(NOT PY_PIP_FOUND)
+        SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
+        ExternalProject_Add(pip
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY      https://github.com/pypa/pip.git
+            GIT_TAG             9.0.1
+            PREFIX              ${PIP_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+            BUILD_IN_SOURCE     1
+            DEPENDS python setuptools python_api_wheel
+        )
+    ENDIF()
     add_subdirectory(test)
 endif()
diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp
index c30e09876397e37ef9ed4ec3200d1aa372ceb609..681e3a380912339c531c16c88f43255c2f34c32f 100644
--- a/paddle/api/Evaluator.cpp
+++ b/paddle/api/Evaluator.cpp
@@ -27,3 +27,18 @@ std::string Evaluator::toString() {
   m->rawPtr->printStats(sout);
   return sout.str();
 }
+
+std::vector<std::string> Evaluator::getNames() const {
+  std::vector<std::string> retv;
+  m->rawPtr->getNames(&retv);
+  return retv;
+}
+
+double Evaluator::getValue(const std::string name) const {
+  paddle::Error err;
+  double v = m->rawPtr->getValue(name, &err);
+  if (err) {
+    throw std::runtime_error(err.msg());
+  }
+  return v;
+}
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index 66115f8293b905809639afff779abfdb2bb3a54e..dcb5fe086fdccf8ec62ee52cbaaac4b7dbbe2f9d 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -142,14 +142,28 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
   }
 }
 
+size_t GradientMachine::getNonStaticParameterSize() const {
+  return m->machine->getNonStaticParameters().size();
+}
+
+Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
+  auto params = m->machine->getNonStaticParameters();
+  if (i < params.size()) {
+    return Parameter::createFromSharedPtr(
+        &m->machine->getNonStaticParameters()[i]);
+  } else {
+    throw RangeError();
+  }
+}
+
 void GradientMachine::randParameters() { m->machine->randParameters(); }
 
-Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
+Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
     throw(UnsupportError) {
-  auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
+  auto nn = m->machine;
   if (nn) {
-    auto mat = nn->getLayerOutput(layerName);
-    return Matrix::createByPaddleMatrixPtr(&mat);
+    auto arg = nn->getLayerOutput(layerName);
+    return Arguments::createByPaddleArgument(&arg);
   } else {
     throw UnsupportError();
   }
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig
index 3365927f9b59936244230bed439808fa7ead2c61..068ba286c07d8854a1a7c7042224a679b50b4957 100644
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -178,6 +178,7 @@ namespace std {
 %newobject ParameterOptimizer::create;
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;
 
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 09c891871a5ca8571216d211203fe8643fc3a63f..c4f5dca26cc6a5e9fdd23ee27b594ced29a25c7a 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/common.h"
 
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT
@@ -47,6 +47,9 @@ void setUseGpu(bool useGpu);
 /// Return true if this py_paddle is compiled in GPU Version
 bool isGpuVersion();
 
+/// Return FLAGS_trainer_count
+int getTrainerCount();
+
 /// The Error of IO Operation. Such as file not found, etc.
 class IOError {};
 
@@ -450,8 +453,11 @@ public:
                                         IVector* vec) throw(RangeError);
   void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
 
+  float sum() const;
+
 private:
   static Arguments* createByPaddleArgumentVector(void* ptr);
+  static Arguments* createByPaddleArgument(const void* ptr);
   void* getInternalArgumentsPtr() const;
 
 private:
@@ -546,6 +552,10 @@ public:
   ParameterConfig* getConfig();
   void setValueUpdated();
 
+  bool save(const std::string& filename) const;
+
+  bool load(const std::string& filename) const;
+
   size_t getSize() const;
 
 private:
@@ -761,9 +771,12 @@ public:
   size_t getParameterSize() const;
   Parameter* getParameter(size_t i) throw(RangeError);
 
+  size_t getNonStaticParameterSize() const;
+  Parameter* getNonStaticParameter(size_t i) throw(RangeError);
+
   void randParameters();
 
-  Matrix* getLayerOutput(const std::string& layerName) const
+  Arguments* getLayerOutput(const std::string& layerName) const
       throw(UnsupportError);
 
   /**
@@ -803,6 +816,8 @@ private:
 
 public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount);
   ~ParameterUpdater();
 
   /**
@@ -892,6 +907,10 @@ public:
    */
   std::string toString();
 
+  std::vector<std::string> getNames() const;
+
+  double getValue(const std::string name) const;
+
 private:
   EvaluatorPrivate* m;
 
@@ -944,7 +963,7 @@ public:
 
   Arguments* getForwardOutput();
 
-  Matrix* getLayerOutput(const std::string& layerName);
+  Arguments* getLayerOutput(const std::string& layerName) const;
 };
 
 /// the N-Best results generated from one input sequence.
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index ddc00d8d1af4c58d7e2233423bea916408bee92b..19f7a898d6b8d3d02c5654559dcb86728266731e 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -57,4 +57,12 @@ size_t Parameter::getID() const { return m->getPtr()->getID(); }
 
 void Parameter::setValueUpdated() { m->getPtr()->setValueUpdated(); }
 
+bool Parameter::save(const std::string& filename) const {
+  return m->getPtr()->save(filename);
+}
+
+bool Parameter::load(const std::string& filename) const {
+  return m->getPtr()->load(filename);
+}
+
 size_t Parameter::getSize() const { return m->getPtr()->getSize(); }
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 7cd8ed7e3907489a60f37090df6f51492def2612..75b0ae7cb6cc8c9ad0f8fe69963b7439a44bf55e 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,15 +15,25 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "PaddleAPIPrivate.h"
+#include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
 
 ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
 
 ParameterUpdater *ParameterUpdater::createLocalUpdater(
     OptimizationConfig *config) {
-  auto param = new ParameterUpdater();
-  param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig()));
-  return param;
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr));
+  return updater;
 }
 
 ParameterUpdater::~ParameterUpdater() { delete m; }
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index d83dc380beeec3747451a483f4811eb833e8c226..84e4ca054abb0100a02c8a40e31c49c17684ef40 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
 void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
 void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
 
-Matrix* Trainer::getLayerOutput(const std::string& layerName) {
-  auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
-      this->m->getGradientMachine());
+Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
+  auto nn = this->m->getGradientMachine();
   CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
-  auto m = nn->getLayerOutput(layerName);
-  return Matrix::createByPaddleMatrixPtr(&m);
+  auto arg = nn->getLayerOutput(layerName);
+  return Arguments::createByPaddleArgument(&arg);
 }
 
 void Trainer::forwardOneBatch(size_t batchSize) {
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index c3f739568f50b6ee8b0894d06a4d7f91c7816879..d369df5d4e04b4a8d822db0e72a8051150868ce6 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -15,12 +15,11 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
-#include <fenv.h>
 #include <algorithm>
 #include <iostream>
 #include <iterator>
@@ -55,5 +54,7 @@ bool isGpuVersion() {
 #endif
 }
 
+int getTrainerCount() { return FLAGS_trainer_count; }
+
 static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
               "The Parameter Type should be same in core/api and core/common");
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index 23542b952b7699d66cf64b47d0354e9078ae06d9..82f45ba6ccec49eb190d1814a67a575f311689e8 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -1,17 +1,17 @@
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
-PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
-ZLIB_LIB="@ZLIB_LIBRARIES@"
+PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
+ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
 
 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
+GLOG_LIBRARIES="@GLOG_LIBRARIES@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
-CBLAS_LIBRARIES="@CBLAS_LIBS@"
+CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
 
-CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
 WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index b4d27b1cc728f92b2210f30b69f3f5899fe81d65..ad5dce209bf8e14120320a58c3cd85d6f6a97688 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -40,14 +40,14 @@ try:
             self.paddle_build_dir = PADDLE_BUILD_DIR
             self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
             self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
-            self.protolib = PROTOBUF_LIB
-            self.zlib = ZLIB_LIB
+            self.protolib = PROTOBUF_LIBRARY
+            self.zlib = ZLIB_LIBRARIES
             self.thread = CMAKE_THREAD_LIB
             self.dl_libs = CMAKE_DL_LIBS
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
             self.python_libs = PYTHON_LIBRARIES
 
-            self.glog_libs = LIBGLOG_LIBRARY
+            self.glog_libs = GLOG_LIBRARIES
 
             self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
diff --git a/paddle/api/test/.gitignore b/paddle/api/test/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..b7948824a1eab119140dd9bea20276c303fe4af1
--- /dev/null
+++ b/paddle/api/test/.gitignore
@@ -0,0 +1,2 @@
+*.w0
+*.wbias
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 08a0fe96a004d38b81d0bac881da1faeb52685f4..a2fa623c80087d42e6a2a5c05f62eba4997f8ec4 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_test(NAME test_swig_api
-    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh)
+    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh ${PYTHON_EXECUTABLE})
diff --git a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh
index 2f12ba026430ba7adb6f4dee11ed17ea3ad3f36d..bcf06afa86aaa1a3151aeb966b54f69657c541e3 100755
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/api/test/run_tests.sh
@@ -20,11 +20,7 @@ popd > /dev/null
 
 cd $SCRIPTPATH
 
-rm -rf .test_env
-virtualenv .test_env
-source .test_env/bin/activate
-
-pip --timeout 600  install ../../dist/*.whl
+"${1:-python}" -m pip install ../../dist/*.whl  # $1: Python interpreter (quoted: path may contain spaces); defaults to python
 
 test_list="testArguments.py testGradientMachine.py testMatrix.py  testVector.py testTrain.py testTrainer.py"
 
@@ -33,7 +29,7 @@ export PYTHONPATH=$PWD/../../../python/
 for fn in $test_list
 do
   echo "test $fn"
-  python $fn
+  "${1:-python}" $fn  # run with the interpreter given as $1 (quoted), else python from PATH
   if [ $? -ne 0 ]; then
     exit 1
   fi
diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py
index 8cabecd242fb4eb98c0fe468687ef179245e4535..9fe44de94ea6ddb71d2dfbb2243fc86ede0d0531 100644
--- a/paddle/api/test/testArguments.py
+++ b/paddle/api/test/testArguments.py
@@ -22,6 +22,8 @@ class TestArguments(unittest.TestCase):
         args = swig_paddle.Arguments.createArguments(1)
         args.setSlotValue(0, m)
 
+        self.assertAlmostEqual(27.0, args.sum())
+
         mat = args.getSlotValue(0)
         assert isinstance(mat, swig_paddle.Matrix)
         np_mat = mat.toNumpyMatInplace()
diff --git a/paddle/api/test/testGradientMachine.py b/paddle/api/test/testGradientMachine.py
index b81eafa9673ca34f1b7e06401098d55bdb1b35a5..4b705f66eccd267f326fe0662a17b33a09fda982 100644
--- a/paddle/api/test/testGradientMachine.py
+++ b/paddle/api/test/testGradientMachine.py
@@ -45,6 +45,7 @@ class TestGradientMachine(unittest.TestCase):
             assert isinstance(val, swig_paddle.Vector)
             arr = numpy.full((len(val), ), 0.1, dtype="float32")
             val.copyFromNumpyArray(arr)
+            self.assertTrue(param.save(param.getName()))
             param_config = param.getConfig().toProto()
             assert isinstance(param_config,
                               paddle.proto.ParameterConfig_pb2.ParameterConfig)
@@ -92,6 +93,9 @@ class TestGradientMachine(unittest.TestCase):
 
         self.assertTrue(self.isCalled)
 
+        for param in machine.getParameters():
+            self.assertTrue(param.load(param.getName()))
+
     def test_train_one_pass(self):
         conf_file_path = './testTrainConfig.py'
         trainer_config = swig_paddle.TrainerConfig.createFromTrainerConfigFile(
diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py
index 37666bdccc9aedfe8f8079124129aad2ade53a43..f08fbf3ccdf5d7c0a5c739868b1bcb516146c23d 100644
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase):
 
     def test_numpyCpu(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
-        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False)
+        m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
         self.assertEqual((int(m.getHeight()), int(m.getWidth())),
                          numpy_mat.shape)
 
diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py
index a90d15c272a3a2b56e35c979e053deb2b54eebc1..7061a4c43bf01158b5f084d0c310dedd81773a04 100644
--- a/paddle/api/test/testTrain.py
+++ b/paddle/api/test/testTrain.py
@@ -89,9 +89,14 @@ def main():
             except Exception as e:
                 print e
 
+        ev = m.makeEvaluator()
+        ev.start()
         m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
                           update_callback)
-
+        m.eval(ev)
+        ev.finish()
+        for name in ev.getNames():
+            print name, ev.getValue(name)
         for optimizer in optimizers:
             optimizer.finishBatch()
 
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index 1ab095c1d3d0d2c84d2d2f95a03f172b901de209..6339cf8542607bdda99eb9ccaa8b06480f144b78 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase):
 
     def test_cpu_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
-        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False)
+        iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
         self.assertEqual(vec.shape[0], int(iv.__len__()))
         vec[4] = 832
         for i in xrange(len(iv)):
@@ -106,7 +106,7 @@ class TestVector(unittest.TestCase):
 
     def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
-        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
+        vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
         assert isinstance(vec, swig_paddle.Vector)
         numpy_arr[0] = 0.1
         for n, v in zip(numpy_arr, vec):
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index aa1ff4a771c4a1c64be86893e7b2261ae65f0f94..a28ccd6f07cfd56b7f1978f67fdcf6e7e5fe6337 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -15,7 +15,6 @@ else()
 endif()
 
 set(CUDA_CXX_WITH_GPU_SOURCES
-    src/hl_cudart_wrap.cc
     src/hl_cuda_cublas.cc
     src/hl_cuda_cudnn.cc
     src/hl_cuda_device.cc)
@@ -88,6 +87,8 @@ else()
                 ${CUDA_CXX_SOURCES})
 endif()
 
+add_dependencies(paddle_cuda ${external_project_dependencies})
+
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
                        ${CUDA_HEADERS}
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index 20c13f21e61a92b0635b686f6f724ae2b44518cc..276a07d3c735c771c851e8b4bd14c720f9ab6569 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -36,14 +36,6 @@ void GetCublasDsoHandle(void** dso_handle);
  */
 void GetCudnnDsoHandle(void** dso_handle);
 
-/**
- * @brief    load the DSO of CUDA Run Time
- *
- * @param    **dso_handle   dso handler
- *
- */
-void GetCudartDsoHandle(void** dso_handle);
-
 /**
  * @brief    load the DSO of CURAND
  *
diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h
index abd5eb3a0cf338c689680dd0f7192be7b2530383..eb454c59c1e58cf2b4817b4cb3230b9d75e320ac 100644
--- a/paddle/cuda/include/hl_matrix.h
+++ b/paddle/cuda/include/hl_matrix.h
@@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d,
                                         const int* index,
                                         int numSequence);
 
-/**
- * @brief   Matrix classification error.
- *
- * @param[in]   A_d     input matrix (M x N).
- * @param[in]   B_d     input vector (M x 1).
- * @param[out]  C_d     output vector (M x 1).
- * @param[in]   dimM    matrix height.
- * @param[in]   dimN    matrix width.
- *
- */
-extern void hl_matrix_classification_error(
-    real* A_d, int* B_d, real* C_d, int dimM, int dimN);
-
 /**
  * @brief   Matrix cross entropy.
  *
@@ -188,48 +175,6 @@ extern void hl_param_relu_backward_diff(real* grad_o,
                                         int width,
                                         int height,
                                         int partial_sum);
-/**
- * @brief cos sim forward
- *
- * @param[out]    output         output data
- * @param[in]     input1         input1 data(matrix)
- * @param[in]     input2         input2 data(matrix or vector)
- * @param[in]     width          matrix width
- * @param[in]     input1_height  input1_height
- * @param[in]     input2_height  input2_height
- * @param[in]     scale          scale factor
- */
-extern void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale);
-/**
- * @brief cos sim derivate
- *
- * @param[in]     grad             output grad
- * @param[in]     output           output data
- * @param[in]     prevOutX         input1 data
- * @param[in]     prevOutY         input2 data
- * @param[out]    prevGradX        input1 grad
- * @param[out]    prevGradY        input2 grad
- * @param[in]     width            matrix width
- * @param[in]     input1_height    input1 height
- * @param[in]     input2_height    input2 height
- * @param[in]     scale            scale factor
- */
-extern void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale);
 
 /**
  * @brief   Matrix addition: A_d[i][j] += scale * B_d[j/channel].
@@ -267,4 +212,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
                                           const int dimN,
                                           real scale);
 
+/**
+ * @brief  Rotate a matrix by 90 degrees.
+ *
+ * @param[in]   mat       input matrix (M x N).
+ * @param[out]  matRot    output matrix (N x M).
+ * @param[in]   dimM      input matrix height.
+ * @param[in]   dimN      input matrix width.
+ * @param[in]   clockWise rotation direction
+ */
+extern void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise);
+
 #endif /* HL_MATRIX_H_ */
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9bcd25b0623e569052e08c0befc8e09f937fa4bd..9f9d8f972e3a4c62e5caedcf85054be5681b96c1 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
 extern void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
 
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weightData      padding data.
- * @param[out]  output          output sequence.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- * @param[in]   isPadding       trainable padding.
- *
- */
-extern void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding);
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- *
- */
-extern void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart);
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  weightGrad      weight gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   weightDim       input sequence dimension.
- * @param[in]   totalPad        number of extra timesteps.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- *
- */
-extern void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad);
-
 /**
  * @brief   Memory copy from sequence to batch.
  *
diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h
index 77949ed295a6eaf7cc535853e53bef066ffac37c..79ae0d0e741de06e622454ccd220e2c749d795b3 100644
--- a/paddle/cuda/include/hl_top_k.h
+++ b/paddle/cuda/include/hl_top_k.h
@@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal,
                                    int beamSize,
                                    int numSamples);
 
-#endif /* HL_TOP_K_H_ */
+/**
+ * @brief   Top-k matrix classification error.
+ *
+ * @param[out]  topVal         top k element.
+ * @param[in]   ldv            leading dimension of topVal.
+ * @param[out]  topIds         top k index.
+ * @param[in]   src            input value.
+ * @param[in]   lds            leading dimension of src.
+ * @param[in]   dim            width of input value.
+ * @param[in]   topkSize       size of top k element.
+ * @param[in]   numSamples     height of input value.
+ * @param[in]   label          ground truth label.
+ * @param[out]  recResult      top-k classification error.
+ *
+ */
+extern void hl_matrix_classification_error(real* topVal,
+                                           int ldv,
+                                           int* topIds,
+                                           real* src,
+                                           int lds,
+                                           int dim,
+                                           int topkSize,
+                                           int numSamples,
+                                           int* label,
+                                           real* recResult);
+
+#endif  // HL_TOP_K_H_
diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h
index 79bf6c3db7f876009d98a62b6523588f021886e8..7885ae570148c0b9870089baf22b6bacb786f995 100644
--- a/paddle/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_WARPCTC_WRAP_H_
 #define HL_WARPCTC_WRAP_H_
 
+#include "ctc.h"
 #include "hl_base.h"
-#include "warp-ctc/include/ctc.h"
 
 typedef ctcStatus_t hl_warpctc_status_t;
 typedef ctcOptions hl_warpctc_options_t;
diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h
index 0b669f6735cb9771fd63ed8e3b45602db0db447c..127cb7e27983e8ff2c1ff6ef5108b5f8c5bd6ca5 100644
--- a/paddle/cuda/include/stub/hl_matrix_stub.h
+++ b/paddle/cuda/include/stub/hl_matrix_stub.h
@@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d,
 inline void hl_matrix_softmax_derivative(
     real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
 
-inline void hl_matrix_classification_error(
-    real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
+inline void hl_matrix_classification_error(real* topVal,
+                                           int ldv,
+                                           int* topIds,
+                                           real* src,
+                                           int lds,
+                                           int dim,
+                                           int topkSize,
+                                           int numSamples,
+                                           int* label,
+                                           real* recResult) {}
 
 inline void hl_matrix_cross_entropy(
     real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
@@ -74,25 +82,6 @@ inline void hl_param_relu_backward_diff(real* grad_o,
                                         int height,
                                         int partial_sum) {}
 
-inline void hl_cossim(real* output,
-                      real* input1,
-                      real* input2,
-                      int width,
-                      int input1_height,
-                      int input2_height,
-                      real scale) {}
-
-inline void hl_cossim_derivative(real* grad,
-                                 real* output,
-                                 real* prevOutX,
-                                 real* prevOutY,
-                                 real* prevGradX,
-                                 real* prevGradY,
-                                 int width,
-                                 int input1_height,
-                                 int input2_height,
-                                 real scale) {}
-
 inline void hl_matrix_add_shared_bias(real* A_d,
                                       real* B_d,
                                       const int channel,
@@ -106,4 +95,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
                                           const int dimM,
                                           const int dimN,
                                           real scale) {}
+
+inline void hl_matrix_rotate(
+    real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
+
 #endif  // HL_MATRIX_STUB_H_
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index d6b07556f8958a62bd47f0b47b75bbebafeb58d3..05e51bce9e1df6fc6ef1cad891b44a9172da185d 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
 inline void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
 
-inline void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding) {}
-
-inline void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {}
-
-inline void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {}
-
 inline void hl_sequence2batch_copy(real* batch,
                                    real* sequence,
                                    const int* batchIndex,
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index a71eecba2736234dafaf6b67e5efac5358a30871..6dfb12e00b80db36ad2e53326b880c7d1ed59263 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -22,10 +22,9 @@ limitations under the License. */
 #include <sys/time.h>
 #include <unistd.h>
 #include <mutex>
-#include "hl_cuda.h"
 #include "hl_cuda.ph"
-#include "hl_dso_loader.h"
 #include "hl_thread.ph"
+#include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
 // clang-format on
 
@@ -77,78 +76,6 @@ CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
 #undef CURAND_RAND_ROUTINE_EACH
 #undef DYNAMIC_LOAD_CURAND_WRAP
 
-std::once_flag cudart_dso_flag;
-void *cudart_dso_handle = nullptr;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- *
- * note: default dynamic linked libs
- */
-#ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                                       \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    auto operator()(Args... args) -> decltype(__name(args...)) {               \
-      using cudart_func = decltype(__name(args...)) (*)(Args...);              \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudart_func>(p_##__name)(args...);               \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-#else
-#define DYNAMIC_LOAD_CUDART_WRAP(__name)                         \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name; /* struct DynLoad__##__name */
-#endif
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)        \
-  __macro(cudaMalloc)                     \
-  __macro(cudaHostAlloc)                  \
-  __macro(cudaFree)                       \
-  __macro(cudaFreeHost)                   \
-  __macro(cudaMemcpy)                     \
-  __macro(cudaMemset)                     \
-  __macro(cudaMemcpyAsync)                \
-  __macro(cudaSetDevice)                  \
-  __macro(cudaGetDevice)                  \
-  __macro(cudaGetDeviceCount)             \
-  __macro(cudaGetDeviceProperties)        \
-  __macro(cudaDeviceSynchronize)          \
-  __macro(cudaDeviceCanAccessPeer)        \
-  __macro(cudaDeviceEnablePeerAccess)     \
-  __macro(cudaStreamCreate)               \
-  __macro(cudaStreamDestroy)              \
-  __macro(cudaStreamSynchronize)          \
-  __macro(cudaStreamWaitEvent)            \
-  __macro(cudaEventCreate)                \
-  __macro(cudaEventRecord)                \
-  __macro(cudaEventQuery)                 \
-  __macro(cudaEventDestroy)               \
-  __macro(cudaEventSynchronize)           \
-  __macro(cudaEventElapsedTime)           \
-  __macro(cudaSetDeviceFlags)             \
-  __macro(cudaGetLastError)               \
-  __macro(cudaFuncSetCacheConfig)         \
-  __macro(cudaRuntimeGetVersion)          \
-  __macro(cudaGetErrorString)             \
-  __macro(cudaProfilerStart)              \
-  __macro(cudaProfilerStop)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#undef CUDA_ROUNTINE_EACH
-#undef DYNAMIC_LOAD_CUDART_WRAP
-
 } /* namespace dynload */
 
 /**
@@ -171,11 +98,11 @@ int g_cuda_lib_version = 0;
  * Check build-in cuda function using glog and it **does not**
  * support << operator for more details error info.
  */
-#define CHECK_CUDA(cudaFunc)                                                  \
-  do {                                                                        \
-    cudaError_t cudaStat = cudaFunc;                                          \
-    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                         \
-                                    << dynload::cudaGetErrorString(cudaStat); \
+#define CHECK_CUDA(cudaFunc)                                         \
+  do {                                                               \
+    cudaError_t cudaStat = cudaFunc;                                 \
+    CHECK_EQ(cudaSuccess, cudaStat) << "Cuda Error: "                \
+                                    << cudaGetErrorString(cudaStat); \
   } while (0)
 
 /**
@@ -284,13 +211,13 @@ void hl_fini() {
       tmp_stream = (char *)t_device[dev]->stream;
     }
     for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-      CHECK_CUDA(dynload::cudaStreamDestroy(t_device[dev]->stream[j]));
+      CHECK_CUDA(cudaStreamDestroy(t_device[dev]->stream[j]));
     }
 
     /* free device memory */
     hl_free_mem_device(t_device[dev]->gpu_mem);
     hl_free_mem_host(t_device[dev]->cpu_mem);
-    CHECK_CUDA(dynload::cudaEventDestroy(t_device[dev]->mem_event));
+    CHECK_CUDA(cudaEventDestroy(t_device[dev]->mem_event));
   }
 
   free(tmp);
@@ -308,7 +235,7 @@ void hl_set_device(int device) {
   CHECK(device >= 0 && device < g_system_device_num && g_device[device])
       << "Device: " << device << " is not specified in startup.";
 
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
 
   /* switch thread stream */
   for (int i = 0; i < NUMBER_OF_GLOBAL_STREAM; i++) {
@@ -336,7 +263,7 @@ void hl_set_device(int device) {
 
 int hl_get_device() {
   int device;
-  CHECK_CUDA(dynload::cudaGetDevice(&device));
+  CHECK_CUDA(cudaGetDevice(&device));
   return device;
 }
 
@@ -344,7 +271,7 @@ void *hl_malloc_device(size_t size) {
   void *dest_d;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(dynload::cudaMalloc((void **)&dest_d, size));
+  CHECK_CUDA(cudaMalloc((void **)&dest_d, size));
 
   return dest_d;
 }
@@ -352,7 +279,7 @@ void *hl_malloc_device(size_t size) {
 void hl_free_mem_device(void *dest_d) {
   CHECK_NOTNULL(dest_d);
 
-  cudaError_t err = dynload::cudaFree(dest_d);
+  cudaError_t err = cudaFree(dest_d);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -361,8 +288,7 @@ void *hl_malloc_host(size_t size) {
   void *dest_h;
 
   CHECK(size) << __func__ << ": the size for device memory is 0, please check.";
-  CHECK_CUDA(
-      dynload::cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
+  CHECK_CUDA(cudaHostAlloc((void **)&dest_h, size, cudaHostAllocDefault));
 
   return dest_h;
 }
@@ -370,7 +296,7 @@ void *hl_malloc_host(size_t size) {
 void hl_free_mem_host(void *dest_h) {
   CHECK_NOTNULL(dest_h);
 
-  cudaError_t err = dynload::cudaFreeHost(dest_h);
+  cudaError_t err = cudaFreeHost(dest_h);
   CHECK(cudaSuccess == err || cudaErrorCudartUnloading == err)
       << hl_get_device_error_string();
 }
@@ -381,11 +307,11 @@ void hl_memcpy(void *dst, void *src, size_t size) {
   }
   CHECK_NOTNULL(dst);
   CHECK_NOTNULL(src);
-  CHECK_CUDA(dynload::cudaMemcpy(dst, src, size, cudaMemcpyDefault));
+  CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 }
 
 void hl_memset_device(void *dest_d, int value, size_t size) {
-  CHECK_CUDA(dynload::cudaMemset(dest_d, value, size));
+  CHECK_CUDA(cudaMemset(dest_d, value, size));
 }
 
 void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
@@ -394,7 +320,7 @@ void hl_memcpy_host2device(void *dest_d, void *src_h, size_t size) {
   }
   CHECK_NOTNULL(src_h);
   CHECK_NOTNULL(dest_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_h, size, cudaMemcpyHostToDevice));
 }
 
 void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
@@ -403,7 +329,7 @@ void hl_memcpy_device2host(void *dest_h, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_h);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(dynload::cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
+  CHECK_CUDA(cudaMemcpy(dest_h, src_d, size, cudaMemcpyDeviceToHost));
 }
 
 void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
@@ -412,8 +338,7 @@ void hl_memcpy_device2device(void *dest_d, void *src_d, size_t size) {
   }
   CHECK_NOTNULL(dest_d);
   CHECK_NOTNULL(src_d);
-  CHECK_CUDA(
-      dynload::cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
+  CHECK_CUDA(cudaMemcpy(dest_d, src_d, size, cudaMemcpyDeviceToDevice));
 }
 
 void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
@@ -427,8 +352,7 @@ void hl_memcpy_async(void *dst, void *src, size_t size, hl_stream_t stream) {
   CHECK_LT(stream, HPPL_STREAM_END);
   cu_stream = t_resource.stream[stream];
 
-  CHECK_CUDA(
-      dynload::cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
+  CHECK_CUDA(cudaMemcpyAsync(dst, src, size, cudaMemcpyDefault, cu_stream));
 }
 
 void hl_start() {
@@ -439,8 +363,7 @@ void hl_start() {
 
 bool hl_device_can_access_peer(int device, int peerDevice) {
   int canAccessPeer;
-  CHECK_CUDA(
-      dynload::cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
+  CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessPeer, device, peerDevice));
 
   if (canAccessPeer == 1) {
     return true;
@@ -450,9 +373,9 @@ bool hl_device_can_access_peer(int device, int peerDevice) {
 }
 
 void hl_device_enable_peer_access(int peerDevice) {
-  cudaError_t err = dynload::cudaDeviceEnablePeerAccess(peerDevice, 0);
+  cudaError_t err = cudaDeviceEnablePeerAccess(peerDevice, 0);
   if (cudaErrorPeerAccessAlreadyEnabled == err) {
-    dynload::cudaGetLastError();
+    cudaGetLastError();
   } else {
     CHECK_CUDA(err);
   }
@@ -463,9 +386,9 @@ void hl_create_global_resources(hl_device_prop device_prop) {
   int device = device_prop->device;
   global_device_resources device_res = device_prop->device_resources;
 
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
   /* device properties */
-  CHECK_CUDA(dynload::cudaGetDeviceProperties(&cu_prop, device));
+  CHECK_CUDA(cudaGetDeviceProperties(&cu_prop, device));
 
   device_prop->major = cu_prop.major;
   device_prop->minor = cu_prop.minor;
@@ -474,7 +397,7 @@ void hl_create_global_resources(hl_device_prop device_prop) {
 
   /* create device stream */
   for (int j = 0; j < NUMBER_OF_GLOBAL_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
   }
 
   /* cublas init */
@@ -501,18 +424,18 @@ void hl_create_global_resources(hl_device_prop device_prop) {
   device_res->gen_mutex = (pthread_mutex_t *)(malloc(sizeof(pthread_mutex_t)));
   pthread_mutex_init(device_res->gen_mutex, NULL);
 
-  CHECK_CUDA(dynload::cudaRuntimeGetVersion(&g_cuda_lib_version));
+  CHECK_CUDA(cudaRuntimeGetVersion(&g_cuda_lib_version));
 }
 
 int hl_get_cuda_version() { return g_cuda_lib_version; }
 
 void hl_create_thread_resources(int device,
                                 thread_device_resources device_res) {
-  CHECK_CUDA(dynload::cudaSetDevice(device));
+  CHECK_CUDA(cudaSetDevice(device));
 
   /* create thread stream */
   for (int j = 0; j < NUMBER_OF_THREAD_STREAM; j++) {
-    CHECK_CUDA(dynload::cudaStreamCreate(&device_res->stream[j]));
+    CHECK_CUDA(cudaStreamCreate(&device_res->stream[j]));
   }
 
   /* allocation device memory */
@@ -521,14 +444,14 @@ void hl_create_thread_resources(int device,
   /* allocation host memory */
   device_res->cpu_mem = (real *)hl_malloc_host(HPPL_GPU_MEMORY_SIZE);
 
-  CHECK_CUDA(dynload::cudaEventCreate(&device_res->mem_event));
+  CHECK_CUDA(cudaEventCreate(&device_res->mem_event));
 }
 
 void hl_specify_devices_start(int *device, int number) {
   if (hl_start_flag) return;
 
   /* 1. get the number of devices */
-  CHECK_CUDA(dynload::cudaGetDeviceCount(&g_system_device_num));
+  CHECK_CUDA(cudaGetDeviceCount(&g_system_device_num));
   CHECK_NE(g_system_device_num, 0) << "[Start failed] there is no GPU device";
   if (device == NULL) {
     number = g_system_device_num;
@@ -640,7 +563,7 @@ void hl_stream_synchronize(hl_stream_t stream) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamSynchronize(cu_stream));
+  CHECK_CUDA(cudaStreamSynchronize(cu_stream));
 }
 
 void hl_create_event(hl_event_t *event) {
@@ -649,7 +572,7 @@ void hl_create_event(hl_event_t *event) {
   struct _hl_event_st *st_event =
       (struct _hl_event_st *)malloc(sizeof(struct _hl_event_st));
 
-  CHECK_CUDA(dynload::cudaEventCreate(&st_event->cu_event));
+  CHECK_CUDA(cudaEventCreate(&st_event->cu_event));
 
   *event = st_event;
 }
@@ -659,8 +582,7 @@ float hl_event_elapsed_time(hl_event_t start, hl_event_t end) {
   CHECK_NOTNULL(start);
   CHECK_NOTNULL(end);
 
-  CHECK_CUDA(
-      dynload::cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
+  CHECK_CUDA(cudaEventElapsedTime(&time, start->cu_event, end->cu_event));
   return time;
 }
 
@@ -672,7 +594,7 @@ void hl_stream_record_event(hl_stream_t stream, hl_event_t event) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaEventRecord(event->cu_event, cu_stream));
+  CHECK_CUDA(cudaEventRecord(event->cu_event, cu_stream));
 }
 
 void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
@@ -683,12 +605,12 @@ void hl_stream_wait_event(hl_stream_t stream, hl_event_t event) {
                                     << ": the parameter stream is error.";
 
   cu_stream = t_resource.stream[stream];
-  CHECK_CUDA(dynload::cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
+  CHECK_CUDA(cudaStreamWaitEvent(cu_stream, event->cu_event, 0));
 }
 
 void hl_destroy_event(hl_event_t event) {
   CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventDestroy(event->cu_event));
+  CHECK_CUDA(cudaEventDestroy(event->cu_event));
 
   free(event);
   event = NULL;
@@ -696,7 +618,7 @@ void hl_destroy_event(hl_event_t event) {
 
 void hl_event_synchronize(hl_event_t event) {
   CHECK_NOTNULL(event);
-  CHECK_CUDA(dynload::cudaEventSynchronize(event->cu_event));
+  CHECK_CUDA(cudaEventSynchronize(event->cu_event));
 }
 
 void hl_get_device_name(char *name, int len, int device) {
@@ -725,24 +647,24 @@ void hl_get_device_compute_capability(int *major, int *minor, int device) {
   *minor = g_device[device]->minor;
 }
 
-int hl_get_device_last_error() { return (int)dynload::cudaGetLastError(); }
+int hl_get_device_last_error() { return (int)cudaGetLastError(); }
 
 const char *hl_get_device_error_string() {
-  cudaError_t err = dynload::cudaGetLastError();
-  return dynload::cudaGetErrorString(err);
+  cudaError_t err = cudaGetLastError();
+  return cudaGetErrorString(err);
 }
 
 const char *hl_get_device_error_string(size_t err) {
-  return dynload::cudaGetErrorString((cudaError_t)err);
+  return cudaGetErrorString((cudaError_t)err);
 }
 
-void hl_device_synchronize() { CHECK_CUDA(dynload::cudaDeviceSynchronize()); }
+void hl_device_synchronize() { CHECK_CUDA(cudaDeviceSynchronize()); }
 void hl_set_device_flags_block() {
-  CHECK_CUDA(dynload::cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
+  CHECK_CUDA(cudaSetDeviceFlags(cudaDeviceScheduleBlockingSync));
 }
 
 bool hl_cuda_event_is_ready(hl_event_t event) {
-  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
+  cudaError_t err = cudaEventQuery(event->cu_event);
   CHECK(cudaSuccess == err || cudaErrorNotReady == err);
 
   if (cudaErrorNotReady == err) {
@@ -751,6 +673,6 @@ bool hl_cuda_event_is_ready(hl_event_t event) {
   return true;
 }
 
-void hl_profiler_start() { CHECK_CUDA(dynload::cudaProfilerStart()); }
+void hl_profiler_start() { CHECK_CUDA(cudaProfilerStart()); }
 
-void hl_profiler_end() { CHECK_CUDA(dynload::cudaProfilerStop()); }
+void hl_profiler_end() { CHECK_CUDA(cudaProfilerStop()); }
diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu
index 2b4c6f7c39cff78c0e76cc1dfd41e1c7ef334f11..9bcc7fb7de44b2211db450fb164655f7947dcad9 100644
--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d,
   CHECK_SYNC("hl_matrix_softmax_derivative failed");
 }
 
-template<int blockSize>
-__global__ void KeMatrixClassificationError(real* in_A,
-                                            int* in_B,
-                                            real* out_C,
-                                            int dimN) {
-  __shared__ real max_s[blockSize];
-  __shared__ int max_l[blockSize];
-  const int tid = threadIdx.x;
-  const int rowId = blockIdx.x;
-
-  max_s[tid] = -1e30f;
-  in_A += rowId * dimN;
-  real tmp;
-  for (int colId = tid; colId < dimN; colId += blockSize) {
-    tmp = in_A[colId];
-    if (max_s[tid] < tmp) {
-      max_s[tid] = tmp;
-      max_l[tid] = colId;
-    }
-  }
-  __syncthreads();
-
-  for (int stride = blockSize/2; stride > 0; stride = stride/2) {
-    if (tid < stride) {
-      if (max_s[tid] < max_s[tid + stride]) {
-        max_s[tid] = max_s[tid + stride];
-        max_l[tid] = max_l[tid + stride];
-      }
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (tid == 0) {
-    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
-  }
-}
-
-void hl_matrix_classification_error(real* A_d,
-                                    int* B_d,
-                                    real* C_d,
-                                    int dimM,
-                                    int dimN) {
-  CHECK_NOTNULL(A_d);
-  CHECK_NOTNULL(B_d);
-  CHECK_NOTNULL(C_d);
-
-  // each sample is calculated by one block
-  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
-    (A_d, B_d, C_d, dimN);
-  CHECK_SYNC("hl_matrix_classification_error");
-}
-
 __global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
                                                 real* entropy,
                                                 int* row,
@@ -584,177 +531,6 @@ void hl_param_relu_backward_diff(real* grad_o,
   CHECK_SYNC("hl_param_relu_backward_diff failed");
 }
 
-template<int blockSize>
-__global__ void KeCosSim(real* output,
-                         real* input1,
-                         real* input2,
-                         int width,
-                         int input1_height,
-                         int input2_height,
-                         real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  input1 += ty * width;
-  if (input2_height > 1) {
-    input2 += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = input1[index];
-    real y = input2[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (tid == 0) {
-    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
-  }
-}
-
-void hl_cossim(real* output,
-               real* input1,
-               real* input2,
-               int width,
-               int input1_height,
-               int input2_height,
-               real scale) {
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(input1);
-  CHECK_NOTNULL(input2);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-
-  KeCosSim<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (output, input1, input2, width, input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim failed");
-}
-
-template<int blockSize>
-__global__ void KeCosSimDerivative(real* grad,
-                                   real* output,
-                                   real* prevOutX,
-                                   real* prevOutY,
-                                   real* prevGradX,
-                                   real* prevGradY,
-                                   int width,
-                                   int input1_height,
-                                   int input2_height,
-                                   real scale) {
-  const int ty = blockIdx.y;
-  int tid = threadIdx.x;
-
-  __shared__ real xx[blockSize];
-  __shared__ real yy[blockSize];
-  __shared__ real xy[blockSize];
-
-  xx[tid] = 0.0;
-  yy[tid] = 0.0;
-  xy[tid] = 0.0;
-  __syncthreads();
-
-  prevOutX += ty * width;
-  prevGradX += ty * width;
-  if (input2_height > 1) {
-    prevOutY += ty * width;
-    prevGradY += ty * width;
-  }
-  for (int index = tid; index < width; index += blockSize) {
-    real x = prevOutX[index];
-    real y = prevOutY[index];
-    xx[tid] += x * x;
-    yy[tid] += y * y;
-    xy[tid] += x * y;
-  }
-  __syncthreads();
-
-  for (int s = blockSize / 2; s > 0; s >>= 1) {
-    if (tid < s) {
-      xx[tid] += xx[tid + s];
-      yy[tid] += yy[tid + s];
-      xy[tid] += xy[tid + s];
-    }
-    __syncthreads();
-  }
-  if (xy[0] == 0) {
-    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] +=
-        scale * grad[ty] * prevOutY[index] * reciprocal;
-      if (input2_height > 1) {
-        prevGradY[index] +=
-          scale * grad[ty] * prevOutX[index] * reciprocal;
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index,
-          scale * grad[ty] * prevOutX[index] * reciprocal);
-      }
-    }
-  } else {
-    real reciprocalXY = 1.0 / xy[0];
-    real reciprocalSquareSumX = 1.0 / xx[0];
-    real reciprocalSquareSumY = 1.0 / yy[0];
-    for (int index = tid; index < width; index += blockSize) {
-      prevGradX[index] += output[ty] * grad[ty] *
-        (prevOutY[index] * reciprocalXY -
-         prevOutX[index] * reciprocalSquareSumX);
-      if (input2_height > 1) {
-        prevGradY[index] += output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY);
-      } else {
-        paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
-          (prevOutX[index] * reciprocalXY -
-           prevOutY[index] * reciprocalSquareSumY));
-      }
-    }
-  }
-}
-
-
-void hl_cossim_derivative(real* grad,
-                          real* output,
-                          real* prevOutX,
-                          real* prevOutY,
-                          real* prevGradX,
-                          real* prevGradY,
-                          int width,
-                          int input1_height,
-                          int input2_height,
-                          real scale) {
-  CHECK_NOTNULL(grad);
-  CHECK_NOTNULL(output);
-  CHECK_NOTNULL(prevOutX);
-  CHECK_NOTNULL(prevOutY);
-  CHECK_NOTNULL(prevGradX);
-  CHECK_NOTNULL(prevGradY);
-  const int blockSize = 256;
-  dim3 threads(blockSize, 1);
-  dim3 grid(1, input1_height);
-  KeCosSimDerivative<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
-    (grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width,
-        input1_height, input2_height, scale);
-  CHECK_SYNC("hl_cossim_derivate failed");
-}
-
 __global__ void KeMatrixAddSharedBias(real* A,
                                       real* B,
                                       const int channel,
@@ -840,3 +616,28 @@ void hl_matrix_collect_shared_bias(real* B_d,
       (B_d, A_d, channel, dimM, dimN, dim, limit, scale);
   CHECK_SYNC("hl_matrix_collect_shared_bias failed");
 }
+
+__global__ void keMatrixRotate(real* mat, real* matRot,  // mat: in, matRot: out
+                               int dimM, int dimN, bool clockWise) {
+    int idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat element index
+    if (idx < dimM * dimN) {  // threads past the last element do nothing
+        int i = idx / dimN;  // i: column in matRot, range [0, dimM)
+        int j = idx % dimN;  // j: row in matRot, range [0, dimN)
+        if (clockWise) {  // matRot uses row stride dimM, mat uses dimN
+            matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];  // flip rows
+        } else {
+            matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];  // flip cols
+        }
+    }
+}
+
+void hl_matrix_rotate(real *mat, real* matRot,  // host-side kernel launcher
+                      int dimM, int dimN, bool clockWise) {
+    CHECK_NOTNULL(mat);  // input buffer must be non-null
+    CHECK_NOTNULL(matRot);  // output buffer must be non-null
+    const int threads = 512;  // threads per block
+    const int blocks = DIVUP(dimM * dimN, threads);  // presumably ceil(M*N/512)
+    keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>  // 1-D launch
+            (mat, matRot, dimM, dimN, clockWise);
+    CHECK_SYNC("hl_matrix_rotate failed");
+}
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 4e33ac443c1f78b7fa50a15784875cbadfcf7497..ba823de2720336851bf9c49d8162360af93e8601 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
   CHECK_SYNC("hl_max_sequence_backward failed");
 }
 
-template <bool padding>
-__global__ void KeContextProjectionForward(real* input,
-                                           const int* sequence,
-                                           real* weightData,
-                                           real* output,
-                                           int inputDim,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  output += seqStart * inputDim * contextLength;
-  input += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        // i + contextStart;
-        if ((i + contextStart) < 0) {
-          if (padding) {
-            value = weightData[i * inputDim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          if (padding) {
-            value =
-              weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
-                         inputDim + idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          output + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_forward(real* input,
-                                   const int* sequence,
-                                   real* weightData,
-                                   real* output,
-                                   int numSequences,
-                                   int inputDim,
-                                   int contextLength,
-                                   int contextStart,
-                                   int beginPad,
-                                   bool isPadding) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK(!isPadding || weightData);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  if (isPadding) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  } else  {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-
-__global__ void KeContextProjectionBackwardData(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  outputGrad += seqStart * inputDim * contextLength;
-  inputGrad += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + contextStart) < 0) {
-          continue;
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          continue;
-        } else {
-          // value = 0;
-          value = inputGrad[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          outputGrad + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-        inputGrad[(i + contextStart) * inputDim + idx] = value;
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_backward_data(real* outputGrad,
-                                         const int* sequence,
-                                         real* inputGrad,
-                                         int numSequences,
-                                         int inputDim,
-                                         int contextLength,
-                                         int contextStart) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(inputGrad);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-
-template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / padOfBlock;
-  int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-
-  sum_s[idy][idx] = 0.0f;
-  if (weightIdx < weightDim) {
-    for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
-      int seqStart = sequence[seqId];
-      int seqEnd = sequence[seqId+1];
-      output_r = outputGrad + seqStart * weightDim * contextLength;
-
-      if (contextStart < 0) {
-        if (padId + contextStart < 0) {
-          instanceId = padId;
-        } else {
-          // beginPad > 0;
-          instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
-        }
-      } else {
-        if (padId + (seqEnd - seqStart) < contextStart) {
-          continue;
-        } else {
-          // beginPad == 0;
-          instanceId = padId + (seqEnd - seqStart) - contextStart;
-        }
-      }
-
-      int outx = (instanceId - contextLength) < 0 ?
-                 instanceId : (contextLength - 1);
-      int outy = (instanceId - contextLength) < 0 ?
-                 0 : (instanceId - (contextLength - 1));
-      output_r += outy * weightDim * contextLength + outx * weightDim;
-      for (int j = outy; j < seqEnd - seqStart; j++) {
-        value += output_r[weightIdx];
-        if (j - outy == outx) break;
-        output_r += (contextLength - 1) * weightDim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-
-  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (weightIdx < weightDim) {
-    if (idy == 0) {
-      weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
-    }
-  }
-}
-
-void hl_context_projection_backward_weight(real* outputGrad,
-                                           const int* sequence,
-                                           real* weightGrad,
-                                           int numSequences,
-                                           int weightDim,
-                                           int totalPad,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(weightGrad);
-
-  int threadsX = 32;
-  int threadsY = 32;
-  int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
-  dim3 threads(threadsX, threadsY);
-  dim3 grid(blocksX, 1);
-
-  KeContextProjectionBackwardWeight<32, 32>
-    <<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, weightGrad, numSequences, weightDim,
-     contextLength, contextStart, beginPad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
-
 template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
deleted file mode 100644
index ecc03a729dde2f2b4f8f004234a47d9272997a50..0000000000000000000000000000000000000000
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ /dev/null
@@ -1,200 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifdef PADDLE_USE_DSO
-
-#include <cuda_runtime.h>
-#include <mutex>
-#include "hl_dso_loader.h"
-
-/**
- * cudart wrapper: for dynamic load libcudart.so.
- * When nvcc compile cuda kernels, it will insert
- * some build-in runtime routines, which must be
- * provided by us if PADDLE_USE_DSO is true. If
- * PADDLE_USE_DSO is false, all of them must be
- * ignored to avoid multiple definitions.
- */
-namespace dynload {
-
-extern std::once_flag cudart_dso_flag;
-extern void *cudart_dso_handle;
-
-/**
- * The following macro definition can generate structs
- * (for each function) to dynamic load cuda routine
- * via operator overloading.
- **/
-#define DYNAMIC_LOAD_CUDART_WRAP(__name, __type)                               \
-  struct DynLoad__##__name {                                                   \
-    template <typename... Args>                                                \
-    __type operator()(Args... args) {                                          \
-      typedef __type (*cudartFunc)(Args...);                                   \
-      std::call_once(cudart_dso_flag, GetCudartDsoHandle, &cudart_dso_handle); \
-      void *p_##__name = dlsym(cudart_dso_handle, #__name);                    \
-      return reinterpret_cast<cudartFunc>(p_##__name)(args...);                \
-    }                                                                          \
-  } __name; /* struct DynLoad__##__name */
-
-/* include all needed cuda functions in HPPL */
-// clang-format off
-#define CUDA_ROUTINE_EACH(__macro)          \
-  __macro(cudaLaunch, cudaError_t)          \
-  __macro(cudaSetupArgument, cudaError_t)   \
-  __macro(cudaConfigureCall, cudaError_t)   \
-  __macro(__cudaRegisterFatBinary, void**)  \
-  __macro(__cudaUnregisterFatBinary, void)  \
-  __macro(__cudaRegisterFunction, void)     \
-  __macro(__cudaRegisterVar, void)          \
-  __macro(__cudaRegisterManagedVar, void)   \
-  __macro(__cudaInitModule, char)           \
-  __macro(__cudaRegisterTexture, void)      \
-  __macro(__cudaRegisterSurface, void)
-// clang-format on
-
-CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
-
-#if CUDART_VERSION >= 7000
-DYNAMIC_LOAD_CUDART_WRAP(cudaLaunchKernel, cudaError_t)
-#endif
-
-#undef CUDA_ROUNTINE_EACH
-
-} /* namespace dynload */
-
-#if CUDART_VERSION >= 7000
-__host__ cudaError_t CUDARTAPI cudaLaunchKernel(const void *func,
-                                                dim3 gridDim,
-                                                dim3 blockDim,
-                                                void **args,
-                                                size_t sharedMem,
-                                                cudaStream_t stream) {
-  return dynload::cudaLaunchKernel(
-      func, gridDim, blockDim, args, sharedMem, stream);
-}
-#endif /* CUDART_VERSION >= 7000 */
-
-__host__ cudaError_t CUDARTAPI cudaLaunch(const void *func) {
-  return dynload::cudaLaunch(func);
-}
-
-__host__ cudaError_t CUDARTAPI cudaSetupArgument(const void *arg,
-                                                 size_t size,
-                                                 size_t offset) {
-  return dynload::cudaSetupArgument(arg, size, offset);
-}
-
-__host__ cudaError_t CUDARTAPI cudaConfigureCall(dim3 gridDim,
-                                                 dim3 blockDim,
-                                                 size_t sharedMem,
-                                                 cudaStream_t stream) {
-  return dynload::cudaConfigureCall(gridDim, blockDim, sharedMem, stream);
-}
-
-extern "C" {
-
-void **CUDARTAPI __cudaRegisterFatBinary(void *fatCubin) {
-  return dynload::__cudaRegisterFatBinary(fatCubin);
-}
-
-void CUDARTAPI __cudaUnregisterFatBinary(void **fatCubinHandle) {
-  return dynload::__cudaUnregisterFatBinary(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterFunction(void **fatCubinHandle,
-                                      const char *hostFun,
-                                      char *deviceFun,
-                                      const char *deviceName,
-                                      int thread_limit,
-                                      uint3 *tid,
-                                      uint3 *bid,
-                                      dim3 *bDim,
-                                      dim3 *gDim,
-                                      int *wSize) {
-  return dynload::__cudaRegisterFunction(fatCubinHandle,
-                                         hostFun,
-                                         deviceFun,
-                                         deviceName,
-                                         thread_limit,
-                                         tid,
-                                         bid,
-                                         bDim,
-                                         gDim,
-                                         wSize);
-}
-
-void CUDARTAPI __cudaRegisterVar(void **fatCubinHandle,
-                                 char *hostVar,
-                                 char *deviceAddress,
-                                 const char *deviceName,
-                                 int ext,
-                                 int size,
-                                 int constant,
-                                 int global) {
-  return dynload::__cudaRegisterVar(fatCubinHandle,
-                                    hostVar,
-                                    deviceAddress,
-                                    deviceName,
-                                    ext,
-                                    size,
-                                    constant,
-                                    global);
-}
-
-extern void CUDARTAPI __cudaRegisterManagedVar(void **fatCubinHandle,
-                                               void **hostVarPtrAddress,
-                                               char *deviceAddress,
-                                               const char *deviceName,
-                                               int ext,
-                                               int size,
-                                               int constant,
-                                               int global) {
-  return dynload::__cudaRegisterManagedVar(fatCubinHandle,
-                                           hostVarPtrAddress,
-                                           deviceAddress,
-                                           deviceName,
-                                           ext,
-                                           size,
-                                           constant,
-                                           global);
-}
-
-char CUDARTAPI __cudaInitModule(void **fatCubinHandle) {
-  return dynload::__cudaInitModule(fatCubinHandle);
-}
-
-void CUDARTAPI __cudaRegisterTexture(void **fatCubinHandle,
-                                     const struct textureReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int norm,
-                                     int ext) {
-  return dynload::__cudaRegisterTexture(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, norm, ext);
-}
-
-void CUDARTAPI __cudaRegisterSurface(void **fatCubinHandle,
-                                     const struct surfaceReference *hostVar,
-                                     const void **deviceAddress,
-                                     const char *deviceName,
-                                     int dim,
-                                     int ext) {
-  return dynload::__cudaRegisterSurface(
-      fatCubinHandle, hostVar, deviceAddress, deviceName, dim, ext);
-}
-
-} /* extern "C" */
-
-#endif
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index c92909de534a875028d6d4784b02f08648c85a9a..53164dd27c7c5f5254e743b6fcf1d7b6fc895e31 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -25,10 +25,8 @@ DEFINE_string(cudnn_dir,
 DEFINE_string(cuda_dir,
               "",
               "Specify path for loading cuda library, such as libcublas, "
-              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
-              "libcudart can not be specified by cuda_dir, since some "
-              "build-in function in cudart already ran before main entry). "
-              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+              "libcurand. For instance, /usr/local/cuda/lib64. If default, "
+              "dlopen will search cuda from LD_LIBRARY_PATH");
 
 DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
@@ -147,14 +145,6 @@ void GetCudnnDsoHandle(void** dso_handle) {
 #endif
 }
 
-void GetCudartDsoHandle(void** dso_handle) {
-#if defined(__APPLE__) || defined(__OSX__)
-  GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
-#else
-  GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
-#endif
-}
-
 void GetCurandDsoHandle(void** dso_handle) {
 #if defined(__APPLE__) || defined(__OSX__)
   GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu
index f0ef0cc3c51f9e7935dc3c40f630e4d70960802a..4f0bbfcf4e3aa51dd06acf254af65c62098a1df7 100644
--- a/paddle/cuda/src/hl_top_k.cu
+++ b/paddle/cuda/src/hl_top_k.cu
@@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
   CHECK_SYNC("hl_sparse_matrix_top_k failed");
 }
 
+/**
+ * Top-k classification error: each block computes one sample (one src row).
+ * In a block:
+ * 1. every thread gets its own top maxLength values;
+ * 2. merge into shTopK, block-reduce and take the max value;
+ * 3. go to the second step, until one thread's topK values are used up;
+ * 4. go to the first step, until beamSize top values have been produced.
+ * Thread 0 then sets recResult to 0 if label is among the top-k ids, else 1.
+ */
+template<int maxLength, int blockSize>
+__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
+                                                int * topIds,
+                                                real* src, int lds,
+                                                int dim,
+                                                int beamSize,
+                                                int* label,
+                                                real* recResult) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  src += blockIdx.x * lds;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength]; // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+  int topkSize = beamSize;
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>
+      (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>
+      (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+
+  __syncthreads();
+  if (tid == 0) {
+    // Assumes blockReduce left topIds one past the last id (TODO confirm).
+    bool hit = false;
+    for (int i = 0; i < topkSize && !hit; i++) {
+      hit = (*--topIds == label[blockIdx.x]);
+    }
+    recResult[blockIdx.x] = hit ? 0.0f : 1.0f;
+  }
+}
+
+// Launches one 256-thread block per sample. topkSize is clamped to dim; the
+// kernel fills topVal/topIds and writes one result per sample to recResult.
+void hl_matrix_classification_error(real* topVal, int ldv,
+                                   int* topIds,
+                                   real* src, int lds,
+                                   int dim, int topkSize, int numSamples,
+                                   int* label,
+                                   real* recResult) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+  CHECK_NOTNULL(label);      // dereferenced by the kernel
+  CHECK_NOTNULL(recResult);  // written by the kernel
+
+  if (topkSize > dim) topkSize = dim;
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeMatrixTopKClassificationError<5, 256>
+  <<< grid, threads, 0, STREAM_DEFAULT >>>
+  (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
+  CHECK_SYNC("hl_matrix_top_k classification error failed");
+}
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 9ae8bc0f220e143a5c59d8c3ead012a20369e7b9..f57efb2b46797c303d99a5468ad96163a3e74972 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -29,7 +29,6 @@ void* warpctc_dso_handle = nullptr;
  * false, you need to add the path of libwarp-ctc.so to
  * the linked-libs of paddle or to LD_PRELOAD.
  */
-#ifdef PADDLE_USE_DSO
 #define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
   struct DynLoad__##__name {                                           \
     template <typename... Args>                                        \
@@ -41,15 +40,6 @@ void* warpctc_dso_handle = nullptr;
       return reinterpret_cast<warpctcFunc>(p_##_name)(args...);        \
     }                                                                  \
   } __name;  // struct DynLoad__##__name
-#else
-#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                        \
-  struct DynLoad__##__name {                                     \
-    template <typename... Args>                                  \
-    auto operator()(Args... args) -> decltype(__name(args...)) { \
-      return __name(args...);                                    \
-    }                                                            \
-  } __name;  // struct DynLoad__##__name
-#endif
 
 // include all needed warp-ctc functions
 DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
@@ -64,22 +54,26 @@ DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
 #define WARPCTC_GET_VERSION dynload::get_warpctc_version
 #define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
 
+static int g_warpctcVersion = -1;
 #ifndef PADDLE_TYPE_DOUBLE
 #define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
 #define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
 #else
-#define WARPCTC_LOG_FATAL                                \
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
-             << "] Error: not support double precision."
-#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
-#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+// Stand-in for compute_ctc_loss()/get_workspace_size() in double-precision
+// builds: aborts via LOG(FATAL); the status return only satisfies callers.
+static hl_warpctc_status_t fatal(...) {
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
+             << "] Error: not support double precision.";
+  return CTC_STATUS_EXECUTION_FAILED;
+}
+#define WARPCTC_COMPUTE_LOSS fatal
+#define WARPCTC_GET_WORKSPACE_SIZE fatal
 #endif
 
 /**
  * Check build-in warp-ctc function using glog and it also
  * support << operator for more details error info.
  */
-static int g_warpctcVersion = -1;
 #define CHECK_WARPCTC(warpctcStat)                \
   CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
       << "warp-ctc [version " << g_warpctcVersion \
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2b70036e3ff7de9e8786bade03e220a4916db4c2
--- /dev/null
+++ b/paddle/function/BufferArg.cpp
@@ -0,0 +1,52 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <glog/logging.h>
+
+#include "BufferArg.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+const SequenceArg& BufferArg::sequence() const {
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);  // must hold sequence data
+  return dynamic_cast<const SequenceArg&>(*this);
+}
+
+const SparseMatrixArg& BufferArg::sparse() const {
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);  // must be a sparse-matrix argument
+  return dynamic_cast<const SparseMatrixArg&>(*this);
+}
+
+SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;  // the base constructor set TENSOR_NORMAL
+}
+
+SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
+    : BufferArg(sparse, argType),
+      row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32),
+      nnz_(sparse.getElementCnt()),
+      format_(static_cast<SparseDataFormat>(sparse.getFormat())),
+      type_(static_cast<SparseDataType>(sparse.getValueType())) {
+  bufferType_ = TENSOR_SPARSE;  // the base constructor set TENSOR_NORMAL
+}
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
new file mode 100644
index 0000000000000000000000000000000000000000..0dc7792f646457c22ee4791f18814afaa3809f7b
--- /dev/null
+++ b/paddle/function/BufferArg.h
@@ -0,0 +1,364 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+#include "TensorShape.h"
+#include "TensorType.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum BufferType {
+  TENSOR_UNKNOWN = 0,
+  TENSOR_NORMAL = 1,         // dense buffer (plain BufferArg)
+  TENSOR_SEQUENCE_ID = 2,    // sequence start positions (SequenceIdArg)
+  TENSOR_SEQUENCE_DATA = 3,  // sequence data (SequenceArg)
+  TENSOR_SPARSE = 4          // sparse matrix (SparseMatrixArg)
+};
+
+class BufferArg;
+class SequenceArg;
+class SparseMatrixArg;
+
+/**
+ * \brief BufferArg used as the argument type of Function.
+ *
+ * The arguments of the Paddle Function have four Buffer types.
+ * 1. BufferArg for a dense Buffer of any dimension.
+ * 2. SequenceIdArg for a Buffer of sequence start positions.
+ * 3. SequenceArg for a Buffer of sequence data.
+ * 4. SparseMatrixArg for a Buffer of sparse matrix.
+ *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
+ * There is an ArgType property for the BufferArg used as Function Output.
+ * Whether the result of the Function calculation is assigned to the
+ * output Buffer or added to the output Buffer is determined by the
+ * argType_ property of the output BufferArg.
+ */
+
+// ArgType is only used by output BufferArg.
+// For input argument, argType_ is ignored.
+// For output argument, need to set the argType_ of the BufferArg.
+enum ArgType {
+  UNSPECIFIED = 0,
+  ASSIGN_TO = 1,  // the result is assigned to the output buffer
+  ADD_TO = 2,     // the result is added to the output buffer
+};
+class BufferArg {
+public:
+  void setArgType(ArgType argType) { argType_ = argType; }
+
+  ArgType getArgType() const { return argType_; }
+
+public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;  // no buffer attached: buf_ is nullptr
+  }
+  // Wraps an existing buffer `buf` with an explicit value type and shape.
+  BufferArg(void* buf,
+            ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(buf), valueType_(valueType), shape_(shape), argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+  // Wraps `buf` with a value type only; no shape is supplied.
+  BufferArg(void* buf, ValueType valueType) : buf_(buf), valueType_(valueType) {
+    bufferType_ = TENSOR_NORMAL;
+  }
+  // Wraps a Matrix's data as a 2-D (height, width) buffer of `real`.
+  BufferArg(const Matrix& matrix, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(2),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, matrix.getHeight());
+    shape_.setDim(1, matrix.getWidth());
+  }
+  // Wraps a Matrix's data with a caller-given shape of equal element count.
+  BufferArg(const Matrix& matrix,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(matrix.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(shape),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    CHECK_EQ(matrix.getElementCnt(), shape.getElements());
+  }
+  // Wraps a Vector's data as a 1-D buffer of `real`.
+  BufferArg(const Vector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(DataType<real>::value),
+        shape_(1),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, vector.getSize());
+  }
+  // Wraps an IVector's data as a 1-D int32 buffer.
+  BufferArg(const IVector& vector, ArgType argType = UNSPECIFIED)
+      : buf_(
+            const_cast<void*>(reinterpret_cast<const void*>(vector.getData()))),
+        valueType_(VALUE_TYPE_INT32),
+        shape_(1),
+        argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
+    shape_.setDim(0, vector.getSize());
+  }
+  // Views the buffer as a 2-D Tensor matrix; checks buf_, value type and rank.
+  template <DeviceType DType>
+  typename Tensor<real, DType>::Matrix matrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)2, shape_.ndims());
+    return typename Tensor<real, DType>::Matrix(
+        reinterpret_cast<real*>(buf_), shape_[0], shape_[1]);
+  }
+  // Views the buffer as a 1-D Tensor vector; checks buf_, value type and rank.
+  template <typename VType, DeviceType DType>
+  typename Tensor<VType, DType>::Vector vector() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<VType>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ((size_t)1, shape_.ndims());
+    return typename Tensor<VType, DType>::Vector(
+        shape_[0], reinterpret_cast<VType*>(buf_));
+  }
+
+  virtual ~BufferArg() {}
+
+  template <typename T>
+  T* data() const {
+    return reinterpret_cast<T*>(buf_);
+  }
+
+  void* data() const { return buf_; }
+  ValueType valueType() const { return valueType_; }
+  BufferType bufferType() const { return bufferType_; }
+  const TensorShape& shape() const { return shape_; }
+  bool isSparseArg() const { return TENSOR_SPARSE == bufferType_; }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
+  virtual size_t numElements() const { return shape_.getElements(); }
+  // Checked downcasts; fatal unless bufferType_ matches the requested kind.
+  const SequenceArg& sequence() const;
+  const SparseMatrixArg& sparse() const;
+
+protected:
+  void* buf_;
+  ValueType valueType_;
+  TensorShape shape_;
+  BufferType bufferType_{TENSOR_UNKNOWN};
+  ArgType argType_{UNSPECIFIED};
+  // TODO(tianbing), add deviceType_
+  // leading dimensions. The size is dims_.size()
+  // Dims lds_;
+};
+
+// sequence start positions in a mini-batch of sequences
+// shape_.ndims() == 1 and shape_[0] >= 1 (both CHECKed on construction)
+// valueType_ = int32
+// if a < b then buf_[a] < buf_[b]; numSeqs() == shape_[0] - 1
+class SequenceIdArg : public BufferArg {
+public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    initSequenceId();
+  }
+
+  SequenceIdArg(void* buf,
+                const TensorShape& shape,
+                ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    initSequenceId();
+  }
+
+  SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    initSequenceId();
+  }
+
+  ~SequenceIdArg() {}
+
+  size_t numSeqs() const { return numSeqs_; }
+
+private:
+  // Shared constructor tail; the CHECKs keep `shape_[0] - 1` from wrapping.
+  void initSequenceId() {
+    bufferType_ = TENSOR_SEQUENCE_ID;
+    CHECK_EQ(shape_.ndims(), 1UL);
+    CHECK_GE(shape_[0], 1UL);
+    numSeqs_ = shape_[0] - 1;
+  }
+  size_t numSeqs_;
+};
+
+// Sequence data.
+// In mini-batch computation, one batch can contain more than one sequence.
+// SequenceArg represents a batch of sequences whose lengths may differ;
+// the start positions of the sequences are held in a SequenceIdArg
+// (startPositions_).
+class SequenceArg : public BufferArg {
+public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        startPositions_(TensorShape({shape[0]})) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+  // Wraps existing sequence data `buf` together with its start positions.
+  SequenceArg(void* buf,
+              ValueType valueType,
+              const TensorShape& shape,
+              const SequenceIdArg& startPositions,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+  // Wraps a Matrix of sequence data plus an IVector of start positions.
+  SequenceArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
+
+  ~SequenceArg() {}
+
+  void* getIdBuf() const { return startPositions_.data(); }
+  size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
+
+private:
+  SequenceIdArg startPositions_;
+};
+
+// sparse matrix argument
+// valueType_ == float or double
+// shape_.ndims() == 2
+class SparseMatrixArg : public BufferArg {
+public:
+  SparseMatrixArg(void* buf,
+                  ValueType valueType,
+                  const TensorShape& shape,
+                  const BufferArg& row,
+                  const BufferArg& col,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(buf, valueType, shape, argType),
+        row_(row),
+        col_(col),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+    CHECK_EQ(row_.shape().ndims(), 1UL);
+    CHECK_EQ(col_.shape().ndims(), 1UL);
+    if (format_ == T_SPARSE_CSR) {  // CSR: one column index per non-zero
+      CHECK_EQ(nnz, col.shape()[0]);
+    } else if (format_ == T_SPARSE_CSC) {  // CSC: one row index per non-zero
+      CHECK_EQ(nnz, row.shape()[0]);
+    }
+  }
+  // Shape-only sparse argument: no value buffer; row_/col_ carry shapes only.
+  SparseMatrixArg(ValueType valueType,
+                  const TensorShape& shape,
+                  size_t nnz,
+                  SparseFormat format,
+                  SparseValueType type,
+                  ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType),
+        row_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        col_(BufferArg(nullptr, VALUE_TYPE_INT32)),
+        nnz_(nnz),
+        format_(static_cast<SparseDataFormat>(format)),
+        type_(static_cast<SparseDataType>(type)) {
+    bufferType_ = TENSOR_SPARSE;
+    CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
+    CHECK_EQ(shape_.ndims(), 2UL);
+
+    /// len of row_ : height + 1 (CSR) or nnz (CSC), buf_ == nullptr
+    row_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[0] + 1})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{nnz}));
+    /// len of col_ :  width + 1 (CSC) or nnz (CSR), buf_ == nullptr
+    col_ = (format_ == T_SPARSE_CSR
+                ? BufferArg(VALUE_TYPE_INT32, TensorShape{nnz})
+                : BufferArg(VALUE_TYPE_INT32, TensorShape{shape_[1] + 1}));
+  }
+  // Defined in BufferArg.cpp: wrap an existing CPU / GPU sparse matrix.
+  SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+
+  SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType = UNSPECIFIED);
+  // Views this argument as a Tensor sparse matrix; buf_ must be non-null.
+  template <DeviceType DType>
+  typename Tensor<real, DType>::SparseMatrix SparseMatrix() const {
+    CHECK(buf_);
+    CHECK(valueType_ == DataType<real>::value);
+    // CHECK(deviceType_ == DType);
+    CHECK_EQ(2UL, shape_.ndims());
+    return typename Tensor<real, DType>::SparseMatrix(
+        reinterpret_cast<real*>(buf_),
+        reinterpret_cast<int*>(row_.data()),
+        reinterpret_cast<int*>(col_.data()),
+        shape_[0],
+        shape_[1],
+        nnz_,
+        static_cast<SparseValueType>(type_),
+        static_cast<SparseFormat>(format_),
+        false);
+  }
+
+  ~SparseMatrixArg() {}
+
+  void* getRowBuf() const { return row_.data(); }
+
+  void* getColBuf() const { return col_.data(); }
+
+  size_t nnz() const { return nnz_; }
+
+  size_t numElements() const override { return nnz_; }
+
+  SparseDataFormat dataFormat() const { return format_; }
+
+  SparseDataType dataType() const { return type_; }
+
+private:
+  BufferArg row_;
+  BufferArg col_;
+  size_t nnz_;
+  SparseDataFormat format_;
+  SparseDataType type_;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1744f377808f137dcda4a28acce336dc22be3d01
--- /dev/null
+++ b/paddle/function/BufferArgTest.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "BufferArg.h"
+#include <gtest/gtest.h>
+#include "paddle/math/MemoryHandle.h"
+
+namespace paddle {
+
+TEST(BufferTest, BufferArg) {
+  const TensorShape dims({8, 10});
+  const size_t bytes = dims.getElements() * sizeOfValuType(VALUE_TYPE_FLOAT);
+  CpuMemoryHandle storage(bytes);
+  BufferArg arg(storage.getBuf(), VALUE_TYPE_FLOAT, dims);
+  EXPECT_EQ(arg.data(), storage.getBuf());  // same pointer comes back
+}
+
+TEST(BufferTest, SequenceIdArg) {
+  TensorShape shape({10});  // 10 start positions delimit 9 sequences
+  CpuMemoryHandle memory(shape.getElements() *
+                         sizeOfValuType(VALUE_TYPE_INT32));
+  SequenceIdArg buffer(memory.getBuf(), shape);
+  EXPECT_EQ(buffer.data(), memory.getBuf());
+  EXPECT_EQ(buffer.numSeqs(), 9U);  // numSeqs() is size_t: compare unsigned
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 0697842bbef620b0b536b742d06db23e00a78eec..1522510e8bb9816cb468fcf406e22560163950cc 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -1,23 +1,34 @@
-file(GLOB h_files . *_op.h)
-file(GLOB cpp_files . *_op.cpp)
+file(GLOB h_files . *Op.h)
+file(GLOB cpp_files . *Op.cpp)
 
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
+list(APPEND cpp_files BufferArg.cpp)  # not matched by the *Op.cpp glob
 
 if(WITH_GPU)
-    file(GLOB cu_files . *_op_gpu.cu)
+    file(GLOB cu_files . *OpGpu.cu)
     cuda_compile(cu_objs ${cu_files})
 endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
+add_dependencies(paddle_function ${external_project_dependencies})
 
-add_library(paddle_test_main STATIC TestMain.cpp)
 
 if(WITH_GPU)
+if(WITH_TESTING)  # nested in WITH_GPU: CPU-only builds skip these tests
     # TODO:
-    # file(GLOB test_files . *_op_test.cpp)
+    # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(cross_map_normal_op_test)
+    add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(TensorShapeTest)
+    add_simple_unittest(TensorTypeTest)
+    add_simple_unittest(BufferArgTest)
+    add_simple_unittest(FunctionTest)
+    add_simple_unittest(ContextProjectionOpTest)
+    add_simple_unittest(PadOpTest)
+    add_simple_unittest(MulOpTest)
+    add_simple_unittest(CosSimOpTest)
+endif()  # WITH_TESTING
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b87750b74247bd0eb822340bc5a85d41b86ecec2
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -0,0 +1,412 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ContextProjectionOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ * Rows outside a sequence are taken from weight_mat when it is non-empty.
+ */
+template <>
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                               const CpuMatrix& input_mat,
+                                               const CpuMatrix& weight_mat,
+                                               const CpuIVector& seq_vec,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  const int* starts = seq_vec.getData();  // sequence start positions
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {  // window starts before the sequence
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat.subMatrix(starts[i], pad_size);
+        if (weight_mat) {  // skipped when there is no padding weight
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat).subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {  // window runs past the sequence end
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (weight_mat) {  // skipped when there is no padding weight
+          MatrixPtr sub =
+              const_cast<CpuMatrix&>(weight_mat)
+                  .subMatrix(begin_pad + context_start + j - pad_size,
+                             pad_size);
+          mat->addAtOffset(*sub, j * input_mat.getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;  // no in-sequence rows left for slot j
+      MatrixPtr src =
+          const_cast<CpuMatrix&>(input_mat).subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * input_mat.getWidth());
+    }
+  }
+}
+
+/**
+ * Paddle Function for Context Projection Forward.
+ * Calculate the output layer value sequence after context projection.
+ *
+ * What is Context Projection for a sequence?
+ * For example, assume input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_length is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(1UL == inputs.size() || 2UL == inputs.size());  // weight is optional
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(val_seqs.shape().ndims(), 2UL);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
+    /// input and output have the same batch_size
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
+    if (2UL == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+      /// dim of input == dim of weight
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
+    }
+
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);  // forward adds into the output
+    auto out_mat = out_seq.matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
+    const auto w_mat =
+        (2UL == inputs.size() && inputs[1].data())
+            ? inputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
+
+    ContextProjectionForward<Device>(out_mat,
+                                     in_mat,
+                                     w_mat,
+                                     seq_vec,
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+};
+
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ * Either grad matrix may be empty (it is then skipped); total_pad is unused.
+ */
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
+                                                CpuMatrix& in_grad_mat,
+                                                CpuMatrix& w_grad_mat,
+                                                const CpuIVector& seq_vec,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  size_t input_dim = in_grad_mat ? in_grad_mat.getWidth()
+                                 : w_grad_mat ? w_grad_mat.getWidth() : 0;
+  const int* starts = seq_vec.getData();  // sequence start positions
+  size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {  // window starts before the sequence
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {  // weight grad of the leading pad rows
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {  // window runs past the sequence end
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {  // weight grad of the trailing pad rows
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat.subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;  // no in-sequence rows left for slot j
+      if (!in_grad_mat) continue;  // input grad was not requested
+      MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
+      src->addAtOffset(*dst, j * input_dim);
+    }
+  }
+}
+
+/**
+ * Context Projection Backward Function.
+ * Update the weight gradient and input layer gradient with backprop
+ *
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK(1UL == outputs.size() || 2UL == outputs.size());  // w grad optional
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), 1UL);
+
+    /// input and output grad have the same batch_size
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);  // input grad is accumulated
+
+    if (2UL == outputs.size()) {
+      CHECK_EQ(outputs[1].shape().ndims(), 2UL);
+      /// dim of input grad == dim of weight
+      CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
+      CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    }
+
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto in_grad_mat =
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                        : out_seq.matrix<Device>();
+    auto w_grad_mat =
+        (2UL == outputs.size() && outputs[1].data())
+            ? outputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+
+    ContextProjectionBackward<Device>(out_grad_mat,
+                                      in_grad_mat,
+                                      w_grad_mat,
+                                      seq_vec,
+                                      context_length_,
+                                      context_start_,
+                                      begin_pad_,
+                                      is_padding_,
+                                      total_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+  size_t total_pad_;
+};
+
+/**
+ * Context Projection Backward Data Function
+ * Update input layer grad from the sequence of output layer grad.
+ * NOTE(review): calc() requires ASSIGN_TO, yet KeContextProjectionBackwardData
+ * reads in_grad and adds to it -- confirm which semantics is intended.
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start position sequence, n * 1
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    /// output layer grad dim == input layer grad dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    /// input and output have the same batch_size
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    auto in_grad_mat = out_seq.matrix<Device>();
+
+    ContextProjectionBackwardData<Device>(
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+};
+
+/**
+ * Context Projection Backward Weight Function
+ * Update weight grad by backprop from the sequence of output layer grad.
+ * The weight grad must hold at least total_pad rows; its row count (pad) is
+ * independent of the batch size n, so the two are not compared.
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vector             start position sequence, n * 1
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.shape().ndims(), 2UL);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), 1UL);
+    CHECK_GE(outputs[0].shape()[0], total_pad_);
+    /// output layer grad dim == weight dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
+                                            seq_vec,
+                                            context_length_,
+                                            context_start_,
+                                            total_pad_,
+                                            begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    CPU,
+                    ContextProjectionBackwardFunc);
+#ifndef PADDLE_ONLY_CPU  // BackwardData/BackwardWeight are GPU-only here
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    GPU,
+                    ContextProjectionBackwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
+#endif  // PADDLE_ONLY_CPU
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..6f7d936379a5378e6fd85dd86618d1b6094bd14f
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.h
@@ -0,0 +1,86 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[in/out]  output            output data (results are added to it).
+ * \param[in]      input             input data.
+ * \param[in]      weight            padding weight; may be empty.
+ * \param[in]      sequence          start positions of the sequences.
+ * \param[in]      context_length    consecutive rows for concatenation.
+ * \param[in]      context_start     context start position.
+ * \param[in]      begin_pad         beginning pad position.
+ *
+ * \note  When weight is empty, the padded positions are skipped.
+ */
+template <DeviceType DType>
+void ContextProjectionForward(
+    typename Tensor<real, DType>::Matrix& output,
+    const typename Tensor<real, DType>::Matrix& input,
+    const typename Tensor<real, DType>::Matrix& weight,
+    const typename Tensor<int, DType>::Vector& sequence,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad);
+
+/**
+ * \brief   Context Projection Backward.
+ *
+ * \param[in]      out_grad          output layer gradient.
+ * \param[in/out]  in_grad           input layer gradient; may be empty.
+ * \param[in/out]  w_grad            padding weight gradient; may be empty.
+ * \param[in]      seq_vec           start positions of the sequences.
+ * \param[in]      context_length    consecutive rows for concatenation.
+ * \param[in]      context_start     context start position.
+ * \param[in]      begin_pad         beginning pad position.
+ * \param[in]      is_padding        if false, the CPU version skips w_grad.
+ * \param[in]      total_pad         number of extra (padding) timesteps.
+ */
+template <DeviceType DType>
+void ContextProjectionBackward(
+    const typename Tensor<real, DType>::Matrix& out_grad,
+    typename Tensor<real, DType>::Matrix& in_grad,
+    typename Tensor<real, DType>::Matrix& w_grad,
+    const typename Tensor<int, DType>::Vector& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t begin_pad,
+    bool is_padding,
+    size_t total_pad);
+
+template <DeviceType DType>
+void ContextProjectionBackwardData(
+    const typename Tensor<real, DType>::Matrix& out_grad,  // output layer grad
+    typename Tensor<real, DType>::Matrix& in_grad,  // input layer grad
+    const typename Tensor<int, DType>::Vector& sequence,  // sequence starts
+    size_t context_length,
+    int context_start);
+
+template <DeviceType DType>
+void ContextProjectionBackwardWeight(
+    const typename Tensor<real, DType>::Matrix& out_grad,  // output layer grad
+    typename Tensor<real, DType>::Matrix& w_grad,  // padding weight grad
+    const typename Tensor<int, DType>::Vector& seq_vec,  // sequence starts
+    size_t context_length,
+    int context_start,
+    size_t total_pad,  // NOTE: total_pad comes before begin_pad here
+    size_t begin_pad);
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1a5b4042402df3081a493962a5e080d72b7f40b2
--- /dev/null
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -0,0 +1,399 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "ContextProjectionOp.h"
+
+namespace paddle {
+
+template <bool padding>
+__global__ void KeContextProjectionForward(const real* input,
+                                           const int* sequence,
+                                           const real* weight,
+                                           real* output,
+                                           int input_dim,
+                                           int context_length,
+                                           int context_start,
+                                           int begin_pad) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;  // each block handles one sequence
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  output += seq_start * input_dim * context_length;
+  input += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        // i + context_start is the source row relative to seq_start.
+        if ((i + context_start) < 0) {  // before the sequence: begin padding
+          if (padding) {
+            value = weight[i * input_dim + idx];
+          } else {
+            continue;
+          }
+        } else if ((i + context_start) >= (seq_end - seq_start)) {  // end pad
+          if (padding) {
+            value =
+              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                         input_dim + idx];
+          } else {
+            continue;
+          }
+        } else {
+          value = input[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          output + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          output_r[idx] += value;  // accumulate into output
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;  // row+1, slot-1
+        }
+      }
+    }
+    idx += block_size;  // next column handled by this thread
+  }
+}
+
+/**
+ * @brief   Context projection forward.
+ *
+ * @param[in]   input           input sequence.
+ * @param[in]   sequence        sequence index.
+ * @param[in]   weight          padding data; NULL selects the no-padding kernel.
+ * @param[out]  output          output sequence (values are added to it).
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_forward(const real* input,
+                                   const int* sequence,
+                                   const real* weight,
+                                   real* output,
+                                   size_t num_sequences,
+                                   size_t input_dim,
+                                   size_t context_length,
+                                   int context_start,
+                                   size_t begin_pad) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;  // one block per sequence
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+
+  if (weight) {
+    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  } else  {
+    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  }
+  CHECK_SYNC("hl_context_projection_forward failed");
+}
+
+template <>
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
+                                               const GpuMatrix& input,
+                                               const GpuMatrix& weight,
+                                               const GpuIVector& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  hl_context_projection_forward(input.getData(),
+                                sequence.getData(),
+                                weight ? weight.getData() : nullptr,
+                                output.getData(),
+                                sequence.getSize() - 1,  // num_sequences
+                                input.getWidth(),  // input_dim
+                                context_length,
+                                context_start,
+                                begin_pad);
+}
+
+__global__ void KeContextProjectionBackwardData(const real* out_grad,
+                                                const int* sequence,
+                                                real* in_grad,
+                                                size_t input_dim,
+                                                int context_length,
+                                                int context_start) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;  // each block handles one sequence
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+
+  int instances = seq_end - seq_start + context_length - 1;
+  auto out = const_cast<real*>(out_grad);
+  out += seq_start * input_dim * context_length;
+  in_grad += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        if ((i + context_start) < 0) {  // padding row: no input grad
+          continue;
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          continue;
+        } else {
+          // start from the existing in_grad so the result is accumulated
+          value = in_grad[(i + context_start) * input_dim + idx];
+        }
+
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          out + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          value += output_r[idx];
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;  // row+1, slot-1
+        }
+        in_grad[(i + context_start) * input_dim + idx] = value;
+      }
+    }
+    idx += block_size;  // next column handled by this thread
+  }
+}
+
+/**
+ * @brief   Context projection backward data.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  input_grad       input gradient (the kernel adds to it).
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ *
+ */
+void hl_context_projection_backward_data(const real* out_grad,
+                                         const int* sequence,
+                                         real* input_grad,
+                                         size_t num_sequences,
+                                         size_t input_dim,
+                                         size_t context_length,
+                                         int context_start) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(input_grad);
+
+  int block_size = 128;
+  int blocks_x = num_sequences;  // one block per sequence
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  CHECK_SYNC("hl_context_projection_backward_data failed");
+}
+
+template <>
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                    GpuMatrix& in_grad,
+                                                    const GpuIVector& sequence,
+                                                    size_t context_length,
+                                                    int context_start) {
+  hl_context_projection_backward_data(out_grad.getData(),
+                                      sequence.getData(),
+                                      in_grad.getData(),
+                                      sequence.getSize() - 1,  // num_sequences
+                                      in_grad.getWidth(),  // input_dim
+                                      context_length,
+                                      context_start);
+}
+
+template<int THREADS_X, int THREADS_Y>
+__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
+                                                  const int* sequence,
+                                                  real* w_grad,
+                                                  int num_sequences,
+                                                  int w_dim,
+                                                  int context_length,
+                                                  int context_start,
+                                                  int begin_pad) {
+  __shared__ real sum_s[THREADS_Y][THREADS_X];  // per-thread partial sums
+  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
+  const int idx = threadIdx.x;
+  const int idy = threadIdx.y;
+  int padId = blockIdx.x / pad_of_block;  // pad row handled by this block
+  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
+  int instanceId;
+  real value = 0;
+  real* output_r;
+
+  sum_s[idy][idx] = 0.0f;
+  if (weight_idx < w_dim) {
+    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
+      int seq_start = sequence[seqId];
+      int seq_end = sequence[seqId+1];
+      output_r = const_cast<real*>(out_grad)
+                    + seq_start * w_dim * context_length;
+
+      if (context_start < 0) {
+        if (padId + context_start < 0) {
+          instanceId = padId;
+        } else {
+          // begin_pad > 0;
+          instanceId = (padId - begin_pad) +
+            (seq_end - seq_start) - context_start;
+        }
+      } else {
+        if (padId + (seq_end - seq_start) < context_start) {
+          continue;
+        } else {
+          // begin_pad == 0;
+          instanceId = padId + (seq_end - seq_start) - context_start;
+        }
+      }
+
+      int outx = (instanceId - context_length) < 0 ?
+                 instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0 ?
+                 0 : (instanceId - (context_length - 1));
+      output_r += outy * w_dim * context_length + outx * w_dim;
+      for (int j = outy; j < seq_end - seq_start; j++) {
+        value += output_r[weight_idx];
+        if (j - outy == outx) break;
+        output_r += (context_length - 1) * w_dim;  // row+1, slot-1
+      }
+    }
+    sum_s[idy][idx] = value;
+  }
+  __syncthreads();
+
+  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {  // sum idy
+    if (idy < stride) {
+      sum_s[idy][idx] += sum_s[idy + stride][idx];
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+
+  if (weight_idx < w_dim) {
+    if (idy == 0) {
+      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];  // accumulate
+    }
+  }
+}
+
+/**
+ * @brief   Context projection backward weight: launches the CUDA kernel
+ *          that adds the padding-row gradients into w_grad.
+ *
+ * @param[in]      out_grad         output gradient.
+ * @param[in]      sequence         sequence index.
+ * @param[in,out]  w_grad           weight gradient, accumulated into.
+ * @param[in]      num_sequences    number of sequences.
+ * @param[in]      w_dim            input sequence dimension.
+ * @param[in]      total_pad        number of extra timesteps.
+ * @param[in]      context_length   context length.
+ * @param[in]      context_start    context start.
+ * @param[in]      begin_pad        number of extra timesteps added at the
+ * beginning.
+ */
+void hl_context_projection_backward_weight(const real* out_grad,
+                                           const int* sequence,
+                                           real* w_grad,
+                                           size_t num_sequences,
+                                           size_t w_dim,
+                                           size_t total_pad,
+                                           size_t context_length,
+                                           int context_start,
+                                           size_t begin_pad) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(w_grad);
+
+  const int kThreadsX = 32;
+  const int kThreadsY = 32;
+  // One block per (padding row, kThreadsX-wide column tile) pair.
+  const int num_blocks = total_pad * ((w_dim + kThreadsX - 1) / kThreadsX);
+  dim3 grid(num_blocks, 1);
+  dim3 threads(kThreadsX, kThreadsY);
+  KeContextProjectionBackwardWeight<kThreadsX, kThreadsY>
+    <<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, w_grad, num_sequences, w_dim,
+     context_length, context_start, begin_pad);
+  CHECK_SYNC("hl_context_projection_backward_weight failed");
+}
+
+template <>  // GPU specialization: unwraps the matrices, calls the launcher
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+        const GpuMatrix& out_grad,
+        GpuMatrix& w_grad,
+        const GpuIVector& seq_vec,
+        size_t context_length,
+        int context_start,
+        size_t total_pad,
+        size_t begin_pad) {
+  hl_context_projection_backward_weight(out_grad.getData(),
+                                        seq_vec.getData(),
+                                        w_grad.getData(),
+                                        seq_vec.getSize() - 1,  // num_sequences
+                                        w_grad.getWidth(),  // w_dim
+                                        total_pad,
+                                        context_length,
+                                        context_start,
+                                        begin_pad);
+}
+
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                                GpuMatrix& in_grad,
+                                                GpuMatrix& w_grad,
+                                                const GpuIVector& sequence,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  // Backward pass of context projection on the GPU. Each gradient is
+  // optional: a step is skipped when its matrix argument converts to
+  // false.
+  // Step 1: gradient with respect to the input.
+  if (in_grad) {
+    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
+        out_grad, in_grad, sequence,
+        context_length, context_start);
+  }
+
+  // Step 2: gradient with respect to the padding weight, which also
+  // requires is_padding to be set.
+  if (is_padding && w_grad) {
+    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+        out_grad, w_grad, sequence,
+        context_length, context_start,
+        total_pad, begin_pad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0f5d6a848d406d14984a0b6edad8192dab42e88b
--- /dev/null
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -0,0 +1,112 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;  // no pad rows, so no weight argument
+  // Runs "ContextProjectionForward" through FunctionCompare for one config.
+  FunctionCompare test("ContextProjectionForward",
+                       FuncConfig()
+                           .set("context_length", context_length)
+                           .set("context_start", context_start)
+                           .set("begin_pad", std::max(0, -context_start)));
+
+  // prepare input arguments: sequence ids, input values, optional pad weight
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}));
+  if (is_padding) {  // weight
+    test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}));
+  }
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT,
+                  TensorShape{batch_size, input_dim * context_length}),
+      ADD_TO);
+
+  // run Function
+  test.run();
+}
+
+void testMatrixProjectionBackward(int context_start,
+                                  int context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;  // no pad rows, so no weight gradient
+  // Runs "ContextProjectionBackward" through FunctionCompare for one config.
+  FunctionCompare test("ContextProjectionBackward",
+                       FuncConfig()
+                           .set("context_length", context_length)
+                           .set("context_start", context_start)
+                           .set("begin_pad", std::max(0, -context_start))
+                           .set("is_padding", is_padding)
+                           .set("total_pad", pad));
+
+  // prepare arguments: sequence ids and the output gradient as inputs;
+  test.addSequence(SequenceIdArg(TensorShape{batch_size}));
+  test.addInputs(SequenceArg(
+      VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim * context_length}));
+  test.addOutputs(
+      SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batch_size, input_dim}),
+      ADD_TO);
+  if (is_padding) {  // weight
+    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{pad, input_dim}),
+                    ADD_TO);
+  }
+
+  // run Function
+  test.run();
+}
+
+TEST(ContextProjection, Projection) {  // forward + backward over a value grid
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,  // is_padding
+                                        batch_size,
+                                        input_dim);
+            testMatrixProjectionBackward(context_start,
+                                         context_length,
+                                         trainable_padding,  // is_padding
+                                         batch_size,
+                                         input_dim);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/CosSimOp.cpp b/paddle/function/CosSimOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7ece7b2dfedaf460741c97b5a700eb632d85cabc
--- /dev/null
+++ b/paddle/function/CosSimOp.cpp
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CosSimOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+/**
+ * Cosine Similarity for CpuMatrix
+ *
+ * \param out_mat, output value, size: nSamples * 1.
+ * \param in1_mat, input value 1, size: nSamples * dim.
+ * \param in2_mat, input value 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale, factor multiplied into every cosine value.
+ * CHECK-fails when a row of either input has a zero squared norm.
+ */
+template <>
+void CosSimForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
+                                    const CpuMatrix& in1_mat,
+                                    const CpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  size_t num_samples = out_mat.getHeight();
+  size_t dim = in1_mat.getWidth();
+  /// column vector [nSamples, 1]
+  real* out = out_mat.getData();
+  const real* x = in1_mat.getData();
+  const real* y = in2_mat.getData();
+
+  /// in2 might only have one row or full rows
+  CHECK(in2_mat.getHeight() == 1LU || in2_mat.getHeight() == num_samples);
+  size_t inc = (in2_mat.getHeight() == 1LU) ? 0 : dim;  // 0: reuse in2 row 0
+  for (size_t i = 0; i < num_samples; ++i, x += dim, y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += x[j] * x[j];
+      square_sum_y += y[j] * y[j];
+      xy += x[j] * y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);  // guards the division below
+    out[i] = scale * xy / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+  }
+}
+
+/**
+ * Cosine Similarity
+ * for each row i,
+ *   out[i] = scale * cos(input1[i], input2[i])
+ *      = scale * <input1[i], input2[i]>/sqrt(|input1[i]|^2 * |input2[i]|^2)
+ * when input2 only has one row, then for each row i,
+ *   out[i] = scale * cos(input1[i], input2[0])
+ *
+ * \param inputs[0] input matrix 1, size: nSamples * dim.
+ * \param inputs[1] input matrix 2, size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param outputs[0] output matrix, size : nSamples * 1.
+ */
+
+template <DeviceType Device>
+class CosSimForwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 2UL);
+    CHECK_EQ(outputs.size(), 1UL);
+
+    CHECK_EQ(inputs[0].shape().ndims(), 2UL);
+    CHECK_EQ(inputs[1].shape().ndims(), 2UL);
+    CHECK_EQ(outputs[0].shape().ndims(), 2UL);
+
+    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], 1UL);
+    // NOTE(review): the row count of inputs[1] is not validated here.
+    CHECK(outputs[0].data() && inputs[0].data() && inputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);  // only ASSIGN_TO accepted
+    auto out_mat = outputs[0].matrix<Device>();
+    const auto in1_mat = inputs[0].matrix<Device>();
+    const auto in2_mat = inputs[1].matrix<Device>();
+
+    CosSimForward<Device>(out_mat, in1_mat, in2_mat, scale_);
+  }
+
+private:
+  real scale_;  // read from the "scale" config entry in init()
+};
+
+/**
+ * Cosine Similarity Derivative for CpuMatrix
+ *
+ * \param in1_grad  forward input grad 1, size: nSamples * dim.
+ * \param in2_grad  forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * Both gradients are accumulated into (+=), never overwritten.
+ * \param out_grad  backward loss output grad, size : nSamples * 1.
+ * \param out_val   forward output value, size: nSamples * 1.
+ * \param in1_val   forward input value 1, size: nSamples * dim.
+ * \param in2_val   forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * \param scale,    factor applied to the cosine (used when xy == 0).
+ */
+template <>
+void CosSimBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad,
+                                     const CpuMatrix& out_val,
+                                     const CpuMatrix& in1_val,
+                                     const CpuMatrix& in2_val,
+                                     CpuMatrix& in1_grad,
+                                     CpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK_EQ(out_val.useGpu_, false) << "Matrix type are GPU, CPU required";
+
+  const real* grad = out_grad.getData();
+  const real* out = out_val.getData();
+  const real* prev_out_x = in1_val.getData();
+  const real* prev_out_y = in2_val.getData();
+  real* prev_grad_x = in1_grad.getData();
+  real* prev_grad_y = in2_grad.getData();
+
+  size_t num_samples = out_grad.getHeight();
+  size_t dim = in1_val.getWidth();
+  CHECK_EQ(in2_val.getHeight(), in2_grad.getHeight());
+  CHECK(in2_val.getHeight() == 1LU || in2_val.getHeight() == num_samples);
+  size_t inc = (in2_val.getHeight() == 1LU) ? 0 : dim;  // 0: single shared row
+  for (size_t i = 0; i < num_samples; ++i,
+              prev_out_x += dim,
+              prev_out_y += inc,
+              prev_grad_x += dim,
+              prev_grad_y += inc) {
+    real square_sum_x = 0;
+    real square_sum_y = 0;
+    real xy = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      square_sum_x += prev_out_x[j] * prev_out_x[j];
+      square_sum_y += prev_out_y[j] * prev_out_y[j];
+      xy += prev_out_x[j] * prev_out_y[j];
+    }
+    CHECK(square_sum_x > 0 && square_sum_y > 0);
+    if (xy == 0) {  // the 1/xy form in the else branch would divide by zero
+      real reciprocal =
+          1.0f / (std::sqrt(square_sum_x) * std::sqrt(square_sum_y));
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] += scale * grad[i] * prev_out_y[j] * reciprocal;
+        prev_grad_y[j] += scale * grad[i] * prev_out_x[j] * reciprocal;
+      }
+    } else {  // grad_x += out * grad * (y / xy - x / |x|^2), same for y
+      real reciprocal_xy = 1.0f / xy;
+      real reciprocal_square_sum_x = 1.0f / square_sum_x;
+      real reciprocal_square_sum_y = 1.0f / square_sum_y;
+      for (size_t j = 0; j < dim; ++j) {
+        prev_grad_x[j] +=
+            out[i] * grad[i] * (prev_out_y[j] * reciprocal_xy -
+                                prev_out_x[j] * reciprocal_square_sum_x);
+        prev_grad_y[j] +=
+            out[i] * grad[i] * (prev_out_x[j] * reciprocal_xy -
+                                prev_out_y[j] * reciprocal_square_sum_y);
+      }
+    }
+  }
+}
+
+/**
+ * Cosine Similarity backward Derivative
+ * Validates the argument shapes, then calls CosSimBackward<Device>.
+ * \param outputs[0] forward input grad 1, size: nSamples * dim.
+ * \param outputs[1] forward input grad 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ * Both outputs must be ADD_TO.
+ * \param inputs[0] backward loss output grad, size : nSamples * 1.
+ * \param inputs[1] forward output value, size: nSamples * 1.
+ * \param inputs[2] forward input value 1, size: nSamples * dim.
+ * \param inputs[3] forward input value 2,
+ *                  size: n2 * dim (n2 == 1 or n2 == nSamples).
+ */
+template <DeviceType Device>
+class CosSimBackwardFunc : public FunctionBase {
+  void init(const FuncConfig& config) override {
+    scale_ = config.get<real>("scale");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(inputs.size(), 4UL);
+    CHECK_EQ(outputs.size(), 2UL);
+    /// dim of out_grad and out_val == 1, column vector
+    CHECK_EQ(inputs[0].shape()[1], 1UL);
+    CHECK_EQ(inputs[1].shape()[1], 1UL);
+    /// nSamples of out_grad == out_val == in_val1 == in_grad1
+    CHECK_EQ(inputs[1].shape()[0], inputs[0].shape()[0]);
+    CHECK_EQ(inputs[2].shape()[0], inputs[0].shape()[0]);  // in_val1 rows
+    CHECK_EQ(outputs[0].shape()[0], inputs[0].shape()[0]);
+    /// dim of in1_val1 == in_val2 == in_grad1 == in_grad2
+    CHECK_EQ(inputs[3].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[0].shape()[1], inputs[2].shape()[1]);
+    CHECK_EQ(outputs[1].shape()[1], inputs[2].shape()[1]);
+
+    CHECK(inputs[0].data() && inputs[1].data() && inputs[2].data() &&
+          inputs[3].data() && outputs[0].data() && outputs[1].data());
+
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+
+    const auto out_grad = inputs[0].matrix<Device>();
+    const auto out_val = inputs[1].matrix<Device>();
+    const auto in1_val = inputs[2].matrix<Device>();
+    const auto in2_val = inputs[3].matrix<Device>();
+    auto in1_grad = outputs[0].matrix<Device>();
+    auto in2_grad = outputs[1].matrix<Device>();
+
+    CosSimBackward<Device>(
+        out_grad, out_val, in1_val, in2_val, in1_grad, in2_grad, scale_);
+  }
+
+private:
+  real scale_;  // read from the "scale" config entry in init()
+};
+
+REGISTER_TYPED_FUNC(CosSimForward, CPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, CPU, CosSimBackwardFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(CosSimForward, GPU, CosSimForwardFunc);
+REGISTER_TYPED_FUNC(CosSimBackward, GPU, CosSimBackwardFunc);
+#endif  // PADDLE_ONLY_CPU
+}  // namespace paddle
diff --git a/paddle/function/CosSimOp.h b/paddle/function/CosSimOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..be73064e6375bf1e6c6a7ca6de52e9b9b755880b
--- /dev/null
+++ b/paddle/function/CosSimOp.h
@@ -0,0 +1,61 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Cosine Similarity Forward.
+ * for each row i,
+ * out[i] = scale * cos(in1[i], in2[i])
+ *        = scale * \sum_j (in1[i][j] * in2[i][j]) /
+ *                  sqrt(\sum_j (in1[i][j]^2) * \sum_j (in2[i][j]^2))
+ *
+ * \param[out]  output            output value.
+ * \param[in]   input1            input value.
+ * \param[in]   input2            input value.
+ * \param[in]   scale             factor multiplied into the cosine.
+ *
+ */
+template <DeviceType Device>
+void CosSimForward(typename Tensor<real, Device>::Matrix& output,
+                   const typename Tensor<real, Device>::Matrix& input1,
+                   const typename Tensor<real, Device>::Matrix& input2,
+                   real scale);
+
+/**
+ * \brief   Cosine Similarity BackWard for Derivative.
+ *
+ * \param[in]       out_grad      backward loss output grad.
+ * \param[in]       out_value     forward-output value.
+ * \param[in]       in1_value     forward input value 1.
+ * \param[in]       in2_value     forward input value 2.
+ * \param[in,out]   in1_grad      forward input grad 1.
+ * \param[in,out]   in2_grad      forward input grad 2.
+ * \param[in]       scale         factor multiplied into the cosine.
+ *
+ */
+template <DeviceType Device>
+void CosSimBackward(const typename Tensor<real, Device>::Matrix& out_grad,
+                    const typename Tensor<real, Device>::Matrix& out_value,
+                    const typename Tensor<real, Device>::Matrix& in1_value,
+                    const typename Tensor<real, Device>::Matrix& in2_value,
+                    typename Tensor<real, Device>::Matrix& in1_grad,
+                    typename Tensor<real, Device>::Matrix& in2_grad,
+                    real scale);
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c62ab39551f02288618244871ae31c6800df5b42
--- /dev/null
+++ b/paddle/function/CosSimOpGpu.cu
@@ -0,0 +1,240 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "hl_device_functions.cuh"
+#include "CosSimOp.h"
+
+namespace paddle {
+
+template<int block_size>
+__global__ void KeCosSim(real* output,
+                         const real* input1,
+                         const real* input2,
+                         int width,
+                         int input1_height,
+                         int input2_height,
+                         real scale) {
+  const int ty = blockIdx.y;  // row handled by this block
+  int tid = threadIdx.x;
+  // Per-thread partial sums; indexing assumes blockDim.x == block_size.
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+
+  input1 += ty * width;
+  if (input2_height > 1) {  // otherwise every row is paired with input2 row 0
+    input2 += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = input1[index];
+    real y = input2[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+
+  for (int s = block_size / 2; s > 0; s >>= 1) {  // halving reduction into [0]
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (tid == 0) {  // NOTE(review): a zero-norm row divides by zero here
+    output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
+  }
+}
+
+void hlCossim(real* output,  // host-side launcher for KeCosSim
+              const real* input1,
+              const real* input2,
+              size_t width,
+              size_t input1_height,
+              size_t input2_height,
+              real scale) {
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(input1);
+  CHECK_NOTNULL(input2);
+  const int block_size = 256;  // also the kernel's template argument
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);  // one block per row of input1
+
+  KeCosSim<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (output, input1, input2, width, input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossim failed");
+}
+
+template <>  // GPU specialization: validates the matrices, calls hlCossim
+void CosSimForward<DEVICE_TYPE_GPU>(GpuMatrix& out_mat,
+                                    const GpuMatrix& in1_mat,
+                                    const GpuMatrix& in2_mat,
+                                    real scale) {
+  CHECK(out_mat.getData() && in1_mat.getData() && in2_mat.getData());
+  CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
+      << "Matrix type are not GPU";
+  // NOTE(review): out_mat.useGpu_ and the matrix heights are not checked here.
+  size_t dim = in1_mat.getWidth();
+  real* out = out_mat.getData();
+  const real* x = in1_mat.getData();
+  const real* y = in2_mat.getData();
+  hlCossim(out, x, y, dim, in1_mat.getHeight(), in2_mat.getHeight(), scale);
+}
+
+template<int block_size>
+__global__ void KeCosSimDerivative(const real* grad,
+                                   const real* output,
+                                   const real* prev_out_x,
+                                   const real* prev_out_y,
+                                   real* prev_grad_x,
+                                   real* prev_grad_y,
+                                   size_t width,
+                                   size_t input1_height,
+                                   size_t input2_height,
+                                   real scale) {
+  const int ty = blockIdx.y;  // row handled by this block
+  int tid = threadIdx.x;
+  // Per-thread partial sums; indexing assumes blockDim.x == block_size.
+  __shared__ real xx[block_size];
+  __shared__ real yy[block_size];
+  __shared__ real xy[block_size];
+
+  xx[tid] = 0.0;
+  yy[tid] = 0.0;
+  xy[tid] = 0.0;
+  __syncthreads();
+
+  prev_out_x += ty * width;
+  prev_grad_x += ty * width;
+  if (input2_height > 1) {  // otherwise all blocks share row 0 of y / grad_y
+    prev_out_y += ty * width;
+    prev_grad_y += ty * width;
+  }
+  for (int index = tid; index < width; index += block_size) {
+    real x = prev_out_x[index];
+    real y = prev_out_y[index];
+    xx[tid] += x * x;
+    yy[tid] += y * y;
+    xy[tid] += x * y;
+  }
+  __syncthreads();
+
+  for (int s = block_size / 2; s > 0; s >>= 1) {  // halving reduction into [0]
+    if (tid < s) {
+      xx[tid] += xx[tid + s];
+      yy[tid] += yy[tid + s];
+      xy[tid] += xy[tid + s];
+    }
+    __syncthreads();
+  }
+  if (xy[0] == 0) {  // the 1/xy form in the else branch would divide by zero
+    real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] +=
+        scale * grad[ty] * prev_out_y[index] * reciprocal;
+      if (input2_height > 1) {
+        prev_grad_y[index] +=
+          scale * grad[ty] * prev_out_x[index] * reciprocal;
+      } else {  // every block writes the same row, so the add is atomic
+        paddle::paddleAtomicAdd(prev_grad_y + index,
+          scale * grad[ty] * prev_out_x[index] * reciprocal);
+      }
+    }
+  } else {  // grad_x += out * grad * (y / xy - x / |x|^2), same for y
+    real reciprocalXY = 1.0 / xy[0];
+    real reciprocalSquareSumX = 1.0 / xx[0];
+    real reciprocalSquareSumY = 1.0 / yy[0];
+    for (int index = tid; index < width; index += block_size) {
+      prev_grad_x[index] += output[ty] * grad[ty] *
+        (prev_out_y[index] * reciprocalXY -
+         prev_out_x[index] * reciprocalSquareSumX);
+      if (input2_height > 1) {
+        prev_grad_y[index] += output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY);
+      } else {  // every block writes the same row, so the add is atomic
+        paddle::paddleAtomicAdd(prev_grad_y + index, output[ty] * grad[ty] *
+          (prev_out_x[index] * reciprocalXY -
+           prev_out_y[index] * reciprocalSquareSumY));
+      }
+    }
+  }
+}
+
+void hlCossimDerivative(const real* grad,  // launcher for KeCosSimDerivative
+                        const real* output,
+                        const real* prev_out_x,
+                        const real* prev_out_y,
+                        real* prev_grad_x,
+                        real* prev_grad_y,
+                        size_t width,
+                        size_t input1_height,
+                        size_t input2_height,
+                        real scale) {
+  CHECK_NOTNULL(grad);
+  CHECK_NOTNULL(output);
+  CHECK_NOTNULL(prev_out_x);
+  CHECK_NOTNULL(prev_out_y);
+  CHECK_NOTNULL(prev_grad_x);
+  CHECK_NOTNULL(prev_grad_y);
+  const int block_size = 256;  // also the kernel's template argument
+  dim3 threads(block_size, 1);
+  dim3 grid(1, input1_height);  // one block per row of the first input
+  KeCosSimDerivative<block_size><<<grid, threads, 0, STREAM_DEFAULT>>>
+    (grad, output, prev_out_x, prev_out_y, prev_grad_x, prev_grad_y, width,
+        input1_height, input2_height, scale);
+  CHECK_SYNC("hlCossimDerivative failed");  // message names this function
+}
+
+template <>  // GPU specialization: validates, then calls hlCossimDerivative
+void CosSimBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
+                                     const GpuMatrix& out_val,
+                                     const GpuMatrix& in1_val,
+                                     const GpuMatrix& in2_val,
+                                     GpuMatrix& in1_grad,
+                                     GpuMatrix& in2_grad,
+                                     real scale) {
+  CHECK(out_grad.getData() && out_val.getData() && in1_val.getData() &&
+        in2_val.getData() && in1_grad.getData() && in2_grad.getData());
+  CHECK(out_grad.useGpu_ && out_val.useGpu_ && in1_val.useGpu_
+        && in2_val.useGpu_ && in1_grad.useGpu_ && in2_grad.useGpu_)
+        << "Matrix types are not equally GPU";
+  // NOTE(review): matrix heights are not cross-checked in this function.
+  size_t dim = in1_val.getWidth();
+  const real* grad = out_grad.getData();
+  const real* out = out_val.getData();
+  const real* prev_out_x = in1_val.getData();
+  const real* prev_out_y = in2_val.getData();
+  real* prev_grad_x = in1_grad.getData();
+  real* prev_grad_y = in2_grad.getData();
+  hlCossimDerivative(grad,
+                     out,
+                     prev_out_x,
+                     prev_out_y,
+                     prev_grad_x,
+                     prev_grad_y,
+                     dim,
+                     in1_val.getHeight(),  // input1_height
+                     in2_val.getHeight(),  // input2_height
+                     scale);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/CosSimOpTest.cpp b/paddle/function/CosSimOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48c815f027161b48c17ce654ab819156fd856199
--- /dev/null
+++ b/paddle/function/CosSimOpTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+void testCosSimForward(size_t height_x,
+                       size_t height_y,
+                       size_t width,
+                       real scale) {
+  FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
+  // inputs: height_x * width and height_y * width; output: height_x * 1
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}),
+                  ASSIGN_TO);
+  // run Function
+  test.run();
+}
+
+void testCosSimBackward(size_t height_x,
+                        size_t height_y,
+                        size_t width,
+                        real scale) {
+  FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
+  // inputs in the order out_grad, out_val, in1_val, in2_val
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}),
+                  ADD_TO);  // in1_grad
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}),
+                  ADD_TO);  // in2_grad
+  // run Function
+  test.run();
+}
+
+TEST(Matrix, cosSim) {  // forward + backward over a grid of shapes and scales
+  for (auto height_x : {10, 100, 1000}) {
+    for (auto height_y : {1, height_x}) {  // single-row and full-height input 2
+      for (auto width : {10, 100, 1000}) {
+        for (auto scale : {1.0, 2.0}) {
+          testCosSimForward(height_x, height_y, width, scale);
+          testCosSimBackward(height_x, height_y, width, scale);
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef878bfbba961bdd3d5212e19fb83bb1e285e47f
--- /dev/null
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -0,0 +1,344 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "CrossMapNormalOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
+                                     real* denoms,
+                                     const real* inputs,
+                                     size_t numSamples,
+                                     size_t channels,
+                                     size_t height,
+                                     size_t width,
+                                     size_t size,
+                                     real scale,
+                                     real pow) {
+  size_t oneImage = height * width;        // elements in one channel map
+  size_t oneSample = channels * oneImage;  // elements in one sample
+
+  CpuVector outputsV(numSamples * oneSample, outputs);
+  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
+  CpuVector denomsV(numSamples * oneSample, denoms);
+
+  // outputs = inputs * denoms^(-pow), element-wise, where
+  // denoms = 1 + scale * SUM(inputs^2) and the SUM runs over the channel
+  // window [c + start, c + end) clipped to [0, channels).
+  // denoms is written out so that the backward pass can reuse it.
+  denomsV = denomsV.constant(1.0);
+  const int start = -((int)size - 1) / 2;  // first window offset
+  const int end = (int)size + start;       // one past the last offset
+  for (size_t i = 0; i < numSamples; i++) {
+    real* oneDenom = denoms + i * oneSample;
+    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
+    for (int c = 0; c < (int)channels; c++) {
+      CpuVector denom(oneImage, oneDenom + c * oneImage);
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {  // skip missing channels
+          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
+          denom += input.square() * scale;
+        }
+      }
+    }
+  }
+
+  outputsV = inputsV * denomsV.pow(-pow);
+}
+
+template <>
+void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
+                                         const real* inputsValue,
+                                         const real* outputsValue,
+                                         const real* outputsGrad,
+                                         const real* denoms,
+                                         size_t numSamples,
+                                         size_t channels,
+                                         size_t height,
+                                         size_t width,
+                                         size_t size,
+                                         real scale,
+                                         real pow) {
+  size_t oneSample = channels * height * width;  // elements in one sample
+  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
+                                                         size_t offset) {
+    return CpuVector(height * width, data + offset);  // one channel map
+  };
+
+  const int start = -((int)size) / 2;  // forward uses -((int)size - 1) / 2;
+  const int end = (int)size + start;   // presumably the mirrored window
+  const real ratio = -(real)2 * scale * pow;  // factor of the summed term
+  for (size_t i = 0; i < numSamples; i++) {
+    size_t sOffset = i * oneSample;
+    real* oneInputGrad = inputsGrad + sOffset;
+    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
+    real* oneDenom = const_cast<real*>(denoms) + sOffset;
+    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
+    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
+
+    for (int c = 0; c < (int)channels; c++) {
+      size_t cOffset = c * height * width;
+      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
+      CpuVector inputValue = oneImage(oneInputValue, cOffset);
+      CpuVector denom = oneImage(oneDenom, cOffset);
+      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
+
+      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;  // adds to grad
+      for (int s = start; s < end; s++) {
+        if (c + s >= 0 && c + s < (int)channels) {  // skip missing channels
+          size_t offset = (c + s) * height * width;
+          CpuVector output = oneImage(oneOutputValue, offset);
+          CpuVector outputGrad = oneImage(oneOutputGrad, offset);  // shadows
+          CpuVector denom = oneImage(oneDenom, offset);            // shadows
+
+          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief Normalization with across maps.
+ *
+ * This Function comes from the paper
+ * "ImageNet Classification with Deep Convolutional Neural Networks".
+ *
+ * The original formula is:
+ *
+ *                                Input(i, x, y)
+ * Output(i, x, y) = ----------------------------------------------
+ *                                 -- upper
+ *                    (k + alpha * >  (Input(j, x, y))^2) ^ (beta)
+ *                                 -- j = lower
+ *
+ * upper is `min(C, c + N/2)`
+ * lower is `max(0, c - N/2)`
+ *
+ * Function implementation:
+ *
+ * inputs and outputs are in NCHW format, so input.shape.ndims() equals 4.
+ * The dimensions (0-3) are, respectively, batch size,
+ * feature maps, rows and columns.
+ *
+ * Input and Output in the above formula is for each map(i) of one image, and
+ * Input(i, x, y), Output(i, x, y) represents an element in an image.
+ *
+ * C is the number of feature maps of one image, and N is a hyper-parameter
+ * configured when the Function is initialized. The sum in the denominator
+ * is taken over the same position in the neighboring maps.
+ *
+ * In the implementation of Function, k is equal to 1,
+ * so Function has no argument for k.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent Input
+ * \param outputs[0] represent Output
+ * \param outputs[1] represent The denominator in the formula(except beta)
+ *
+ * Note:
+ * outputs[1] is saved to simplify the backward calculation.
+ * TODO: if only the forward calculation is needed, this can be optimized
+ * by removing outputs[1].
+ */
+template <DeviceType Device>
+class CrossMapNormalFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // hyper-parameters read from the configuration
+    size_ = config.get<size_t>("size");
+    scale_ = config.get<real>("scale");
+    pow_ = config.get<real>("pow");
+
+    // one input; two outputs (the result and the saved denominators)
+    numInputs_ = 1;
+    numOutputs_ = 2;
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    check(inputs, outputs);
+    // Both outputs must be ASSIGN_TO. These checks still live here;
+    // it is undecided whether they belong inside check().
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    CHECK_EQ(outputs[1].getArgType(), ASSIGN_TO);
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    CrossMapNormal<Device>(outputs[0].data<real>(),
+                           outputs[1].data<real>(),
+                           inputs[0].data<real>(),
+                           batchSize,
+                           maps,
+                           rows,
+                           columns,
+                           size_,
+                           scale_,
+                           pow_);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);  // 4-D input required
+    CHECK(inputs[0].shape() == outputs[0].shape());
+    CHECK(inputs[0].shape() == outputs[1].shape());
+  }
+
+  // Only the shape of inputs[0] is needed to estimate the
+  // number of floating-point operations.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ((size_t)numInputs_, inputs.size());
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // approximate number of floating-point operations:
+    // (size_ * 2 + 3) per input element
+    size_t ops = batchSize * maps * rows * columns * (size_ * 2 + 3);
+
+    return ops;
+  }
+
+private:
+  size_t size_;  // config "size"
+  real scale_;   // config "scale"
+  real pow_;     // config "pow"
+};
+
+/**
+ * \brief Backward calculation for normalization with across maps.
+ *
+ * Function implementation:
+ *
+ * The implementation of this Function is derived from the
+ * CrossMapNormalFunc implementation.
+ *
+ * InputGrad = OutputGrad * denoms ^ (-beta)
+ *    -- upper
+ *  + > (OutputGrad * OutputValue * (-2 * alpha * beta) / denoms) * InputValue
+ *    -- lower
+ *
+ * The data of inputs/outputs format is the same as the forward interface
+ * and is NCHW.
+ *
+ * The upper and lower bounds are the same as in forward. The logic of
+ * the sum is also the same as in forward.
+ *
+ * Function Arguments:
+ *
+ * \param size_      represent N
+ * \param scale_     represent alpha
+ * \param pow_       represent beta
+ * \param inputs[0]  represent InputValue, inputs[0] of CrossMapNormalFunc
+ * \param inputs[1]  represent OutputValue, outputs[0] of CrossMapNormalFunc
+ * \param inputs[2]  represent OutputGrad
+ * \param inputs[3]  represent denoms, outputs[1] of CrossMapNormalFunc
+ *                   This is the intermediate result that is
+ *                   preserved in the forward calculation.
+ * \param outputs[0] represent InputGrad
+ */
+template <DeviceType Device>
+class CrossMapNormalGradFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    // hyper-parameters read from the configuration
+    size_ = config.get<size_t>("size");
+    scale_ = config.get<real>("scale");
+    pow_ = config.get<real>("pow");
+
+    // four inputs and one output
+    numInputs_ = 4;
+    numOutputs_ = 1;
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    check(inputs, outputs);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // The CPU kernel adds its result into the output buffer, so for
+      // every mode other than ADD_TO the buffer is cleared first.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
+
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    CrossMapNormalGrad<Device>(outputs[0].data<real>(),
+                               inputs[0].data<real>(),
+                               inputs[1].data<real>(),
+                               inputs[2].data<real>(),
+                               inputs[3].data<real>(),
+                               batchSize,
+                               maps,
+                               rows,
+                               columns,
+                               size_,
+                               scale_,
+                               pow_);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)4);  // 4-D input required
+    CHECK(inputs[0].shape() == inputs[1].shape());
+    CHECK(inputs[0].shape() == inputs[2].shape());
+    CHECK(inputs[0].shape() == inputs[3].shape());
+    CHECK(inputs[0].shape() == outputs[0].shape());
+  }
+
+  // Only the shape of inputs[0] is needed to estimate the
+  // number of floating-point operations.
+  size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_LT((size_t)1, inputs.size());  // requires at least two inputs
+    size_t batchSize = inputs[0].shape()[0];
+    size_t maps = inputs[0].shape()[1];
+    size_t rows = inputs[0].shape()[2];
+    size_t columns = inputs[0].shape()[3];
+
+    // approximate number of floating-point operations:
+    // (size_ * 4 + 2) per input element
+    size_t ops = batchSize * maps * rows * columns * (size_ * 4 + 2);
+
+    return ops;
+  }
+
+private:
+  size_t size_;  // config "size"
+  real scale_;   // config "scale"
+  real pow_;     // config "pow"
+};
+
+REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
+REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/cross_map_normal_op.h b/paddle/function/CrossMapNormalOp.h
similarity index 100%
rename from paddle/function/cross_map_normal_op.h
rename to paddle/function/CrossMapNormalOp.h
diff --git a/paddle/function/cross_map_normal_op_gpu.cu b/paddle/function/CrossMapNormalOpGpu.cu
similarity index 99%
rename from paddle/function/cross_map_normal_op_gpu.cu
rename to paddle/function/CrossMapNormalOpGpu.cu
index aae4f461b6f57de6cadfe7c3a6d684c613cc037f..b33dd108348b7789c6e73bfe3b1ffbc448163ef7 100644
--- a/paddle/function/cross_map_normal_op_gpu.cu
+++ b/paddle/function/CrossMapNormalOpGpu.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "hl_base.h"
-#include "cross_map_normal_op.h"
+#include "CrossMapNormalOp.h"
 
 namespace paddle {
 
diff --git a/paddle/function/cross_map_normal_op_test.cpp b/paddle/function/CrossMapNormalOpTest.cpp
similarity index 53%
rename from paddle/function/cross_map_normal_op_test.cpp
rename to paddle/function/CrossMapNormalOpTest.cpp
index 22692691bdb64c23cbd2a479b2afb919672554f7..51f5da81bfc9ae870ac9949ba74da01a9449a04d 100644
--- a/paddle/function/cross_map_normal_op_test.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "FunctionTest.h"
 
+namespace paddle {
+
 TEST(CrossMapNormal, real) {
   for (size_t numSamples : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
@@ -25,15 +27,19 @@ TEST(CrossMapNormal, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormal",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
-                               {});
+            // init Test object
+            FunctionCompare test("CrossMapNormal",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            // prepare input arguments
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
@@ -51,21 +57,24 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormalGrad",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims)},
-                               {});
+            FunctionCompare test("CrossMapNormalGrad",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
     }
   }
 }
+
+}  // namespace paddle
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 02880e5ea1acb85d8685f865a5745f7090db03d2..f47d55a4ade97d76e0f1940a2234e34e20efade6 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -30,20 +30,76 @@ real FuncConfig::get<real>(const std::string& key) const {
   return it->second.r;
 }
 
+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  auto it = valueMap_.find(key);  // the key must have been set() before
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.i;  // union member; the stored type is not checked
+}
+
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  auto it = valueMap_.find(key);  // the key must have been set() before
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.b;  // union member; the stored type is not checked
+}
+
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
-  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;  // set once
   valueMap_[key].s = v;
   return *this;
 }
 
 template <>
 FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
-  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;  // set once
   valueMap_[key].r = v;
   return *this;
 }
 
+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;  // set once
+  valueMap_[key].i = v;  // stored in the int member of the union
+  return *this;          // allows chained set() calls
+}
+
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;  // set once
+  valueMap_[key].b = v;  // stored in the bool member of the union
+  return *this;          // allows chained set() calls
+}
+
+void BufferArgs::addArg(const Matrix& arg,
+                        const TensorShape& shape,
+                        ArgType argType) {
+  _args_.push_back(new BufferArg(arg, shape, argType));  // owned wrapper
+  addArg(*_args_.back());  // register the wrapper just created
+}
+
+void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
+  _args_.push_back(new SparseMatrixArg(arg, argType));  // owned wrapper
+  addArg(*_args_.back());  // registered through the BufferArg& overload
+}
+
+void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
+  _args_.push_back(new SparseMatrixArg(arg, argType));  // owned wrapper
+  addArg(*_args_.back());  // registered through the BufferArg& overload
+}
+
+void BufferArgs::addArg(const Matrix& matrix,
+                        const IVector& vector,
+                        ArgType argType) {
+  _args_.push_back(new SequenceArg(matrix, vector, argType));  // owned
+  addArg(*_args_.back());  // registered through the BufferArg& overload
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 
 }  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 095584c0b19f7a0b7d8787a0bc6bbdd78d785eed..3bbeb6e525f85bdde9a54c8d60146eaa30a1bb4d 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -16,49 +16,24 @@ limitations under the License. */
 
 #include <map>
 #include <vector>
+#include "BufferArg.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ClassRegistrar.h"
 
 namespace paddle {
 
-enum DeviceType {
-  DEVICE_TYPE_UNSPECIFIED = 0,
-  DEVICE_TYPE_CPU = 1,
-  DEVICE_TYPE_GPU = 2,
-};
-
-template <DeviceType Device>
-struct MatrixT;
-
-template <>
-struct MatrixT<DEVICE_TYPE_CPU> {
-  using type = CpuMatrix;
-};
-
-template <>
-struct MatrixT<DEVICE_TYPE_GPU> {
-  using type = GpuMatrix;
-};
-
-typedef std::vector<size_t> Dims;
-
-class Tensor {
-public:
-  Tensor(real* data, const Dims& dim) : buf_(data), dims_(dim) {}
-
-  real* getData() const { return buf_; }
-
-  real* buf_;
-  Dims dims_;
-};
-
-typedef std::vector<Tensor> Arguments;
-
+/**
+ * Function Configuration.
+ * The argument type of Function::init.
+ * Follow-up will consider moving this data structure to Proto inside.
+ */
 class FuncConfig {
 public:
   union value {
     size_t s;
     real r;
+    int i;
+    bool b;
   };
 
   template <typename T>
@@ -71,17 +46,143 @@ protected:
   std::map<std::string, value> valueMap_;
 };
 
+/**
+ * Argument type for Function::calc().
+ * A BufferArgs contains a set of BufferArg,
+ * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with a Matrix object is used to adapt a Layer Argument.
+ * It creates a BufferArg object in addArg(),
+ * which is freed in the destructor of BufferArgs.
+ *
+ * addArg() with BufferArg object, just save BufferArg object address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * in the BufferArgs life time.
+ */
+class BufferArgs {
+public:
+  BufferArgs() {}
+
+  ~BufferArgs() {  // NOTE(review): no copy control; a copy would double-delete
+    for (auto arg : _args_) {
+      delete arg;  // NOTE(review): needs a virtual ~BufferArg(); confirm
+    }
+  }
+
+  size_t size() const { return args_.size(); }
+
+  // Wrap a tensor in a new BufferArg owned by this object and register it.
+  // The tensor can be a Matrix, Vector or IVector.
+  // For inputs, argType does not need to be given.
+  // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  // Add arg into BufferArgs and reshape the arg.
+  //
+  // For example, arg may represent an image buffer,
+  // but a Matrix can only represent a two-dimensional Tensor,
+  // so an extra argument is needed to describe the image buffer's shape.
+  void addArg(const Matrix& arg,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED);
+
+  void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+  void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
+
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
+
+  // Return the num-th registered argument; CHECK-fails when out of range.
+  const BufferArg& operator[](size_t num) const {
+    CHECK_LT(num, args_.size());
+    return *args_[num];
+  }
+
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }  // not owned
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }  // not owned
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }  // not owned
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }  // not owned
+
+private:
+  std::vector<BufferArg*> args_;  // every registered argument, in order
+  // The BufferArg objects here are constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
+};
+
+/**
+ * \brief Base class for Function.
+ * The basic Function implementation requires override init and calc interfaces.
+ *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
+ * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
+ * and ADD_TO.
+ * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
+ * result of Function assigned to the output BufferArg.
+ * If output.getArgType() == ADD_TO, this is add mode, and the calculation
+ * result of Function need added to the output BufferArg.
+ *
+ * For example:
+ * ASSIGN_TO: output = Function(inputs)
+ * ADD_TO: output += Function(inputs)
+ * If Function has more than one output, each output can have different modes.
+ */
 class FunctionBase {
 public:
   virtual ~FunctionBase() {}
 
   virtual void init(const FuncConfig& config) {}
 
-  virtual void calc(const Arguments& inputs,
-                    const Arguments& outputs,
-                    const Arguments& inouts) {}
+  virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // This member function is used to check whether the BufferType and shape of
+  // the inputs and outputs arguments of the Function are correct.
+  // A calc implementation generally calls this check on its arguments.
+  // Before calling calc, the caller can also check its own arguments.
+  virtual void check(const BufferArgs& inputs, const BufferArgs& outputs) {}
+
+  // Calculate the number of floating-point operations of this Function.
+  // The inputs and outputs arguments do not need to contain the actual data,
+  // only the shape.
+  // And some Functions have the same input and output shapes,
+  // so you may not need to enter the complete number of arguments.
+  // But entering the full arguments is always correct for this interface.
+  virtual size_t ops(const BufferArgs& inputs, const BufferArgs& outputs) {
+    return 0;
+  }
+
+  int getNumInputs() const { return numInputs_; }
+
+  int getNumOutputs() const { return numOutputs_; }
 
   static ClassRegistrar<FunctionBase> funcRegistrar_;
+
+protected:
+  // numInputs_ and numOutputs_ are the maximum number of inputs and
+  // outputs supported by the Function; both are 0 until init() sets them.
+  // Some functions are optimized for input and output,
+  // so when comparing the number of arguments, for these functions
+  // inputs.size() <= numInputs_ or outputs.size() <= numOutputs_
+  size_t numInputs_ = 0;
+  size_t numOutputs_ = 0;
 };
 
 #define FUNC_NAME(typeName, deviceName) #typeName "-" #deviceName
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdf7e631e5ab8c67eb5cf906bd0af49740d60112
--- /dev/null
+++ b/paddle/function/FunctionTest.cpp
@@ -0,0 +1,166 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Function.h"
+#include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+
+template <DeviceType DType>
+void FunctionApi(typename Tensor<real, DType>::Matrix& output,
+                 const typename Tensor<real, DType>::Matrix& input);
+
+template <>
+void FunctionApi<DEVICE_TYPE_CPU>(CpuMatrix& output, const CpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 100);  // size of cpuOutput in the test below
+  EXPECT_EQ(output.getWidth(), 200);
+}
+
+template <>
+void FunctionApi<DEVICE_TYPE_GPU>(GpuMatrix& output, const GpuMatrix& input) {
+  EXPECT_EQ(output.getHeight(), 10);  // size of gpuOutput in the test below
+  EXPECT_EQ(output.getWidth(), 20);
+}
+
+template <DeviceType DType>
+void Function(const BufferArgs& arguments) {
+  const auto input = arguments[0].matrix<DType>();  // argument 0: input
+  auto output = arguments[1].matrix<DType>();       // argument 1: output
+  FunctionApi<DType>(output, input);  // device-specific size checks
+}
+
+TEST(Function, BufferArgs) {
+  CpuMatrix cpuInput = CpuMatrix(100, 200);
+  CpuMatrix cpuOutput = CpuMatrix(100, 200);
+  BufferArgs cpuArgments;
+  cpuArgments.addArg(cpuInput);   // becomes arguments[0]
+  cpuArgments.addArg(cpuOutput);  // becomes arguments[1]
+  Function<DEVICE_TYPE_CPU>(cpuArgments);
+
+  GpuMatrix gpuInput = GpuMatrix(10, 20);  // NOTE(review): no CPU-only guard
+  GpuMatrix gpuOutput = GpuMatrix(10, 20);
+  BufferArgs gpuArgments;
+  gpuArgments.addArg(gpuInput);   // becomes arguments[0]
+  gpuArgments.addArg(gpuOutput);  // becomes arguments[1]
+  Function<DEVICE_TYPE_GPU>(gpuArgments);
+}
+
+/**
+ * These test cases check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  ASSERT_EQ(inputs.size(), check.size());  // fatal: the loop indexes both
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);  // run the i-th checker on the i-th argument
+  }
+}
+
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  ASSERT_EQ(inputs.size(), 1U);  // fatal: inputs[0] CHECK-fails when empty
+  check(inputs[0]);              // run the single checker on the only arg
+}
+
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.data(), matrix->getData());  // same buffer as the matrix
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*matrix);  // wrapped in a BufferArg owned by argments
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1);
+    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.data(), vector->getData());  // same buffer as the vector
+
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*vector);  // wrapped in a BufferArg owned by argments
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2);
+    EXPECT_EQ(arg.shape()[0], 200);
+    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+
+  BufferArgs argments;
+  argments.addArg(sparse);  // wrapped in a SparseMatrixArg owned by argments
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});  // no data, 3-D shape
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3);
+    EXPECT_EQ(arg.shape()[0], 1);
+    EXPECT_EQ(arg.shape()[1], 2);
+    EXPECT_EQ(arg.shape()[2], 3);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);  // stores &arg only; arg must outlive argments
+  testBufferArgs(argments, check);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index a8c5e412bd12df2ea0b4d6bd67072fb7d08591fe..0cfafdb27f55a3e6617d31a968d2a05fc77f5b46 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -13,90 +13,336 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Function.h"
-#include "paddle/math/Vector.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 
 namespace paddle {
 
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // The inputs and outputs passed here need not contain real data;
+ *  // only their argument type and shape are used.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // According to the type and shape of the added arguments, run()
+ *  // automatically initializes the arguments the cpu and gpu functions need
+ *  // (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_),
+ *  // calls the CPU and GPU Function to calculate the results, and
+ *  // compares the CPU and GPU calculation results for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
-    gpu->init(config);
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpuFunc_->init(config);
+    gpuFunc_->init(config);
   }
 
-  void cmpWithArg(const Arguments& inputs,
-                  const Arguments& outputs,
-                  const Arguments& inouts) {
-    // init cpu and gpu arguments
-    auto initArgs = [=](
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (auto arg : inArgs) {
-        size_t size = sizeof(real);
-        for (auto dim : arg.dims_) {
-          size *= dim;
-        }
-        cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-        gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-        cpuArgs.emplace_back(
-            Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-        gpuArgs.emplace_back(
-            Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-
-        // will use an api to refactor this code.
-        CpuVector cpuVector(size / sizeof(real),
-                            (real*)cpuArgs.back().getData());
-        GpuVector gpuVector(size / sizeof(real),
-                            (real*)gpuArgs.back().getData());
-        cpuVector.uniform(0.001, 1);
-        gpuVector.copyFrom(cpuVector);
-      }
-    };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
+  ~FunctionCompare() {}
+
+  // Only the shape and value type of `input` are used; its data is ignored.
+  void addInputs(const BufferArg& input) {
+    const size_t bytes =
+        sizeOfValuType(input.valueType()) * input.shape().getElements();
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(bytes));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(bytes));
+
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // Builds the sequence start positions shared by all SequenceArgs.
+  void addSequence(const SequenceIdArg& input) {
+    CHECK_EQ(input.shape().ndims(), 1UL);
+    const size_t batchSize = input.shape()[0];
+    const size_t numSeqs = batchSize / 10 + 1;
+    const size_t numIds = numSeqs + 1;  // one more entry than sequences
+    const size_t bytes = numIds * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(bytes));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(bytes));
+    cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
+                                              TensorShape{numIds});
+    gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
+                                              TensorShape{numIds});
+    initArg(*cpuSeq_, batchSize);  // only the CPU copy is filled here
+
+    // todo(tianbing), delete it
+    CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
+
+    CpuIVector hostIds(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
+    GpuIVector devIds(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
+    devIds.copyFrom(hostIds);
+  }
+
+  // Adds a sequence input. CPU and GPU buffers are sized from the shape and
+  // value type of `input`; the sequence start positions are the ones shared
+  // by all SequenceArgs of this test and are created on first use.
+  void addInputs(const SequenceArg& input) {
+    CHECK_EQ(input.shape().ndims(), 2UL);
+    if (!(cpuSeq_ && gpuSeq_)) {  // sequence not exist
+      const size_t batchSize = input.shape()[0];
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+
+    const size_t bytes =
+        sizeOfValuType(input.valueType()) * input.shape().getElements();
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(bytes));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(bytes));
+
+    // Same value type and shape on both devices, each bound to the
+    // sequence info that lives on the same device.
+    const auto valueType = input.valueType();
+    const auto& shape = input.shape();
+    cpuInputs_.emplace_back(std::make_shared<SequenceArg>(
+        cpuMemory_.back()->getBuf(), valueType, shape, *cpuSeq_));
+    gpuInputs_.emplace_back(std::make_shared<SequenceArg>(
+        gpuMemory_.back()->getBuf(), valueType, shape, *gpuSeq_));
+  }
+
+  // Only the shape and value type of `output` are used; its data is ignored.
+  // `argType` is passed on unchanged to both the CPU and the GPU argument.
+  void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
+    const auto valueType = output.valueType();
+    const auto& shape = output.shape();
+
+    const size_t bytes = sizeOfValuType(valueType) * shape.getElements();
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(bytes));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(bytes));
+
+    // One freshly allocated buffer per device backs the new argument.
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(
+            cpuMemory_.back()->getBuf(), valueType, shape, argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(
+            gpuMemory_.back()->getBuf(), valueType, shape, argType));
+  }
+
+  /// Adds a sparse output. Matching CPU and GPU sparse matrices are built
+  /// from the shape, nnz, value type and format of `output` and filled with
+  /// the same random content.
+  /// NOTE(review): cpuSparse_/gpuSparse_ are overwritten on every call, so
+  /// presumably at most one sparse argument per test is expected -- confirm.
+  void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
+    const size_t height = output.shape()[0];
+    const size_t width = output.shape()[1];
+    const auto valueType = static_cast<SparseValueType>(output.dataType());
+    const auto format = static_cast<SparseFormat>(output.dataFormat());
+
+    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+        height, width, output.nnz(), valueType, format);
+    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+        height, width, output.nnz(), valueType, format);
+
+    // Randomize on the CPU, then mirror the content to the GPU matrix.
+    hl_stream_t stream(HPPL_STREAM_1);
+    cpuSparse_->randomizeUniform();
+    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    hl_stream_synchronize(stream);
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
+    gpuOutputs_.emplace_back(
+        std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
+  }
+
+  // Adds a sequence output. CPU and GPU buffers are sized from the shape and
+  // value type of `output`; the sequence start positions are the ones shared
+  // by all SequenceArgs of this test and are created on first use.
+  void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
+    CHECK_EQ(output.shape().ndims(), 2UL);
+
+    if (!(cpuSeq_ && gpuSeq_)) {  // sequence not exist
+      const size_t batchSize = output.shape()[0];
+      addSequence(SequenceIdArg(TensorShape{batchSize}));
+    }
+
+    const size_t bytes =
+        sizeOfValuType(output.valueType()) * output.shape().getElements();
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(bytes));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(bytes));
+
+    // Same value type, shape and argType on both devices, each bound to the
+    // sequence info that lives on the same device.
+    const auto valueType = output.valueType();
+    const auto& shape = output.shape();
+
+    cpuOutputs_.emplace_back(std::make_shared<SequenceArg>(
+        cpuMemory_.back()->getBuf(), valueType, shape, *cpuSeq_, argType));
+    gpuOutputs_.emplace_back(std::make_shared<SequenceArg>(
+        gpuMemory_.back()->getBuf(), valueType, shape, *gpuSeq_, argType));
+  }
+
+  // Adds a sparse input: matching CPU/GPU sparse matrices are built from the
+  // shape, nnz, value type and format of `input`, with identical content.
+  // NOTE(review): cpuSparse_/gpuSparse_ are overwritten on every call, so
+  // presumably at most one sparse argument per test is expected -- confirm.
+  void addInputs(const SparseMatrixArg& input) {
+    const size_t height = input.shape()[0];
+    const size_t width = input.shape()[1];
+    const auto valueType = static_cast<SparseValueType>(input.dataType());
+    const auto format = static_cast<SparseFormat>(input.dataFormat());
+
+    cpuSparse_ = std::make_shared<CpuSparseMatrix>(
+        height, width, input.nnz(), valueType, format);
+    gpuSparse_ = std::make_shared<GpuSparseMatrix>(
+        height, width, input.nnz(), valueType, format);
+
+    // Randomize on the CPU, then mirror the content to the GPU matrix.
+    hl_stream_t stream(HPPL_STREAM_1);
+    cpuSparse_->randomizeUniform();
+    gpuSparse_->copyFrom(*cpuSparse_, stream);
+    hl_stream_synchronize(stream);
+
+    cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
+    gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
+  }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    initInputs();
 
+    initOutputs();
     // function calculate
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
-
-    // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
-          size *= dim;
-        }
-        CpuVector cpuVector(size, (real*)cpu.getData());
-        GpuVector gpuVector(size, (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
       }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
+      }
+      function->calc(inArgs, outArgs);
     };
-    checkArgs(cpuOutputs, gpuOutputs);
-    checkArgs(cpuInouts, gpuInouts);
+
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
+
+    // check outputs
+    compareOutputs();
+  }
+
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+
+protected:
+  // Randomizes only the CPU buffer; callers copy it to the GPU afterwards.
+  void initArg(BufferArg& arg) {
+    real* values = (real*)arg.data();
+    CpuVector(arg.shape().getElements(), values).uniform(0.001, 1);
+  }
+
+  // Randomizes the data buffer only; the sequence info is left untouched.
+  void initArg(SequenceArg& arg) {
+    real* values = (real*)arg.data();
+    CpuVector(arg.shape().getElements(), values).uniform(0.001, 1);
+  }
+
+  void initArg(SequenceIdArg& arg, size_t batchSize) {  // random start ids
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;  // start position of the sequence currently being written
+    size_t maxLen = 2 * batchSize / numSeqs;  // first bound given to the rng
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = 1 + uniformRandom(std::min<int64_t>(
+                        maxLen, batchSize - pos - numSeqs + i));
+      buf[i] = pos;  // sequence i starts at pos
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;  // the last entry is the total batch size
+  }
+
+  // Randomizes each non-sparse CPU input and mirrors it to its GPU twin.
+  void initInputs() {
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      BufferArg& cpuArg = *cpuInputs_[i];
+      if (cpuArg.isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+      if (cpuArg.isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(cpuArg));
+      } else {
+        initArg(cpuArg);
+      }
+
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      BufferArg& gpuArg = *gpuInputs_[i];
+      CpuVector host(cpuArg.shape().getElements(), (real*)cpuArg.data());
+      GpuVector device(gpuArg.shape().getElements(), (real*)gpuArg.data());
+      device.copyFrom(host);
+    }
+  }
+
+  // Randomizes each non-sparse CPU output and mirrors it to its GPU twin.
+  void initOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      BufferArg& cpuArg = *cpuOutputs_[i];
+      if (cpuArg.isSparseArg()) {
+        continue;  /// sparse matrix already init
+      }
+
+      if (cpuArg.isSequenceArg()) {
+        initArg(dynamic_cast<SequenceArg&>(cpuArg));
+      } else {
+        initArg(cpuArg);
+      }
+
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      BufferArg& gpuArg = *gpuOutputs_[i];
+      CpuVector host(cpuArg.shape().getElements(), (real*)cpuArg.data());
+      GpuVector device(gpuArg.shape().getElements(), (real*)gpuArg.data());
+      device.copyFrom(host);
+    }
+  }
+
+  void compareOutputs() {
+    // TODO, Need a BufferCheck used to compare the two buffers.
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      BufferArg& cpuArg = *cpuOutputs_[i];
+      BufferArg& gpuArg = *gpuOutputs_[i];
+      CHECK_EQ(cpuArg.numElements(), gpuArg.numElements());
+      CpuVector expected(cpuArg.numElements(), (real*)cpuArg.data());
+      GpuVector actual(gpuArg.numElements(), (real*)gpuArg.data());
+      autotest::TensorCheckErr(expected, actual);
+    }
   }
 
 protected:
-  std::shared_ptr<FunctionBase> cpu;
-  std::shared_ptr<FunctionBase> gpu;
-  std::vector<CpuMemHandlePtr> cpuMemory;
-  std::vector<GpuMemHandlePtr> gpuMemory;
-  Arguments cpuInputs;
-  Arguments cpuOutputs;
-  Arguments cpuInouts;
-  Arguments gpuInputs;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
+  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> gpuFunc_;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
+  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> gpuOutputs_;
+  std::shared_ptr<CpuSparseMatrix> cpuSparse_;
+  std::shared_ptr<GpuSparseMatrix> gpuSparse_;
+  std::shared_ptr<SequenceIdArg> cpuSeq_;
+  std::shared_ptr<SequenceIdArg> gpuSeq_;
 };
 
 }  // namespace paddle
-
-using paddle::FunctionCompare;
-using paddle::FuncConfig;
-using paddle::Dims;
-using paddle::Tensor;
diff --git a/paddle/function/MulOp.cpp b/paddle/function/MulOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91b4b8ed91b6055babcfbab8f7adb2c55e2747d0
--- /dev/null
+++ b/paddle/function/MulOp.cpp
@@ -0,0 +1,354 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MulOp.h"
+/// todo(tianbing), delete it
+#include <iostream>
+#include "paddle/math/MathFunctions.h"
+#include "paddle/math/SIMDFunctions.h"
+#include "paddle/utils/ThreadLocal.h"
+
+#ifndef PADDLE_TYPE_DOUBLE
+#define GEMM paddle::gemm<float>
+#else
+#define GEMM paddle::gemm<double>
+#endif
+
+namespace {
+inline void vecAddTo(real* a, const real* b, real scaleB, size_t len) {
+  for (size_t i = 0; i < len; ++i) {  // a += scaleB * b; index as wide as len
+    a[i] += (1.0 == scaleB) ? b[i] : scaleB * b[i];
+  }
+}
+
+inline void colVecAddTo(
+    real* a, real* b, real c, size_t len, size_t aWidth, size_t bWidth) {
+  for (size_t i = 0; i < len; ++i) {  // strided a += c * b; index as wide as len
+    a[i * aWidth] += (1.0 == c) ? b[i * bWidth] : b[i * bWidth] * c;
+  }
+}
+}  // namespace
+
+namespace paddle {
+/// sparse matrix (+)= dense matrix * dense matrix (only out's stored entries)
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuSparseMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK_EQ(out.getValueType(), FLOAT_VALUE);
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* A = a.getData();
+  const real* B = b.getData();
+  real* C = out.getValue();
+  int* rows = out.getRows();
+  int* cols = out.getCols();
+  size_t width = out.getWidth();
+  size_t height = out.getHeight();
+
+  /// SPARSE_CSC, {a any, b not trans}: visit stored entries column by column
+  if (out.getFormat() == SPARSE_CSC) {
+    /// m is the inner (reduction) dimension of op(a) * b
+    CHECK(!bTrans);
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();
+    for (size_t i = 0; i < width; i++) {
+      size_t start = out.getColStartIdx(i);
+      size_t end = out.getColStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t rowIdx = rows[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[rowIdx * m + k] : A[k * height + rowIdx]) *
+                 B[k * width + i];
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR, {a any, b not trans} or {a not trans, b trans}
+  if (out.getFormat() == SPARSE_CSR) {
+    /// a and b can not both transpose; m is the inner dimension of op(a),
+    CHECK(!(aTrans && bTrans));
+    size_t m = !aTrans ? a.getWidth() : a.getHeight();  // rows of a if aTrans
+    for (size_t i = 0; i < height; i++) {
+      size_t start = out.getRowStartIdx(i);
+      size_t end = out.getRowStartIdx(i + 1);
+      for (size_t j = start; j < end; j++) {
+        real sum = 0;
+        size_t colIdx = cols[j];
+        for (size_t k = 0; k < m; k++) {
+          sum += (!aTrans ? A[i * m + k] : A[k * height + i]) *
+                 (!bTrans ? B[k * width + colIdx] : B[colIdx * m + k]);
+        }
+        C[j] = scaleAB * sum + scaleT * C[j];
+      }
+    }
+    return;
+  }
+}
+
+/// dense matrix (+)= dense matrix * dense matrix, done by a single GEMM call
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  const auto transA = aTrans ? CblasTrans : CblasNoTrans;
+  const auto transB = bTrans ? CblasTrans : CblasNoTrans;
+  const size_t inner = aTrans ? a.getHeight() : a.getWidth();
+  GEMM(transA,
+       transB,
+       out.getHeight(),  // rows of the result
+       out.getWidth(),   // columns of the result
+       inner,            // dimension shared by op(a) and op(b)
+       scaleAB,
+       a.getData(), a.getStride(),
+       b.getData(), b.getStride(),
+       scaleT,
+       out.getData(), out.getStride());
+}
+
+/// dense matrix (+)= scaleAB * (sparse matrix * dense matrix)
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuSparseMatrix& a,
+                            const CpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  const real* B = b.getData();
+  real* C = out.getData();
+  if (out.getWidth() % 32 == 0) {
+    CHECK_EQ((size_t)B % 32, 0UL);
+    CHECK_EQ((size_t)C % 32, 0UL);
+  }
+
+  int* cols = a.getCols();
+  real* values = a.getValue();
+  const bool hasValue = (a.getValueType() == FLOAT_VALUE);  // else entries = 1
+  for (size_t i = 0; i < a.getHeight(); ++i) {
+    const int end = a.getRowStartIdx(i + 1);
+    for (int j = a.getRowStartIdx(i); j < end; ++j) {
+      vecAddTo(!aTrans ? out.getRow(i) : out.getRow(cols[j]),
+               !aTrans ? const_cast<CpuMatrix&>(b).getRow(cols[j])
+                       : const_cast<CpuMatrix&>(b).getRow(i),
+               scaleAB * (hasValue ? values[j] : (real)1.0),  // honor scaleAB
+               out.getWidth());
+    }
+  }
+}
+
+/// dense matrix (+)= scaleAB * (dense matrix * sparse matrix)
+template <>
+void MulOp<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                            const CpuMatrix& a,
+                            const CpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  if (scaleT == 0) {
+    out.zeroMem();
+  }
+  real* A = const_cast<real*>(a.getData());
+  real* B = const_cast<real*>(b.getValue());
+  real* C = out.getData();
+  int* rows = b.getRows();
+  int* cols = b.getCols();
+  const bool hasValue = (b.getValueType() != NO_VALUE);  // else entries = 1
+  /// SPARSE_CSC format: one strided column update per stored entry of b
+  if (b.getFormat() == SPARSE_CSC) {
+    for (size_t j = 0; j < b.getWidth(); ++j) {
+      int start = b.getColStartIdx(j);
+      int end = b.getColStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {
+        colVecAddTo(!bTrans ? C + j : C + rows[i],
+                    !bTrans ? A + rows[i] : A + j,
+                    scaleAB * (hasValue ? B[i] : (real)1.0),  // honor scaleAB
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+
+  /// SPARSE_CSR format: one strided column update per stored entry of b
+  if (b.getFormat() == SPARSE_CSR) {
+    for (size_t j = 0; j < b.getHeight(); ++j) {
+      int start = b.getRowStartIdx(j);
+      int end = b.getRowStartIdx(j + 1);
+      for (int i = start; i < end; ++i) {
+        colVecAddTo(!bTrans ? C + cols[i] : C + j,
+                    !bTrans ? A + j : A + cols[i],
+                    scaleAB * (hasValue ? B[i] : (real)1.0),  // honor scaleAB
+                    out.getHeight(),
+                    out.getWidth(),
+                    a.getWidth());
+      }
+    }
+    return;
+  }
+}
+
+/**
+ * mul operator
+ * out = scaleT * out + scaleAB * (A * B)
+ * here, scaleT in {0, 1}, scaleAB == 1,
+ * out = A * B, ASSIGN_TO
+ * out += A * B, ADD_TO
+ *
+ *
+ * \param outputs[0]      output matrix (out), M * N,
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, N is num of columns
+ * \param inputs[0]       first input matrix (A),  M * K (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        M is num of rows, K is num of columns
+ * \param inputs[1]       second input matrix (B), K * N (if non-trans)
+ *                        could be either Sparse or Dense Matrix
+ *                        K is num of rows, N is num of columns
+ *
+ * Support eight Mul operators, with both GPU and CPU devices
+ * For each device, four Mul operators are supported:
+ * 1. dense (out) = dense (A) * dense (B)
+ * 2. dense (out) = sparse (A) * dense (B)
+ *    sparse matrix only support SPARSE_CSR format
+ * 3. dense (out) = dense (A) * sparse (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ * 4. sparse (out) = dense (A) * dense (B)
+ *    sparse matrix support SPARSE_CSC and SPARSE_CSR formats
+ *
+ */
+template <DeviceType Device>
+class MulFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    aTrans_ = config.get<bool>("aTrans");
+    bTrans_ = config.get<bool>("bTrans");
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK(!aTrans_ || !bTrans_)
+        << "Not support both a and b are transpose matrices";
+
+    CHECK_EQ((size_t)2, inputs.size());
+    CHECK_EQ((size_t)1, outputs.size());
+    CHECK(inputs[0].data() && inputs[1].data() && outputs[0].data());
+    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
+    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
+
+    size_t aRow = !aTrans_ ? inputs[0].shape()[0] : inputs[0].shape()[1];
+    size_t aCol = !aTrans_ ? inputs[0].shape()[1] : inputs[0].shape()[0];
+    size_t bRow = !bTrans_ ? inputs[1].shape()[0] : inputs[1].shape()[1];
+    size_t bCol = !bTrans_ ? inputs[1].shape()[1] : inputs[1].shape()[0];
+    /// after the transposes: (aRow x aCol) * (bRow x bCol) must fit the output
+    CHECK_EQ(aCol, bRow);
+    CHECK_EQ(aRow, outputs[0].shape()[0]);
+    CHECK_EQ(bCol, outputs[0].shape()[1]);
+
+    /// only support C = A * B (ASSIGN_TO) or C += A * B (ADD_TO)
+    real scaleT = (outputs[0].getArgType() == ADD_TO) ? 1.0 : 0.0;
+
+    /// supported: a dense output with at most one sparse input,
+    /// or a sparse output with two dense inputs
+    CHECK((!outputs[0].isSparseArg() &&
+           !(inputs[0].isSparseArg() && inputs[1].isSparseArg())) ||
+          (outputs[0].isSparseArg() && !inputs[0].isSparseArg() &&
+           !inputs[1].isSparseArg()));
+
+    auto outMat = outputs[0].matrix<Device>();  // also built for sparse out
+    /// dense matrix = dense matrix * dense matrix
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = dense matrix * sparse matrix
+    if (!inputs[0].isSparseArg() && inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!aTrans_) << "Not supported a transpose";
+      MulOp<Device>(outMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].sparse().SparseMatrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// dense matrix = sparse matrix * dense matrix
+    if (inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        !outputs[0].isSparseArg()) {
+      CHECK(!bTrans_) << "Not supported b transpose";
+      CHECK_EQ(inputs[0].sparse().dataFormat(), T_SPARSE_CSR)
+          << "Only supported SPARSE_CSR format for sparse matrix a";
+      MulOp<Device>(outMat,
+                    inputs[0].sparse().SparseMatrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+
+    /// sparse matrix = dense matrix * dense matrix (the only case left)
+    auto outSparseMat = outputs[0].sparse().SparseMatrix<Device>();
+    if (!inputs[0].isSparseArg() && !inputs[1].isSparseArg() &&
+        outputs[0].isSparseArg()) {
+      MulOp<Device>(outSparseMat,
+                    inputs[0].matrix<Device>(),
+                    inputs[1].matrix<Device>(),
+                    1.0,  // scaleAB
+                    scaleT,
+                    aTrans_,
+                    bTrans_);
+      return;
+    }
+  }
+
+private:
+  bool aTrans_;  // read from the "aTrans" config entry in init()
+  bool bTrans_;  // read from the "bTrans" config entry in init()
+};
+
+REGISTER_TYPED_FUNC(MulOp, CPU, MulFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(MulOp, GPU, MulFunc);
+#endif
+}  // namespace paddle
diff --git a/paddle/function/MulOp.h b/paddle/function/MulOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6016a6ab6e9d6549b359573ecc2b33900a58365
--- /dev/null
+++ b/paddle/function/MulOp.h
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+/// CPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuSparseMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(CpuMatrix& out,
+           const CpuMatrix& a,
+           const CpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// CPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(CpuSparseMatrix& out,
+           const CpuMatrix& a,
+           const CpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= sparse matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuSparseMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, dense matrix (+)= dense matrix * sparse matrix
+template <DeviceType DType>
+void MulOp(GpuMatrix& out,
+           const GpuMatrix& a,
+           const GpuSparseMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+/// GPU, sparse matrix (+)= dense matrix * dense matrix
+template <DeviceType DType>
+void MulOp(GpuSparseMatrix& out,
+           const GpuMatrix& a,
+           const GpuMatrix& b,
+           real scaleAB,
+           real scaleT,
+           bool aTrans,
+           bool bTrans);
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpGpu.cu b/paddle/function/MulOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..dcfcb2325d7dae22e0e0e78fc0bddf061fc0940c
--- /dev/null
+++ b/paddle/function/MulOpGpu.cu
@@ -0,0 +1,130 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "MulOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+
+namespace paddle {
+/// GPU: dense matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  real* aData = const_cast<real*>(a.getData());
+  real* bData = const_cast<real*>(b.getData());
+  real* outData = const_cast<real*>(out.getData());
+  // Third extent: a's width, or a's height when a is transposed.
+  hl_matrix_mul(aData,
+                aTrans ? HPPL_OP_T : HPPL_OP_N,
+                bData,
+                bTrans ? HPPL_OP_T : HPPL_OP_N,
+                outData,
+                out.getHeight(), out.getWidth(),
+                aTrans ? a.getHeight() : a.getWidth(),
+                scaleAB, scaleT,
+                a.getStride(), b.getStride(), out.getStride());
+}
+
+/// GPU: dense matrix (+)= sparse matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuSparseMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(b.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  // NOTE(review): bTrans is never consulted; b always goes to the kernel as
+  // HPPL_OP_N. Confirm that callers never request a transposed b.
+  real* bData = const_cast<real*>(b.getData());
+  real* outData = const_cast<real*>(out.getData());
+  hl_matrix_csr_mul_dense(a.sMatrix_.get(),
+                          aTrans ? HPPL_OP_T : HPPL_OP_N,
+                          bData, HPPL_OP_N, outData,
+                          out.getHeight(), out.getWidth(), b.getHeight(),
+                          scaleAB,
+                          scaleT);
+}
+
+/// GPU: dense matrix (+)= dense matrix * sparse matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuSparseMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(out.isContiguous());
+  CHECK(a.isContiguous());
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+
+  // NOTE(review): aTrans is never consulted; a always goes to the kernel as
+  // HPPL_OP_N. Confirm that callers never request a transposed a.
+  real* aData = const_cast<real*>(a.getData());
+  real* outData = const_cast<real*>(out.getData());
+  // Dispatch on the storage format of the sparse right-hand operand; any
+  // format other than CSC takes the CSR kernel.
+  if (b.format_ != SPARSE_CSC) {
+    hl_matrix_dense_mul_csr(aData,
+                            HPPL_OP_N,
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            outData,
+                            out.getHeight(), out.getWidth(), a.getWidth(),
+                            scaleAB, scaleT);
+  } else {
+    hl_matrix_dense_mul_csc(aData,
+                            HPPL_OP_N,
+                            b.sMatrix_.get(),
+                            bTrans ? HPPL_OP_T : HPPL_OP_N,
+                            outData,
+                            out.getHeight(), out.getWidth(), a.getWidth(),
+                            scaleAB, scaleT);
+  }
+}
+
+/// GPU: sparse matrix (+)= dense matrix * dense matrix
+template <>
+void MulOp<DEVICE_TYPE_GPU>(GpuSparseMatrix& out,
+                            const GpuMatrix& a,
+                            const GpuMatrix& b,
+                            real scaleAB,
+                            real scaleT,
+                            bool aTrans,
+                            bool bTrans) {
+  CHECK(a.useGpu_ && b.useGpu_) << "matrix device type not match";
+  real* aData = const_cast<real*>(a.getData());
+  real* bData = const_cast<real*>(b.getData());
+  // Third extent: b's height, or b's width when b is transposed.
+  hl_sparse_matrix_mul(aData,
+                       aTrans ? HPPL_OP_T : HPPL_OP_N,
+                       bData,
+                       bTrans ? HPPL_OP_T : HPPL_OP_N,
+                       out.sMatrix_.get(), out.getHeight(), out.getWidth(),
+                       bTrans ? b.getWidth() : b.getHeight(),
+                       scaleAB, scaleT);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8748eb0d79fa0fcb0935eac5bb37b44274128aa0
--- /dev/null
+++ b/paddle/function/MulOpTest.cpp
@@ -0,0 +1,212 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/math/tests/test_matrixUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+/**
+ *  C += A * B, A, B, C dense matrix
+ *  dense = dense * dense
+ */
+void testFuncDDDMatrix(
+    bool transa, bool transb, size_t dimM, size_t dimN, size_t dimK) {
+  real scaleT = 1.0;
+  // A is stored as K x M when transposed, otherwise M x K; likewise B is
+  // stored as N x K when transposed, otherwise K x N.
+  size_t heightA = transa ? dimK : dimM;
+  size_t widthA = transa ? dimM : dimK;
+  size_t heightB = transb ? dimN : dimK;
+  size_t widthB = transb ? dimK : dimN;
+  size_t heightC = dimM;
+  size_t widthC = dimN;
+  // Build the test harness for the "MulOp" function.
+  FunctionCompare test(
+      "MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
+  // Inputs: A (heightA x widthA), then B (heightB x widthB).
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightA, widthA}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightB, widthB}));
+  // Output: C (heightC x widthC); scaleT of 1.0 selects ADD_TO.
+  const auto outputMode = (scaleT == 1.0) ? ADD_TO : ASSIGN_TO;
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{heightC, widthC}),
+                  outputMode);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDDMatrixMul) {
+  LOG(INFO) << "function test for dense = dense * dense matrix";
+  for (const auto transa : {false, true}) {
+    for (const auto transb : {false, true}) {
+      if (transa && transb) {
+        continue;  // the both-transposed combination is not exercised
+      }
+      for (const auto dimM : {1, 10, 100}) {
+        for (const auto dimN : {1, 10}) {
+          for (const auto dimK : {8}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " transa=" << transa << " transb=" << transb
+                    << " dimM=" << std::setw(5) << dimM
+                    << " dimN=" << std::setw(5) << dimN
+                    << " dimK=" << std::setw(5) << dimK;
+            testFuncDDDMatrix(transa, transb, dimM, dimN, dimK);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+  * C += A * B, B, C dense, A sparse
+  * dense = sparse * dense
+  */
+void testFuncDSparseDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // Neither operand is transposed in this scenario.
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // Input A: sparse M x K, built from the given nnz and storage format.
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}, nnz, FORMAT, FLOAT_VALUE));
+  // Input B: dense K x N.
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  // Output C: dense M x N; scaleT of 1.0 selects ADD_TO.
+  const auto outputMode = (scaleT == 1.0) ? ADD_TO : ASSIGN_TO;
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  outputMode);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DSparseDMul) {
+  LOG(INFO) << "function test for dense = sparse * dense matrix";
+  // Only the CSR format is exercised for the sparse left-hand operand.
+  const SparseFormat FORMAT = SPARSE_CSR;
+  for (const auto dimM : {10, 100, 1000}) {
+    for (const auto dimN : {10, 100}) {
+      for (const auto dimK : {3, 10}) {
+        for (const auto nnz : {3, 10}) {
+          VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                  << " dimM=" << std::setw(5) << dimM
+                  << " dimN=" << std::setw(5) << dimN
+                  << " dimK=" << std::setw(5) << dimK
+                  << " nnz=" << std::setw(5) << nnz
+                  << " format=" << std::setw(5) << FORMAT;
+          testFuncDSparseDMatrix(dimM, dimN, dimK, nnz, FORMAT);
+        }
+      }
+    }
+  }
+}
+
+/**
+  * C += A * B, A, C dense, B sparse
+  * dense = dense * sparse
+  */
+void testFuncDDSparseMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // Neither operand is transposed in this scenario.
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // Input A: dense M x K.
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  // Input B: sparse K x N, built from the given nnz and storage format.
+  test.addInputs(SparseMatrixArg(
+      VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}, nnz, FORMAT, FLOAT_VALUE));
+
+  // Output C: dense M x N; scaleT of 1.0 selects ADD_TO.
+  const auto outputMode = (scaleT == 1.0) ? ADD_TO : ASSIGN_TO;
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}),
+                  outputMode);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, DDSparseMul) {
+  LOG(INFO) << "function test for dense = dense * sparse matrix";
+  for (const auto rows : {10, 100, 1000}) {
+    for (const auto cols : {10, 100}) {
+      for (const auto inner : {3, 10}) {
+        for (const auto nonZeros : {3, 10}) {
+          for (const auto fmt : {SPARSE_CSR, SPARSE_CSC}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << rows
+                    << " dimN=" << std::setw(5) << cols
+                    << " dimK=" << std::setw(5) << inner
+                    << " nnz=" << std::setw(5) << nonZeros
+                    << " format=" << std::setw(5) << fmt;
+            testFuncDDSparseMatrix(rows, cols, inner, nonZeros, fmt);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+  * C += A * B, A, B dense, C sparse
+  * sparse = dense * dense
+  */
+void testFuncSparseDDMatrix(
+    size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
+  real scaleT = 1.0;
+  // Neither operand is transposed in this scenario.
+  FunctionCompare test("MulOp",
+                       FuncConfig().set("aTrans", false).set("bTrans", false));
+  // Input A: dense M x K.
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
+
+  // Input B: dense K x N.
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimK, dimN}));
+
+  // Output C: sparse M x N (given nnz/format); scaleT == 1.0 picks ADD_TO.
+  const auto outputMode = (scaleT == 1.0) ? ADD_TO : ASSIGN_TO;
+  test.addOutputs(
+      SparseMatrixArg(
+          VALUE_TYPE_FLOAT, TensorShape{dimM, dimN}, nnz, FORMAT, FLOAT_VALUE),
+      outputMode);
+  // run Function
+  test.run();
+}
+
+TEST(MulOp, SparseDDMul) {
+  LOG(INFO) << "function test for sparse = dense * dense matrix";
+  for (const auto rows : {10, 100, 1000}) {
+    for (const auto cols : {10, 100}) {
+      for (const auto inner : {3, 10}) {
+        for (const auto nonZeros : {3, 10}) {
+          for (const auto fmt : {SPARSE_CSC, SPARSE_CSR}) {
+            VLOG(3) << std::setiosflags(std::ios::left) << std::setfill(' ')
+                    << " dimM=" << std::setw(5) << rows
+                    << " dimN=" << std::setw(5) << cols
+                    << " dimK=" << std::setw(5) << inner
+                    << " nnz=" << std::setw(5) << nonZeros
+                    << " format=" << std::setw(5) << fmt;
+            testFuncSparseDDMatrix(rows, cols, inner, nonZeros, fmt);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1a0d2a1a96f24ddff8cd120681a8bc8cddaf40a
--- /dev/null
+++ b/paddle/function/PadOp.cpp
@@ -0,0 +1,223 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadOp.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void Pad<DEVICE_TYPE_CPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  // Extents of the padded output along the channel, height and width axes.
+  const int outC = inC + pad.channelStart + pad.channelEnd;
+  const int outH = inH + pad.heightStart + pad.heightEnd;
+  const int outW = inW + pad.widthStart + pad.widthEnd;
+  const size_t rowBytes = inW * sizeof(real);
+  for (int n = 0; n < num; ++n) {
+    for (int ch = 0; ch < inC; ++ch) {
+      for (int row = 0; row < inH; ++row) {
+        // Copy one input row to its shifted position in the output.
+        const int src = ((n * inC + ch) * inH + row) * inW;
+        const int dst = ((n * outC + ch + pad.channelStart) * outH + row +
+                         pad.heightStart) * outW + pad.widthStart;
+        memcpy(outputs + dst, inputs + src, rowBytes);
+      }
+    }
+  }
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_CPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  // Extents of the padded gradient along channel, height and width.
+  const int outC = inC + pad.channelStart + pad.channelEnd;
+  const int outH = inH + pad.heightStart + pad.heightEnd;
+  const int outW = inW + pad.widthStart + pad.widthEnd;
+  for (int n = 0; n < num; ++n) {
+    for (int ch = 0; ch < inC; ++ch) {
+      for (int row = 0; row < inH; ++row) {
+        // Accumulate the matching row of outGrad into inGrad.
+        const int src = ((n * outC + ch + pad.channelStart) * outH + row +
+                         pad.heightStart) * outW + pad.widthStart;
+        const int dst = ((n * inC + ch) * inH + row) * inW;
+        // CpuVector(size, ptr) is presumably a non-owning view - TODO confirm.
+        CpuVector inG(inW, inGrad + dst);
+        CpuVector outG(inW, const_cast<real*>(outGrad + src));
+        inG += outG;
+      }
+    }
+  }
+}
+
+/**
+ * \brief Pads zeros to the input along the specified dimensions.
+ *        The struct pad_ contains the padding size in each dimension.
+ *        The input and output are 4D tensors. In PadFunc, we only
+ *        pad zeros to the 2nd to 4th dimensions.
+ *
+ * Argument in this Function:
+ * \param pad_    A struct object contains the padding size in each dimension.
+ *                It has six integers. The channelStart and channelEnd indicate
+ *                how many zeros to add before and after the input in channel
+ *                dimension. And the heightStart and heightEnd indicate padding
+ *                in height dimension. The widthStart and widthEnd indicate the
+ *                padding in width dimension.
+ * \param inputs  A 4D tensor, only one input.
+ * \param outputs A 4D tensor, the output value after padding.
+ *
+ * For example,
+ * Input(2,2,2,3) = [
+ *                    [ [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]] ],
+ *                    [ [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]] ]
+ *                  ] # the shape is (2,2,2,3)
+ *
+ * pad_: if channelStart = channelEnd = 1, others are 0.
+ * Output(2,4,2,3) = [
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[1,2,3], [3,4,5]],
+ *                      [[2,3,5], [1,6,7]],
+ *                      [[0,0,0], [0,0,0]] ],
+ *                    [ [[0,0,0], [0,0,0]],
+ *                      [[4,3,1], [1,8,7]],
+ *                      [[3,8,9], [2,3,5]],
+ *                      [[0,0,0], [0,0,0]] ]
+ *                   ] # the shape is (2,4,2,3)
+ *
+ * pad_: if widthStart = 1, widthEnd = 2, others are 0.
+ * Output(2,2,2,6) = [
+ *                     [ [[0,1,2,3,0,0], [0,3,4,5,0,0]],
+ *                       [[0,2,3,5,0,0], [0,1,6,7,0,0]] ],
+ *                     [ [[0,4,3,1,0,0], [0,1,8,7,0,0]],
+ *                       [[0,3,8,9,0,0], [0,2,3,5,0,0]] ],
+ *                   ] # the shape is (2,2,2,6)
+ *
+ * pad_: if heightStart = 1, heightEnd = 1, others are 0.
+ * Output(2,2,4,3) = [
+ *                     [ [[0,0,0], [1,2,3], [3,4,5], [0,0,0]],
+ *                       [[0,0,0], [2,3,5], [1,6,7], [0,0,0]] ],
+ *                     [ [[0,0,0], [4,3,1], [1,8,7], [0,0,0]],
+ *                       [[0,0,0], [3,8,9], [2,3,5], [0,0,0]] ],
+ *                   ] # the shape is (2,2,4,3)
+ */
+
+template <DeviceType Device>
+class PadFunc : public FunctionBase {
+public:
+  /// Reads the six padding sizes from the function configuration.
+  void init(const FuncConfig& config) override {
+    pad_.channelStart = config.get<int>("cstart");
+    pad_.channelEnd = config.get<int>("cend");
+    pad_.heightStart = config.get<int>("hstart");
+    pad_.heightEnd = config.get<int>("hend");
+    pad_.widthStart = config.get<int>("wstart");
+    pad_.widthEnd = config.get<int>("wend");
+  }
+
+  /// Zero-fills the single output, then copies the single input into it.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    // Output must equal input plus padding, else Pad writes out of bounds.
+    CHECK_EQ(outputs[0].shape()[0], num);
+    CHECK_EQ(outputs[0].shape()[1], inC + pad_.channelStart + pad_.channelEnd);
+    CHECK_EQ(outputs[0].shape()[2], inH + pad_.heightStart + pad_.heightEnd);
+    CHECK_EQ(outputs[0].shape()[3], inW + pad_.widthStart + pad_.widthEnd);
+    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
+                                              outputs[0].data<real>());
+    vec.zero();
+    Pad<Device>(outputs[0].data<real>(), inputs[0].data<real>(),
+                num, inC, inH, inW, pad_);
+  }
+
+private:
+  PadConf pad_;
+};
+
+/**
+ * \brief The backward propagation of padding Function. Remove the elements
+ *        in the padding positions of forward.
+ *
+ * Argument in this Function:
+ * \param pad_    The same meaning as it in PadFunc.
+ * \param inputs  The gradient with respect to the output value of PadFunc.
+ * \param outputs The gradient with respect to the input value of PadFunc.
+ */
+
+template <DeviceType Device>
+class PadGradFunc : public FunctionBase {
+public:
+  /// Reads the six padding sizes from the function configuration.
+  void init(const FuncConfig& config) override {
+    pad_.channelStart = config.get<int>("cstart");
+    pad_.channelEnd = config.get<int>("cend");
+    pad_.heightStart = config.get<int>("hstart");
+    pad_.heightEnd = config.get<int>("hend");
+    pad_.widthStart = config.get<int>("wstart");
+    pad_.widthEnd = config.get<int>("wend");
+  }
+
+  /// Adds the unpadded region of the single input into the single output.
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    size_t num = outputs[0].shape()[0];
+    size_t inC = outputs[0].shape()[1];
+    size_t inH = outputs[0].shape()[2];
+    size_t inW = outputs[0].shape()[3];
+    // Input must equal output plus padding, else PadGrad reads out of bounds.
+    CHECK_EQ(inputs[0].shape()[0], num);
+    CHECK_EQ(inputs[0].shape()[1], inC + pad_.channelStart + pad_.channelEnd);
+    CHECK_EQ(inputs[0].shape()[2], inH + pad_.heightStart + pad_.heightEnd);
+    CHECK_EQ(inputs[0].shape()[3], inW + pad_.widthStart + pad_.widthEnd);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // PadGrad accumulates into the output, so clear it first unless ADD_TO
+      // was requested (the original note says this is for the unit test).
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
+    PadGrad<Device>(outputs[0].data<real>(), inputs[0].data<real>(),
+                    num, inC, inH, inW, pad_);
+  }
+
+private:
+  PadConf pad_;
+};
+
+REGISTER_TYPED_FUNC(Pad, CPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, CPU, PadGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(Pad, GPU, PadFunc);
+REGISTER_TYPED_FUNC(PadGrad, GPU, PadGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/PadOp.h b/paddle/function/PadOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b5c730a6a0fa57833e63beba085cb17054ae2f5
--- /dev/null
+++ b/paddle/function/PadOp.h
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+struct PadConf {
+  /// Number of padded entries placed before the data on the channel axis.
+  int channelStart;
+  /// Number of padded entries placed after the data on the channel axis.
+  int channelEnd;
+  /// Number of padded entries placed before the data on the height axis.
+  int heightStart;
+  /// Number of padded entries placed after the data on the height axis.
+  int heightEnd;
+  /// Number of padded entries placed before the data on the width axis.
+  int widthStart;
+  /// Number of padded entries placed after the data on the width axis.
+  int widthEnd;
+};
+
+/**
+ * \brief  This function pads zeros to inputs along the specified dimensions.
+ *         The input and output are 4D tensors. Zeros are padded from the 2nd
+ *         to the 4th dimension according to the pad argument.
+ *
+ * \param[out] outputs save results.
+ * \param[in]  inputs  input data.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  pad     the padding config, contains the size along the
+ *                     specified dimensions.
+ */
+template <DeviceType Device>
+void Pad(real* outputs,
+         const real* inputs,
+         const int num,
+         const int inC,
+         const int inH,
+         const int inW,
+         const PadConf& pad);
+
+/**
+ * \brief   Padding operation backward.
+ *
+ * \param[out] inGrad  gradients of previous layer.
+ * \param[in]  outGrad output gradients.
+ * \param[in]  num     batch size of input data.
+ * \param[in]  inC     channel number of input data.
+ * \param[in]  inH     height of input data.
+ * \param[in]  inW     width of input data.
+ * \param[in]  pad     the padding config, contains the size along the
+ *                     specified dimensions.
+ */
+template <DeviceType Device>
+void PadGrad(real* inGrad,
+             const real* outGrad,
+             const int num,
+             const int inC,
+             const int inH,
+             const int inW,
+             const PadConf& pad);
+}  // namespace paddle
diff --git a/paddle/function/PadOpGpu.cu b/paddle/function/PadOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9104b1aca507c526858c2117e0a5db59f535091e
--- /dev/null
+++ b/paddle/function/PadOpGpu.cu
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "PadOp.h"
+
+namespace paddle {
+
+__global__ void KePad(real* outputs, const real* inputs,
+                      int inC, int inH, int inW,
+                      int padc, int padh, int padw,
+                      int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nthreads) return;
+  // One thread per input element: split the flat index into (n, c, h, w).
+  const int w = idx % inW;
+  const int h = (idx / inW) % inH;
+  const int c = (idx / inW / inH) % inC;
+  const int n = idx / inW / inH / inC;
+  // Same coordinates in the padded output, shifted by the leading pads.
+  const int dst = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
+  outputs[dst] = inputs[idx];
+}
+
+template <>
+void Pad<DEVICE_TYPE_GPU>(real* outputs,
+                          const real* inputs,
+                          const int num,
+                          const int inC,
+                          const int inH,
+                          const int inW,
+                          const PadConf& pad) {
+  // One thread per input element; int matches the kernel's nthreads type.
+  const int nth = num * inC * inH * inW;
+  if (nth == 0) return;  // nothing to copy; a zero-block launch is invalid
+  const int blockSize = 1024;
+  const int gridSize = (nth - 1) / blockSize + 1;  // ceil without overflow
+  // Extents of the padded output along channel, height and width.
+  const int outC = inC + pad.channelStart + pad.channelEnd;
+  const int outH = inH + pad.heightStart + pad.heightEnd;
+  const int outW = inW + pad.widthStart + pad.widthEnd;
+  KePad<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      outputs, inputs, inC, inH, inW, pad.channelStart, pad.heightStart,
+      pad.widthStart, outC, outH, outW, nth);
+  CHECK_SYNC("Pad");
+}
+
+__global__ void KePadDiff(real* inGrad, const real* outGrad,
+                          int inC, int inH, int inW,
+                          int padc, int padh, int padw,
+                          int outC, int outH, int outW, int nthreads) {
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= nthreads) return;
+  // One thread per inGrad element: split the flat index into (n, c, h, w).
+  const int w = idx % inW;
+  const int h = (idx / inW) % inH;
+  const int c = (idx / inW / inH) % inC;
+  const int n = idx / inW / inH / inC;
+  // Accumulate from the same coordinates in the padded outGrad.
+  const int src = ((n * outC + c + padc) * outH + h + padh) * outW + padw + w;
+  inGrad[idx] += outGrad[src];
+}
+
+template <>
+void PadGrad<DEVICE_TYPE_GPU>(real* inGrad,
+                              const real* outGrad,
+                              const int num,
+                              const int inC,
+                              const int inH,
+                              const int inW,
+                              const PadConf& pad) {
+  // One thread per inGrad element.
+  const int nth = num * inC * inH * inW;
+  if (nth == 0) return;  // nothing to add; a zero-block launch is invalid
+  const int blockSize = 1024;
+  const int gridSize = (nth - 1) / blockSize + 1;  // ceil without overflow
+  // Extents of the padded gradient along channel, height and width.
+  const int outC = inC + pad.channelStart + pad.channelEnd;
+  const int outH = inH + pad.heightStart + pad.heightEnd;
+  const int outW = inW + pad.widthStart + pad.widthEnd;
+  KePadDiff<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
+      inGrad, outGrad, inC, inH, inW, pad.channelStart, pad.heightStart,
+      pad.widthStart, outC, outH, outW, nth);
+  CHECK_SYNC("PadGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..cd22d9113567912f7694e05e5d631e49d940e3ac
--- /dev/null
+++ b/paddle/function/PadOpTest.cpp
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+TEST(Pad, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+
+          // The pads sum to 5 on the channel axis, 3 on the height axis and
+          // 5 on the width axis.
+          FuncConfig config;
+          config.set("cstart", 2).set("cend", 3);
+          config.set("hstart", 1).set("hend", 2);
+          config.set("wstart", 3).set("wend", 2);
+          FunctionCompare compare("Pad", config);
+          // The output shape is the input shape grown by the summed pads.
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims));
+          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO));
+          compare.run();
+        }
+      }
+    }
+  }
+}
+
+TEST(PadGrad, real) {
+  for (size_t numSamples : {5, 32}) {
+    for (size_t channels : {1, 5, 32}) {
+      for (size_t imgSizeH : {5, 33, 100}) {
+        for (size_t imgSizeW : {5, 32, 96}) {
+          VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                  << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
+          // The pads sum to 5 on the channel axis, 3 on the height axis and
+          // 5 on the width axis.
+          FuncConfig config;
+          config.set("cstart", 2).set("cend", 3);
+          config.set("hstart", 1).set("hend", 2);
+          config.set("wstart", 3).set("wend", 2);
+          FunctionCompare compare("PadGrad", config);
+          // The padded shape is fed as input; the unpadded one is the output.
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
+          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
+          compare.run();
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
new file mode 100644
index 0000000000000000000000000000000000000000..cda58f19dfa4a8b80efc97570c83ca38fd7adf27
--- /dev/null
+++ b/paddle/function/TensorShape.h
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+
+namespace paddle {
+
+/**
+ * TensorShape used to represent shape of normal tensor.
+ */
+class TensorShape {
+public:
+  // Empty shape: zero dimensions and zero elements.
+  TensorShape() : ndims_(0), nelements_(0) { initDims(0); }
+
+  // Shape with ndims dimensions, each of size 1 (one element in total).
+  TensorShape(size_t ndims) : ndims_(ndims), nelements_(1) { initDims(ndims); }
+
+  // Shape whose dimension sizes are listed in dims.
+  TensorShape(std::initializer_list<size_t> dims)
+      : ndims_(dims.size()), nelements_(1), dims_(dims) {
+    numElements();
+  }
+
+  // Copy of another shape.
+  TensorShape(const TensorShape& t)
+      : ndims_(t.ndims_), nelements_(t.nelements_), dims_(t.dims_) {}
+
+  // get the size of specified dimension
+  size_t operator[](size_t dim) const {
+    CHECK_LT(dim, ndims_);  // dim is unsigned: no lower-bound check needed
+    return dims_[dim];
+  }
+
+  // set the size of specified dimension and refresh the element count
+  void setDim(size_t dim, size_t size) {
+    CHECK_LT(dim, ndims_);  // dim is unsigned: no lower-bound check needed
+    dims_[dim] = size;
+    numElements();
+  }
+
+  // replace all dimensions with the sizes listed in dims
+  void reshape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    dims_.assign(dims);  // assign() also resizes dims_ to dims.size()
+    numElements();
+  }
+
+  // number of dimensions of the tensor
+  size_t ndims() const { return ndims_; }
+
+  // total number of elements
+  size_t getElements() const { return nelements_; }
+
+  // shapes are equal when they have the same rank and the same sizes
+  bool operator==(const TensorShape& t) const {
+    if (ndims() != t.ndims()) return false;
+    for (size_t i = 0; i < ndims(); i++) {
+      if (dims_[i] != t.dims_[i]) return false;
+    }
+
+    return true;
+  }
+
+  bool operator!=(const TensorShape& t) const { return !(*this == t); }
+
+private:
+  // recompute nelements_ as the product of the first ndims_ entries of dims_
+  void numElements() {
+    nelements_ = 1;
+    for (size_t n = 0; n < ndims_; n++) {
+      nelements_ *= dims_[n];
+    }
+  }
+
+  // fill dims_ with 1s, using at least kMinDims entries
+  void initDims(size_t ndims) {
+    size_t count = ndims < kMinDims ? kMinDims : ndims;
+    dims_.assign(count, 1);
+  }
+
+  // number of dimensions
+  // ndims_ may be smaller than dims_.size(): initDims() pads dims_ to at
+  // least kMinDims entries, so accesses are bounded by ndims_ instead
+  size_t ndims_;
+  // number of elements
+  size_t nelements_;
+  // size of each dimension; only the first ndims_ entries are meaningful
+  std::vector<size_t> dims_;
+  // minimum number of entries initDims() puts in dims_
+  static const size_t kMinDims = 4;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorShapeTest.cpp b/paddle/function/TensorShapeTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..45a2e106e7fc3f0e9e57cf8c2bb549d747f4f49b
--- /dev/null
+++ b/paddle/function/TensorShapeTest.cpp
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorShape.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorShape, Constructor) {
+  TensorShape t1;  // default: no dimensions, no elements
+  EXPECT_EQ(t1.ndims(), 0);
+  EXPECT_EQ(t1.getElements(), 0);
+
+  TensorShape t2(3);  // rank only: every dimension starts at size 1
+  EXPECT_EQ(t2.ndims(), 3);
+  EXPECT_EQ(t2.getElements(), 1);
+
+  TensorShape t3({8, 10});  // explicit sizes: 8 * 10 = 80 elements
+  EXPECT_EQ(t3.ndims(), 2);
+  EXPECT_EQ(t3.getElements(), 80);
+
+  TensorShape t4(t3);  // copy keeps the rank and the element count
+  EXPECT_EQ(t4.ndims(), t3.ndims());
+  EXPECT_EQ(t4.getElements(), t3.getElements());
+
+  TensorShape t5({1, 2, 3, 4, 5});  // rank above TensorShape's kMinDims of 4
+  EXPECT_EQ(t5.ndims(), 5);
+  EXPECT_EQ(t5.getElements(), 120);
+}
+
+TEST(TensorShape, GetAndSet) {
+  TensorShape t({1, 2, 3});
+  EXPECT_EQ(t.ndims(), 3);
+  EXPECT_EQ(t.getElements(), 6);
+
+  EXPECT_EQ(t[1], 2);
+  t.setDim(1, 100);  // setDim must also refresh the element count
+  EXPECT_EQ(t.getElements(), 300);  // 1 * 100 * 3
+  EXPECT_EQ(t[1], 100);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/TensorType.h b/paddle/function/TensorType.h
new file mode 100644
index 0000000000000000000000000000000000000000..8308bbd8ad4fe1b97b35b779f27d2bf4534f0fa6
--- /dev/null
+++ b/paddle/function/TensorType.h
@@ -0,0 +1,149 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+enum ValueType {  // element type tag; DataType<T> maps C++ types to these
+  VALUE_TYPE_INT32 = 0,
+  VALUE_TYPE_FLOAT = 1,
+  VALUE_TYPE_DOUBLE = 2,
+  VALUE_TYPE_BYTE = 3  // not handled by sizeOfValuType() or DataType<T>
+};
+
+enum DeviceType {  // selects the CPU or GPU specialization in namespace detail
+  DEVICE_TYPE_UNSPECIFIED = 0,  // no detail:: specialization exists for it
+  DEVICE_TYPE_CPU = 1,
+  DEVICE_TYPE_GPU = 2
+};
+
+enum SparseDataType { T_NO_VALUE = 0, T_FLOAT_VALUE = 1 };
+
+enum SparseDataFormat { T_SPARSE_CSR = 0, T_SPARSE_CSC = 1 };
+
+inline int sizeOfValuType(ValueType valueType) {  // bytes per element
+  switch (valueType) {
+    case VALUE_TYPE_INT32:
+    case VALUE_TYPE_FLOAT:
+      return 4;
+    case VALUE_TYPE_DOUBLE:
+      return 8;
+    default:  // any other tag, including VALUE_TYPE_BYTE, is fatal
+      LOG(FATAL) << "Unknown type: " << valueType;
+      return 0;
+  }
+}
+
+template <typename T>  // maps a C++ element type to its ValueType tag
+struct DataType;  // declared only: any other T is an incomplete type
+
+template <>
+struct DataType<float> {
+  static const ValueType value = VALUE_TYPE_FLOAT;
+};
+
+template <>
+struct DataType<double> {
+  static const ValueType value = VALUE_TYPE_DOUBLE;
+};
+
+template <>
+struct DataType<int> {
+  static const ValueType value = VALUE_TYPE_INT32;
+};
+
+namespace detail {  // type traits backing Tensor<VType, DType>
+
+template <typename VType, DeviceType Device>
+struct MatrixT;  // dense matrix class for a (value type, device) pair
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuMatrix;
+};
+
+template <>
+struct MatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuMatrix;
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct MatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct SparseMatrixT;  // sparse matrix class for a (value type, device) pair
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_CPU> {
+  using type = CpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<real, DEVICE_TYPE_GPU> {
+  using type = GpuSparseMatrix;
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_CPU> {
+  using type = void;  // Not implemented
+};
+
+template <>
+struct SparseMatrixT<int, DEVICE_TYPE_GPU> {
+  using type = void;  // Not implemented
+};
+
+template <typename VType, DeviceType Device>
+struct VectorT;  // vector class for a (value type, device) pair
+
+template <>
+struct VectorT<real, DEVICE_TYPE_CPU> {
+  using type = CpuVector;
+};
+
+template <>
+struct VectorT<real, DEVICE_TYPE_GPU> {
+  using type = GpuVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct VectorT<int, DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
+}  // namespace detail
+
+template <typename VType, DeviceType DType>
+struct Tensor {  // bundles the vector/matrix types for one (type, device) pair
+  typedef typename detail::VectorT<VType, DType>::type Vector;
+  typedef typename detail::MatrixT<VType, DType>::type Matrix;  // void for int
+  typedef typename detail::SparseMatrixT<VType, DType>::type SparseMatrix;
+};
+
+}  // namespace paddle
diff --git a/paddle/function/TensorTypeTest.cpp b/paddle/function/TensorTypeTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e50e46f3e99111731d9587f3e4ddfd4b26ae27e9
--- /dev/null
+++ b/paddle/function/TensorTypeTest.cpp
@@ -0,0 +1,64 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "TensorType.h"
+#include <gtest/gtest.h>
+
+namespace paddle {
+
+TEST(TensorType, Matrix) {
+  Tensor<real, DEVICE_TYPE_CPU>::Matrix matrix(100, 200);  // CpuMatrix
+  EXPECT_EQ(matrix.getHeight(), 100);
+  EXPECT_EQ(matrix.getWidth(), 200);
+  EXPECT_EQ(matrix.getElementCnt(), 100 * 200);
+  EXPECT_EQ(matrix.useGpu(), false);
+
+  Tensor<real, DEVICE_TYPE_GPU>::Matrix testGpu(100, 200);  // GpuMatrix
+  EXPECT_EQ(testGpu.useGpu(), true);  // NOTE(review): confirm CPU-only builds
+}
+
+TEST(TensorType, Vector) {
+  Tensor<real, DEVICE_TYPE_CPU>::Vector cpuVector(100);  // CpuVector
+  Tensor<real, DEVICE_TYPE_GPU>::Vector gpuVector(100);  // GpuVector
+  EXPECT_EQ(cpuVector.useGpu(), false);
+  EXPECT_EQ(gpuVector.useGpu(), true);
+  EXPECT_EQ(cpuVector.getSize(), 100);
+  EXPECT_EQ(gpuVector.getSize(), 100);
+
+  Tensor<int, DEVICE_TYPE_CPU>::Vector cpuIVector(100);  // CpuIVector
+  Tensor<int, DEVICE_TYPE_GPU>::Vector gpuIVector(100);  // GpuIVector
+  EXPECT_EQ(cpuIVector.useGpu(), false);
+  EXPECT_EQ(gpuIVector.useGpu(), true);
+  EXPECT_EQ(cpuIVector.getSize(), 100);
+  EXPECT_EQ(gpuIVector.getSize(), 100);
+}
+
+TEST(TensorType, EmptyMatrix) {
+  CpuMatrix empty(nullptr, 0, 0);  // null data; 0, 0 presumably height, width
+  CpuMatrix nonEmpty(10, 10);
+  EXPECT_EQ(empty.isEmpty(), true);
+  EXPECT_EQ(nonEmpty.isEmpty(), false);
+  CHECK(nonEmpty);  // boolean-context use of the matrix, as in the lambda
+  auto function = [](const CpuMatrix& matrix) {
+    if (matrix) {  // true branch expects data, false branch expects none
+      EXPECT_NE(matrix.getData(), nullptr);
+    } else {
+      EXPECT_EQ(matrix.getData(), nullptr);
+    }
+  };
+  function(empty);
+  function(nonEmpty);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/cross_map_normal_op.cpp b/paddle/function/cross_map_normal_op.cpp
deleted file mode 100644
index a9c7693830542f0e0d852f629d210b92a5bf2069..0000000000000000000000000000000000000000
--- a/paddle/function/cross_map_normal_op.cpp
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "cross_map_normal_op.h"
-#include "paddle/math/Vector.h"
-
-namespace paddle {
-
-template <>
-void CrossMapNormal<DEVICE_TYPE_CPU>(real* outputs,
-                                     real* denoms,
-                                     const real* inputs,
-                                     size_t numSamples,
-                                     size_t channels,
-                                     size_t height,
-                                     size_t width,
-                                     size_t size,
-                                     real scale,
-                                     real pow) {
-  size_t oneImage = height * width;
-  size_t oneSample = channels * oneImage;
-
-  CpuVector outputsV(numSamples * oneSample, outputs);
-  CpuVector inputsV(numSamples * oneSample, const_cast<real*>(inputs));
-  CpuVector denomsV(numSamples * oneSample, denoms);
-
-  // f(x) = x * ( 1 + scale * SUM((x)^2) )^(-pow)
-  // x represents inputs
-  // f(x) represents outputs
-  // denoms save the intermediate result for backward
-  denomsV = denomsV.constant(1.0);
-  const int start = -((int)size - 1) / 2;
-  const int end = (int)size + start;
-  for (size_t i = 0; i < numSamples; i++) {
-    real* oneDenom = denoms + i * oneSample;
-    real* oneInput = const_cast<real*>(inputs) + i * oneSample;
-    for (int c = 0; c < (int)channels; c++) {
-      CpuVector denom(oneImage, oneDenom + c * oneImage);
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          CpuVector input(oneImage, oneInput + (c + s) * oneImage);
-          denom += input.square() * scale;
-        }
-      }
-    }
-  }
-
-  outputsV = inputsV * denomsV.pow(-pow);
-}
-
-template <>
-void CrossMapNormalGrad<DEVICE_TYPE_CPU>(real* inputsGrad,
-                                         const real* inputsValue,
-                                         const real* outputsValue,
-                                         const real* outputsGrad,
-                                         const real* denoms,
-                                         size_t numSamples,
-                                         size_t channels,
-                                         size_t height,
-                                         size_t width,
-                                         size_t size,
-                                         real scale,
-                                         real pow) {
-  size_t oneSample = channels * height * width;
-  std::function<CpuVector(real*, size_t)> oneImage = [=](real* data,
-                                                         size_t offset) {
-    return CpuVector(height * width, data + offset);
-  };
-
-  const int start = -((int)size) / 2;
-  const int end = (int)size + start;
-  const real ratio = -(real)2 * scale * pow;
-  for (size_t i = 0; i < numSamples; i++) {
-    size_t sOffset = i * oneSample;
-    real* oneInputGrad = inputsGrad + sOffset;
-    real* oneInputValue = const_cast<real*>(inputsValue) + sOffset;
-    real* oneDenom = const_cast<real*>(denoms) + sOffset;
-    real* oneOutputGrad = const_cast<real*>(outputsGrad) + sOffset;
-    real* oneOutputValue = const_cast<real*>(outputsValue) + sOffset;
-
-    for (int c = 0; c < (int)channels; c++) {
-      size_t cOffset = c * height * width;
-      CpuVector inputGrad = oneImage(oneInputGrad, cOffset);
-      CpuVector inputValue = oneImage(oneInputValue, cOffset);
-      CpuVector denom = oneImage(oneDenom, cOffset);
-      CpuVector outputGrad = oneImage(oneOutputGrad, cOffset);
-
-      inputGrad = inputGrad + denom.pow(-pow) * outputGrad;
-      for (int s = start; s < end; s++) {
-        if (c + s >= 0 && c + s < (int)channels) {
-          size_t offset = (c + s) * height * width;
-          CpuVector output = oneImage(oneOutputValue, offset);
-          CpuVector outputGrad = oneImage(oneOutputGrad, offset);
-          CpuVector denom = oneImage(oneDenom, offset);
-
-          inputGrad += ((outputGrad * output * ratio) / denom) * inputValue;
-        }
-      }
-    }
-  }
-}
-
-/**
- * \param inputs[0] input value.
- * \param outputs[0] output value.
- * \param outputs[1] denoms.
- */
-template <DeviceType Device>
-class CrossMapNormalFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-  }
-
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(1, inputs.size());
-    CHECK_EQ(2, outputs.size());
-    CHECK_EQ(0, inouts.size());
-
-    CHECK_EQ(inputs[0].dims_.size(), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
-    }
-
-    size_t samples = inputs[0].dims_[0];
-    size_t channels = inputs[0].dims_[1];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
-
-    CrossMapNormal<Device>(outputs[0].getData(),
-                           outputs[1].getData(),
-                           inputs[0].getData(),
-                           samples,
-                           channels,
-                           height,
-                           width,
-                           size_,
-                           scale_,
-                           pow_);
-  }
-
-private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-/**
- * \param inputs[0] input value.
- * \param inputs[1] output value.
- * \param inputs[2] output grad.
- * \param inputs[3] denoms.
- * \param outputs[0] input grad.
- */
-template <DeviceType Device>
-class CrossMapNormalGradFunc : public FunctionBase {
-public:
-  void init(const FuncConfig& config) override {
-    size_ = config.get<size_t>("size");
-    scale_ = config.get<real>("scale");
-    pow_ = config.get<real>("pow");
-  }
-
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(4, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
-
-    CHECK_EQ(inputs[0].dims_.size(), 4);
-    for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
-      CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], inputs[3].dims_[i]);
-      CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
-    }
-
-    size_t samples = inputs[0].dims_[0];
-    size_t channels = inputs[0].dims_[1];
-    size_t height = inputs[0].dims_[2];
-    size_t width = inputs[0].dims_[3];
-
-    CrossMapNormalGrad<Device>(outputs[0].getData(),
-                               inputs[0].getData(),
-                               inputs[1].getData(),
-                               inputs[2].getData(),
-                               inputs[3].getData(),
-                               samples,
-                               channels,
-                               height,
-                               width,
-                               size_,
-                               scale_,
-                               pow_);
-  }
-
-private:
-  size_t size_;
-  real scale_;
-  real pow_;
-};
-
-REGISTER_TYPED_FUNC(CrossMapNormal, CPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, CPU, CrossMapNormalGradFunc);
-#ifndef PADDLE_ONLY_CPU
-REGISTER_TYPED_FUNC(CrossMapNormal, GPU, CrossMapNormalFunc);
-REGISTER_TYPED_FUNC(CrossMapNormalGrad, GPU, CrossMapNormalGradFunc);
-#endif
-
-}  // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f8c4bcac2f8eb41400659dc24ba81768e7ae3640..c541b72e104bf2b81e2ac222d4af13ea2f90d289 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -69,8 +69,14 @@ static ClassRegistrar<ActivationFunction> gActivationRegistrar;
 class IdentityActivation : public ActivationFunction {
 public:
   static const std::string name;
-  void forward(Argument& act) { (void)act; }
-  void backward(Argument& act) { (void)act; }
+  Error __must_check forward(Argument& act) {
+    (void)act;
+    return Error();
+  }
+  Error __must_check backward(Argument& act) {
+    (void)act;
+    return Error();
+  }
   const std::string& getName() const { return name; }
 };
 const std::string IdentityActivation::name = "";
@@ -86,8 +92,14 @@ static InitFunction __reg_activation__identity([] {
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(sigmoid)
-void forward(Argument& act) { act.value->sigmoid(*act.value); }
-void backward(Argument& act) { act.grad->sigmoidDerivative(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->sigmoid(*act.value);
+  return Error();
+}
+Error __must_check backward(Argument& act) {
+  act.grad->sigmoidDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(sigmoid)
 
 /**
@@ -103,9 +115,12 @@ MatrixPtr sftMaxDot_;
 MatrixPtr one_;
 
 public:
-void forward(Argument& act) { act.value->softmax(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->softmax(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   MatrixPtr outputV = act.value;
   MatrixPtr outputG = act.grad;
 
@@ -137,6 +152,7 @@ void backward(Argument& act) {
 
     act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
   }
+  return Error();
 }
 END_DEFINE_ACTIVATION(softmax)
 
@@ -151,8 +167,11 @@ ACTIVATION_CLASS_NAME(softmax) softmax_;
 Argument argument_;
 
 public:
-void forward(Argument& act) {
-  CHECK_EQ(act.value->getWidth(), 1UL);
+Error __must_check forward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   if (!argument_.value) {
     argument_.value = Matrix::create(nullptr,
@@ -169,10 +188,14 @@ void forward(Argument& act) {
 
   auto starts = act.sequenceStartPositions->getVector(useGpu(act.deviceId));
   act.value->sequenceSoftmax(*act.value, *starts);
+  return Error();
 }
 
-void backward(Argument& act) {
-  CHECK_EQ(act.grad->getWidth(), 1UL);
+Error __must_check backward(Argument& act) {
+  if (act.value->getWidth() != 1UL) {
+    return Error(
+        "Input width for each timestep of sequence softmax should be 1");
+  }
 
   size_t numSequences = act.getNumSequences();
   const int* starts = act.sequenceStartPositions->getData(false);
@@ -184,8 +207,10 @@ void backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    softmax_.backward(argument_);
+    Error status = softmax_.backward(argument_);
+    if (!status) return status;
   }
+  return Error();
 }
 END_DEFINE_ACTIVATION(sequence_softmax)
 
@@ -200,9 +225,15 @@ END_DEFINE_ACTIVATION(sequence_softmax)
  *    0 otherwise.
  */
 BEGIN_DEFINE_ACTIVATION(relu)
-void forward(Argument& act) { act.value->relu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->relu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->reluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->reluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(relu)
 
 /**
@@ -219,9 +250,15 @@ END_DEFINE_ACTIVATION(relu)
  * TODO(yuyang18): Remove magic number 24 or make it configuable.
  */
 BEGIN_DEFINE_ACTIVATION(brelu)
-void forward(Argument& act) { act.value->brelu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->brelu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->breluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->breluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(brelu)
 
 /**
@@ -231,9 +268,15 @@ END_DEFINE_ACTIVATION(brelu)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(tanh)
-void forward(Argument& act) { act.value->tanh(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->tanh(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->tanhDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->tanhDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(tanh)
 
 /**
@@ -248,10 +291,14 @@ real a, b;
 
 public:
 ACTIVATION_CLASS_NAME(stanh)() : a(1.7159), b(2. / 3.) {}
-void forward(Argument& act) { act.value->scaledTanh(*act.value, a, b); }
+Error __must_check forward(Argument& act) {
+  act.value->scaledTanh(*act.value, a, b);
+  return Error();
+}
 
-void backward(Argument& act) {
+Error __must_check backward(Argument& act) {
   act.grad->scaledTanhDerivative(*act.value, a, b);
+  return Error();
 }
 END_DEFINE_ACTIVATION(stanh)
 
@@ -262,9 +309,15 @@ END_DEFINE_ACTIVATION(stanh)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(softrelu)
-void forward(Argument& act) { act.value->softrelu(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->softrelu(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->softreluDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->softreluDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(softrelu)
 
 /**
@@ -280,7 +333,7 @@ END_DEFINE_ACTIVATION(softrelu)
  *     0   if z=0
  */
 BEGIN_DEFINE_ACTIVATION(abs)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -290,9 +343,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->abs2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->absDerivative(*act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->absDerivative(*act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(abs)
 
 /**
@@ -302,7 +359,7 @@ END_DEFINE_ACTIVATION(abs)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(square)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -312,9 +369,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->square2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->squareDerivative(*act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(square)
 
 /**
@@ -324,9 +385,15 @@ END_DEFINE_ACTIVATION(square)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(exponential)
-void forward(Argument& act) { act.value->exp2(*act.value); }
+Error __must_check forward(Argument& act) {
+  act.value->exp2(*act.value);
+  return Error();
+}
 
-void backward(Argument& act) { act.grad->expDerivative(*act.value); }
+Error __must_check backward(Argument& act) {
+  act.grad->expDerivative(*act.value);
+  return Error();
+}
 END_DEFINE_ACTIVATION(exponential)
 
 /**
@@ -336,7 +403,7 @@ END_DEFINE_ACTIVATION(exponential)
  * \f]
  */
 BEGIN_DEFINE_ACTIVATION(log)
-void forward(Argument& act) {
+Error __must_check forward(Argument& act) {
   SetDevice device(act.deviceId);
   Matrix::resizeOrCreate(act.in,
                          act.value->getHeight(),
@@ -346,9 +413,13 @@ void forward(Argument& act) {
 
   act.in->copyFrom(*act.value);
   act.value->log2(*act.value);
+  return Error();
 }
 
-void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
+Error __must_check backward(Argument& act) {
+  act.grad->dotDiv(*act.grad, *act.in);
+  return Error();
+}
 END_DEFINE_ACTIVATION(log)
 
 ActivationFunction* ActivationFunction::create(const std::string& type) {
diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h
index 601e3b6c0cd401ec007e8cf51e44416f82832e58..f208224e304a79125679c6f3a5c0be09552465ef 100644
--- a/paddle/gserver/activations/ActivationFunction.h
+++ b/paddle/gserver/activations/ActivationFunction.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include <string>
 #include <vector>
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -48,7 +49,7 @@ public:
    *
    * Usually, act is Layer::output_
    */
-  virtual void forward(Argument& act) = 0;
+  virtual Error __must_check forward(Argument& act) = 0;
 
   /**
    * @brief Backward propagaion
@@ -57,7 +58,7 @@ public:
    * - Before calling backward(), act.grad = dE / dy, where E is the error/cost
    * - After backward() returns, act.grad = dE / dx = (dE/dy) * (dy/dx)
    */
-  virtual void backward(Argument& act) = 0;
+  virtual Error __must_check backward(Argument& act) = 0;
 
   virtual const std::string& getName() const = 0;
 };
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 5f031fc7c0761a8fe97eb16fe1dd8e0a1debfcdb..9a2ad7567f0dc93d0a8e396fd88b2488afe9d049 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -30,12 +30,12 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 5bdd55309c8bf8d5dcf84f5dcef2c5c85249a668..b53790e764b9f9ad668abd1f4125695e3533a027 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PyDataProvider.h"
-#include <fenv.h>
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index c26e242534f2afcff396762adb085bf99303e2b5..b8079dc0796d0e300e65ac6b6b8d3bc826b1e504 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -647,7 +647,7 @@ public:
       DataBatch& gpuBatch = *batch;
       std::vector<Argument>& gpuArguments = gpuBatch.getStreams();
       gpuArguments.resize(cpuArguments.size());
-      gpuBatch.setSize(size);
+      gpuBatch.setSize(bsize);
       for (size_t i = 0; i < headers_.size(); ++i) {
         gpuArguments[i].resizeAndCopyFrom(
             cpuArguments[i], useGpu_, HPPL_STREAM_1);
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 05aa6c012ae2bc0afcbaf23f8ff78b3c782d050c..132119015f967c6e8d055792de8afe8450df5ec6 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
  * calculate sequence-to-sequence edit distance
  */
-class CTCErrorEvaluator : public Evaluator {
+class CTCErrorEvaluator : public NotGetableEvaluator {
 private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index ae7508e2bb117a60492e0c28230f2fbb4b14915e..9db6d252d97bfeee3fe376bcda431fe94c65a678 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/utils/Stat.h"
-
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
 
 DECLARE_int32(trainer_id);
 
@@ -39,6 +39,14 @@ void Evaluator::eval(const NeuralNetwork& nn) {
  */
 class ClassificationErrorEvaluator : public Evaluator {
 public:
+  /*
+  ClassificationErrorEvaluator() : totalScore2_(0) {}
+
+  virtual void start() {
+    Evaluator::start();
+    totalScore2_ = 0;
+    } */
+
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
     if (3 == arguments.size()) {
       numSamples_ += arguments[2].value->getSum();
@@ -76,9 +84,11 @@ public:
                                               1,
                                               /* trans= */ false,
                                               useGpu(arguments[0].deviceId));
+
     errorMat->zeroMem();
+
     if (label != nullptr) {
-      errorMat->classificationError(*output, *label);
+      errorMat->classificationError(*output, *label, config_.top_k());
     } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
                dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
       errorMat->classificationErrorMulti(
@@ -94,6 +104,16 @@ public:
     return errorMat;
   }
 
+  void printStats(std::ostream& os) const {
+    if (config_.top_k() == 1) {
+      os << config_.name() << "="
+         << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    } else {
+      os << " top_" << config_.top_k()
+         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    }
+  }
+
   virtual real evalImp(std::vector<Argument>& arguments) {
     MatrixPtr errorMat = calcError(arguments);
     return errorMat->getSum();
@@ -102,6 +122,10 @@ public:
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "classification_error"; }
 };
 
 /**
@@ -140,6 +164,10 @@ public:
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "seq_classification_error"; }
 };
 REGISTER_EVALUATOR(seq_classification_error,
                    SequenceClassificationErrorEvaluator);
@@ -230,6 +258,10 @@ public:
 private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "sum"; }
 };
 /**
  * @brief column sum Evaluator
@@ -337,10 +369,18 @@ public:
   }
 
 private:
-  ColumnSumEvaluator() {}
   int32_t colIdx_;
   size_t colNum_;
   MatrixPtr sum_; /* cpu matrix */
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const {
+    if (colIdx_ == -1)
+      return "last-column-sum";
+    else
+      return "column-sum";
+  }
 };
 
 void AucEvaluator::start() {
@@ -449,6 +489,16 @@ double AucEvaluator::calcAuc() const {
   }
 }
 
+real AucEvaluator::getValueImpl() const { return calcAuc(); }
+
+std::string AucEvaluator::getTypeImpl() const {
+  if (colIdx_ == -1) {
+    return "last-column-auc";
+  } else {
+    return "auc";
+  }
+}
+
 // class RankAucEvaluator
 REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
 
@@ -528,12 +578,15 @@ double RankAucEvaluator::calcRankAuc(real* outputData,
                                         : aucTmp / (clickSum * noClickSum);
 }
 
+std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
+
 // class PrecisionRecallEvaluator
 REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
 
 void PrecisionRecallEvaluator::start() {
   Evaluator::start();
   statsInfo_.clear();
+  values_.clear();
 }
 
 real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
@@ -594,52 +647,23 @@ real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
 }
 
 void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    double precision =
-        calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    double recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    os << "positive_label=" << label << " precision=" << precision
-       << " recall=" << recall
-       << " F1-score=" << calcF1Score(precision, recall);
-    return;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  double macroAvgPrecision = 0;
-  double macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    macroAvgPrecision += calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  macroAvgPrecision /= numLabels;
-  macroAvgRecall /= numLabels;
-  double macroAvgF1Score = calcF1Score(macroAvgPrecision, macroAvgRecall);
-  os << "macro-average-precision=" << macroAvgPrecision
-     << " macro-average-recall=" << macroAvgRecall
-     << " macro-average-F1-score=" << macroAvgF1Score;
-
-  double microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  double microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  double microAvgF1Score = calcF1Score(microAvgPrecision, microAvgRecall);
-  if (!isMultiBinaryLabel_) {
-    // precision and recall are equal in this case
-    os << " micro-average-precision=" << microAvgPrecision;
-  } else {
-    os << " micro-average-precision=" << microAvgPrecision
-       << " micro-average-recall=" << microAvgRecall
-       << " micro-average-F1-score=" << microAvgF1Score;
+  PrintStatsInfo info;
+  if (!getStatsInfo(&info)) {
+    // A positive label is configured: only the per-label fields are filled.
+    os << "positive_label=" << config_.positive_label()
+       << " precision=" << info.precision << " recall=" << info.recall
+       << " F1-score=" << info.f1;
+  } else {
+    // positive_label == -1: only the macro/micro averages are filled.
+    os << "macro-average-precision=" << info.macroAvgPrecision
+       << " macro-average-recall=" << info.macroAvgRecall
+       << " macro-average-F1-score=" << info.macroAvgF1Score
+       << " micro-average-precision=" << info.microAvgPrecision;
+    if (isMultiBinaryLabel_) {
+      // otherwise micro precision and recall are equal; print only one
+      os << " micro-average-recall=" << info.microAvgRecall
+         << " micro-average-F1-score=" << info.microAvgF1Score;
+    }
   }
 }
 
@@ -721,6 +745,60 @@ void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
   }
 }
 
+void PrecisionRecallEvaluator::storeLocalValues() const {
+  // values_ is a lazily-filled cache; start() clears it for the next pass.
+  if (!this->values_.empty()) return;
+  PrintStatsInfo info;
+  if (!getStatsInfo(&info)) {
+    // A positive label is configured: only these three fields are filled.
+    values_["precision"] = info.precision;
+    values_["recal"] = info.recall;
+    values_["F1-score"] = info.f1;
+    return;
+  }
+  // positive_label == -1: only the macro/micro averages are filled.
+  values_["macro-average-precision"] = info.macroAvgPrecision;
+  values_["macro-average-recall"] = info.macroAvgRecall;
+  values_["macro-average-F1-score"] = info.macroAvgF1Score;
+  values_["micro-average-precision"] = info.microAvgPrecision;
+  if (isMultiBinaryLabel_) {
+    // otherwise micro precision and recall are equal; keep only one
+    values_["micro-average-recall"] = info.microAvgRecall;
+    values_["micro-average-F1-score"] = info.microAvgF1Score;
+  }
+}
+
+void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
+  this->storeLocalValues();
+  // names may already hold other evaluators' fields, so grow relative to it.
+  names->reserve(names->size() + this->values_.size());
+  for (const auto& kv : this->values_)
+    names->push_back(this->config_.name() + "." + kv.first);
+}
+
+real PrecisionRecallEvaluator::getValue(const std::string& name,
+                                        Error* err) const {
+  this->storeLocalValues();
+  std::vector<std::string> buffers;
+  paddle::str::split(name, '.', &buffers);
+  // The field is the last '.'-separated token; an empty split means no field.
+  auto it = buffers.empty() ? values_.end() : values_.find(buffers.back());
+  if (it == this->values_.end()) {  // not found
+    *err = Error("No such key %s", name.c_str());
+    return .0f;
+  }
+  return it->second;
+}
+
+std::string PrecisionRecallEvaluator::getType(const std::string& name,
+                                              Error* err) const {
+  this->getValue(name, err);
+  if (!err->isOK()) {
+    return "";
+  }
+  return "precision_recall";
+}
+
 void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   size_t size = 4 * statsInfo_.size();
   double* buf = new double[size];
@@ -740,6 +818,47 @@ void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   delete[] buf;
 }
 
+bool PrecisionRecallEvaluator::getStatsInfo(
+    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
+  // Each branch fills only its own subset of fields, so zero them all first.
+  *info = PrintStatsInfo();
+  int label = config_.positive_label();
+  if (label != -1) {
+    CHECK(label >= 0 && label < (int)statsInfo_.size())
+        << "positive_label [" << label << "] should be in range [0, "
+        << statsInfo_.size() << ")";
+    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
+    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
+    info->f1 = calcF1Score(info->precision, info->recall);
+    return false;
+  }
+
+  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
+  // macro average method: precision = (precision1+precision2)/2
+  double microTotalTP = 0;
+  double microTotalFP = 0;
+  double microTotalFN = 0;
+  size_t numLabels = statsInfo_.size();
+  for (size_t i = 0; i < numLabels; ++i) {
+    microTotalTP += statsInfo_[i].TP;
+    microTotalFP += statsInfo_[i].FP;
+    microTotalFN += statsInfo_[i].FN;
+    info->macroAvgPrecision +=
+        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
+    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
+  }
+  info->macroAvgPrecision /= numLabels;
+  info->macroAvgRecall /= numLabels;
+  info->macroAvgF1Score =
+      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
+  // Micro recall takes (TP, FN), like the per-label calcRecall call above.
+  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
+  info->microAvgRecall = calcRecall(microTotalTP, microTotalFN);
+  info->microAvgF1Score =
+      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
+  return true;
+}
+
 REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
 void PnpairEvaluator::start() {
   Evaluator::start();
@@ -864,56 +983,35 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
             << " calc total special pair: " << special;
 }
 
+std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
+
 ClassRegistrar<Evaluator> Evaluator::registrar_;
 Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = nullptr;
-  if (config.type() == "classification_error") {
-    evaluator = new ClassificationErrorEvaluator();
-  } else if (config.type() == "sum") {
-    evaluator = new SumEvaluator();
-  } else if (config.type() == "last-column-sum") {
-    evaluator = new ColumnSumEvaluator(-1);
-  } else if (config.type() == "last-column-auc") {
-    evaluator = new AucEvaluator(-1);
-  } else {
-    evaluator = registrar_.createByType(config.type());
-  }
+  Evaluator* evaluator = registrar_.createByType(config.type());
   evaluator->init(config);
   return evaluator;
 }
+
+REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
+REGISTER_EVALUATOR(sum, SumEvaluator);
+static InitFunction __reg_type_auc_sum__([]() {
+  Evaluator::registrar_.registerClass(
+      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
+  Evaluator::registrar_.registerClass("last-column-auc",
+                                      [] { return new AucEvaluator(-1); });
+});
+
 /**
  * @brief print value of each layer.
  *
  * The config file api is value_printer_evaluator.
  */
-class ValuePrinter : public Evaluator {
+class ValuePrinter : public NotGetableEvaluator {
 public:
-  ValuePrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        std::ostringstream os;
-        argu.value->print(os);
-        LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-      }
-      if (argu.ids) {
-        std::ostringstream os;
-        argu.ids->print(os, argu.ids->getSize());
-        LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-      }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
-      if (auto subStartPos = argu.subSequenceStartPositions) {
-        std::ostringstream os;
-        subStartPos->getVector(false)->print(os, subStartPos->getSize());
-        LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                  << os.str();
-      }
+      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
+                                                      "layer=" + name + " ");
     }
   }
 
@@ -922,15 +1020,14 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
 };
 REGISTER_EVALUATOR(value_printer, ValuePrinter);
+
 /**
  * @brief print gradient of each layer.
  *
  * The config file api is gradient_printer_evaluator.
  */
-class GradientPrinter : public Evaluator {
+class GradientPrinter : public NotGetableEvaluator {
 public:
-  GradientPrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       const Argument& argu = nn.getLayer(name)->getOutput();
@@ -939,11 +1036,6 @@ public:
         argu.grad->print(os);
         LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
       }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
     }
   }
 
@@ -957,7 +1049,7 @@ REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
  *
  * The config file api is maxid_printer_evaluator.
  */
-class MaxIdPrinter : public Evaluator {
+class MaxIdPrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -999,7 +1091,7 @@ REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
  *
  * The config file api is maxframe_printer_evaluator.
  */
-class MaxFramePrinter : public Evaluator {
+class MaxFramePrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -1086,7 +1178,7 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
  * The config file api is seqtext_printer_evaluator.
  *
  */
-class SequenceTextPrinter : public Evaluator {
+class SequenceTextPrinter : public NotGetableEvaluator {
 private:
   /// dict_file, which contains a list of tokens
   std::vector<std::string> dict_;
@@ -1253,4 +1345,6 @@ public:
 };
 REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
 
+std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
+
 }  // namespace paddle
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 5770847309670ef1856cfb9255fa847c24513b56..b114500e2b7c1e460a02c78b99b5f1a8fb63b8c3 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/parameter/Argument.h"
 #include "paddle/pserver/ParameterClient2.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -117,12 +118,105 @@ public:
 
   static ClassRegistrar<Evaluator> registrar_;
 
+  /**
+   * @brief getNames will return all field names of current evaluator.
+   *
+   * The format of name is `evaluator_name.evaluator_fields`. If the evaluator
+   * has multiple fields, each name looks like `evaluator_name.field1`. For
+   * example, the PrecisionRecallEvaluator contains `precision`, `recall`
+   * fields, so getNames will return `precision_recall_evaluator.precision`,
+   * `precision_recall_evaluator.recal`, etc.
+   *
+   * Also, if current Evaluator is a combined evaluator. getNames will return
+   * all names of all evaluators inside the combined evaluator.
+   *
+   * @param names [out]: the field names of current evaluator.
+   * @note Never clear the names parameter inside getNames.
+   */
+  virtual void getNames(std::vector<std::string>* names) {
+    names->push_back(config_.name());
+  }
+
+  /**
+   * @brief getValue will return the current evaluate value of one field.
+   *
+   * @param name: The field name of current evaluator.
+   * @param err [out]: The error state.
+   *
+   * @return The evaluate value(metric).
+   */
+  virtual real getValue(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return .0f;
+    }
+    return this->getValueImpl();
+  }
+
+  /**
+   * @brief getType will return the evaluator type by field name.
+   *
+   * Evaluate Type is the current type of evaluator in string. Such as 'auc',
+   * 'precision_recall'. In combined evaluator, different name may get different
+   * evaluate type because it could be evaluated by different evaluator inside.
+   *
+   * @param name: The field name of current Evaluator.
+   * @param err: The error state. nullptr means don't care.
+   * @return the evaluator type string.
+   */
+  virtual std::string getType(const std::string& name, Error* err) const {
+    if (name == config_.name()) return this->getTypeImpl();
+    // Unknown field. err may be nullptr (documented as "don't care"), so only
+    // report the failure when the caller supplied somewhere to put it.
+    if (err) *err = Error("no such name of evaluator %s", name.c_str());
+    return std::string();
+  }
+
+protected:
+  /**
+   * @brief getValueImpl The simplest way to define getValue's result. If this
+   * evaluator doesn't contain multiple fields and does not report any error,
+   * just implement this method to return the evaluation result (metric).
+   * @return Evaluate result(metric).
+   */
+  virtual real getValueImpl() const {
+    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
+  }
+
+  /**
+   * @brief getTypeImpl The simplest way to define getType's result. If this
+   * evaluator doesn't combine many evaluators, getType should only return
+   * its own type.
+   * @return Evaluator type.
+   */
+  virtual std::string getTypeImpl() const { return "base"; }
+
 protected:
   EvaluatorConfig config_;
   double numSamples_;
   double totalScore_;
 };
 
+/**
+ * @brief The NotGetableEvaluator class is the base class of evaluators whose
+ * value cannot be fetched at runtime. Most NotGetableEvaluators are printer
+ * evaluators, which are only used to debug network configuration.
+ */
+class NotGetableEvaluator : public Evaluator {
+public:
+  // Exposes no fields: names is intentionally left untouched.
+  void getNames(std::vector<std::string>* names) override {}
+  real getValue(const std::string& name, Error* err) const override {
+    *err = Error("Not implemented");
+    return .0f;
+  }
+  // Never yields a type; err may be nullptr per Evaluator::getType's doc.
+  std::string getType(const std::string& name, Error* err) const override {
+    if (err) *err = Error("Not implemented");
+    return "";
+  }
+};
+
 class DummyEvaluator : public Evaluator {
 public:
   DummyEvaluator() {}
@@ -135,6 +229,10 @@ public:
   }
   virtual void finish() {}
   virtual void printStats(std::ostream&) const {}
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief evaluate AUC using colIdx-th column as prediction.
@@ -191,6 +289,11 @@ private:
   }
 
   double calcAuc() const;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const;
+  std::string getTypeImpl() const;
 };
 
 /**
@@ -223,6 +326,10 @@ private:
                      real* clickData,
                      real* pvData,
                      size_t size);
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief precision, recall and f1 score Evaluator
@@ -272,6 +379,20 @@ private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
 
+  struct PrintStatsInfo {
+    double precision;
+    double recall;
+    double f1;
+    double macroAvgPrecision;
+    double macroAvgRecall;
+    double macroAvgF1Score;
+    double microAvgPrecision;
+    double microAvgRecall;
+    double microAvgF1Score;
+  };
+
+  bool getStatsInfo(PrintStatsInfo* info) const;
+
   void calcStatsInfo(const MatrixPtr& output,
                      const IVectorPtr& label,
                      const MatrixPtr& weight);
@@ -303,6 +424,15 @@ private:
       return 0;
     }
   }
+
+  mutable std::unordered_map<std::string, real> values_;
+
+  void storeLocalValues() const;
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names);
+  real getValue(const std::string& name, Error* err) const;
+  std::string getType(const std::string& name, Error* err) const;
 };
 
 /*
@@ -349,8 +479,7 @@ public:
   virtual void finish() { calc(predictArray_); }
 
   virtual void printStats(std::ostream& os) const {
-    os << " pos/neg"
-       << "=" << pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+    os << " pos/neg=" << this->getValueImpl();
   }
 
   virtual void distributeEval(ParameterClient2* client) {
@@ -366,6 +495,13 @@ private:
   IVectorPtr cpuLabel_;
   IVectorPtr cpuInfo_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const {
+    return pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+  }
+  std::string getTypeImpl() const;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index 36ca05b919b136c162105cf4f1fb7705ae7ca7f3..3eb87d9b85c8207a23046fdb4bda06ba8185e2a3 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -60,55 +60,6 @@ GradientMachine* GradientMachine::create(
   return nullptr;
 }
 
-GradientMachine* GradientMachine::create(const std::string& modelFile,
-                                         DataConfig* dataConfig) {
-  std::ifstream is(modelFile);
-  CHECK(is) << "Fail to open " << modelFile;
-  return create(is, dataConfig);
-}
-
-GradientMachine* GradientMachine::create(std::istream& is,
-                                         DataConfig* dataConfig) {
-  TrainerConfig trainerConfig;
-  GradientMachine* ret = create(is, &trainerConfig);
-  if (dataConfig && trainerConfig.has_data_config()) {
-    *dataConfig = trainerConfig.data_config();
-  }
-  return ret;
-}
-
-GradientMachine* GradientMachine::create(const std::string& modelFile,
-                                         TrainerConfig* trainerConfig) {
-  std::ifstream is(modelFile);
-  CHECK(is) << "Fail to open " << modelFile;
-  return create(is, trainerConfig);
-}
-
-GradientMachine* GradientMachine::create(std::istream& is,
-                                         TrainerConfig* trainerConfig) {
-  TrainerConfig trainerConfigTemp;
-  int64_t size;
-  CHECK(is.read((char*)&size, sizeof(size))) << "Fail to read ";
-  std::string buf;
-  buf.resize(size);
-  CHECK(is.read(&buf[0], size)) << "Fail to read ";
-  CHECK(trainerConfigTemp.ParseFromString(buf)) << "Fail to parse config";
-  std::unique_ptr<GradientMachine> machine(
-      create(trainerConfigTemp.model_config()));
-  std::vector<ParameterPtr>& parameters = machine->getParameters();
-  for (auto& para : parameters) {
-    para->load(is);
-  }
-
-  machine->onLoadParameter();
-
-  if (trainerConfig) {
-    *trainerConfig = trainerConfigTemp;
-  }
-
-  return machine.release();
-}
-
 void GradientMachine::saveParameters(const std::string& dir) const {
   LOG(INFO) << "Saving parameters to " << dir;
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index 1e35c7e2b8d185e45f33f6287ad4e32ccad2d5a6..bc2f2f8563526aa045ea89f15152ee2d639b5774 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -89,39 +89,6 @@ public:
           std::vector<ParameterType>{
               PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM});
 
-  /**
-   * Create a gradient machine from the merged model file.
-   * The merged model file can be generated using tools/merge_model
-   * If dataConfig is not null, it will be filled with the DataConfig
-   * from the TrainerConfig
-   */
-  static GradientMachine* create(const std::string& modelFile,
-                                 DataConfig* dataConfig);
-
-  /**
-   * Create a gradient machine from a stream which contains the merged
-   * model file. The merged model file can be generated using tools/merge_model
-   * If dataConfig is not null, it will be filled with the DataConfig
-   * from the TrainerConfig
-   */
-  static GradientMachine* create(std::istream& is, DataConfig* dataConfig);
-
-  /**
-   * Create a gradient machine from the merged model file.
-   * The merged model file can be generated using tools/merge_model
-   * If trainerConfig is not null, it will be filled with the TrainerConfig
-   */
-  static GradientMachine* create(const std::string& modelFile,
-                                 TrainerConfig* trainerConfig);
-
-  /**
-   * Create a gradient machine from a stream which contains the merged
-   * model file. The merged model file can be generated using tools/merge_model
-   * If trainerConfig is not null, it will be filled with the TrainerConfig
-   */
-  static GradientMachine* create(std::istream& is,
-                                 TrainerConfig* trainerConfig);
-
   virtual ~GradientMachine() {}
 
   /**
@@ -167,6 +134,13 @@ public:
     backward(callback);
   }
 
+  virtual Argument getLayerOutput(const std::string& layerName) {
+    // No generic implementation exists: fail loudly rather than dereference
+    // a null pointer, which is undefined behaviour.
+    LOG(FATAL) << "getLayerOutput is not implemented for layer " << layerName;
+    return Argument();
+  }
+
   // see comment in Layer.h for the function with the same name
   virtual void resetState() {}
 
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 80f223824d8dccfb0e9386f4c076b28f9332a958..123273f916f5d33e2543d9f5f28573c3b5761e28 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -282,6 +282,18 @@ void MultiGradientMachine::forwardBackward(const std::vector<Argument>& inArgs,
   backwardImp(callback);
 }
 
+Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) {
+  std::vector<Argument> args;
+  args.reserve(threads_.size());
+
+  for (auto& thread : threads_) {
+    args.push_back(thread->getGradientMachine()->getLayerOutput(layerName));
+  }
+  outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_);
+
+  return outLayerArgs_;
+}
+
 void MultiGradientMachine::backwardImp(const UpdateCallback& callback) {
   for (size_t i = 0; i < parameters_.size(); i++) {
     if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue;
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 9be15ef4bcf34f26b7eceb9047252e537f20a4a8..838a52b5153af63adbce5788824b9f541f22517c 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -189,6 +189,8 @@ public:
                        PassType passType,
                        const UpdateCallback& callback);
 
+  virtual Argument getLayerOutput(const std::string& layerName);
+
   virtual void onPassEnd();
 
   virtual void finish();
@@ -314,6 +316,8 @@ protected:
   std::vector<Argument> outArgs_;
   hl_stream_t outArgStream_;
 
+  Argument outLayerArgs_;
+
   /// ParameterType which needs to be merged from each GPU
   std::vector<ParameterType> mergeTypes_;
   int numDevices_;         /* number of gpu devices */
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 22051e07ee0026bc3c44a8767e265a56b415b8e4..4512aacc81f86bf87fc9ea30adcf081327663f16 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -293,11 +293,10 @@ void NeuralNetwork::backward(const UpdateCallback& callback) {
   }
 }
 
-MatrixPtr NeuralNetwork::getLayerOutput(const std::string& layerName) {
-  auto it = layerMap_.find(layerName);
-  CHECK(it != layerMap_.end()) << "Cannot find layer: " << layerName;
-  return it->second->getOutputValue();
+Argument NeuralNetwork::getLayerOutput(const std::string& layerName) {
+  return getLayer(layerName)->getOutput();
 }
+
 void NeuralNetwork::onPassEnd() {
   for (auto& layer : layers_) {
     layer->onPassEnd();
@@ -306,7 +305,6 @@ void NeuralNetwork::onPassEnd() {
 
 class CombinedEvaluator : public Evaluator {
 public:
-  CombinedEvaluator() {}
   void addEvaluator(std::unique_ptr<Evaluator>&& evaluator) {
     evaluators_.emplace_back(std::move(evaluator));
   }
@@ -346,6 +344,55 @@ public:
 
 protected:
   std::vector<std::unique_ptr<Evaluator>> evaluators_;
+
+  // Evaluator interface
+public:
+  /**
+   * @brief getNames will return all inside evaluators' names.
+   * @param names [out]: return names.
+   */
+  void getNames(std::vector<std::string>* names) {
+    for (auto& eval : evaluators_) {
+      eval->getNames(names);
+    }
+  }
+
+  /**
+   * @brief getValue could get all inside evaluators' value.
+   */
+  real getValue(const std::string& name, Error* err) const {
+    return this->getMethodHelper<real>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getValue(name, err);
+        });
+  }
+
+  /**
+   * @brief getType could get all inside evaluators' type.
+   */
+  std::string getType(const std::string& name, Error* err) const {
+    return this->getMethodHelper<std::string>(
+        name, err, [&name, err](const std::unique_ptr<Evaluator>& eval) {
+          return eval->getType(name, err);
+        });
+  }
+
+private:
+  template <typename T>
+  T getMethodHelper(const std::string& name,
+                    Error* err,
+                    const std::function<T(const std::unique_ptr<Evaluator>&)>&
+                        callback) const {
+    for (auto& eval : evaluators_) {
+      std::vector<std::string> names;
+      eval->getNames(&names);
+      if (std::find(names.begin(), names.end(), name) != names.end()) {
+        return callback(eval);
+      }
+    }
+    *err = Error("No such key %s", name.c_str());
+    return T();
+  }
 };
 
 Evaluator* NeuralNetwork::makeEvaluator() const {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 25af4abcf81700e200feea806fa3daed19df1275..e7b6c438407e7eab6eab1f6ed496f35caa9f2177 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -87,7 +87,8 @@ public:
 
   virtual void backward(const UpdateCallback& callback = nullptr);
 
-  MatrixPtr getLayerOutput(const std::string& layerName);
+  virtual Argument getLayerOutput(const std::string& layerName);
+
   const LayerPtr& getLayer(const std::string& layerName) const {
     auto it = layerMap_.find(layerName);
     CHECK(it != layerMap_.end()) << "Unknown layer " << layerName;
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index a9a9f4f903e305bfe0ee3dd089a85ba524022faa..2ab964b8fc2e080282aa03db4ee6836540e666d7 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -155,7 +155,8 @@ protected:
 public:
   explicit BootBiasLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     if (!Layer::init(layerMap, parameterMap)) return false;
 
     if (biasParameter_) {
@@ -174,7 +175,7 @@ public:
     }
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     if (biases_) {
       MatrixPtr outV = getOutputValue();
       outV->addBias(*(biases_->getW()), 1);
@@ -182,7 +183,7 @@ public:
     }
   }
 
-  virtual void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     if (biases_) {
       backwardActivation();
       biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
diff --git a/paddle/gserver/layers/AddtoLayer.h b/paddle/gserver/layers/AddtoLayer.h
index 53d3f99cdd3439a1ba85f54526ca65005986c634..4e98c174b462763d3c2714770f66951981afa9f8 100644
--- a/paddle/gserver/layers/AddtoLayer.h
+++ b/paddle/gserver/layers/AddtoLayer.h
@@ -44,19 +44,20 @@ public:
   /**
    * Intialization of AddtoLayer.
    */
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * Forward propagation.
    * @note There is no weight matrix for each input,
    *       because it just a simple add operation.
    */
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
   /**
    * Backward propagation.
    */
-  void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/AgentLayer.h b/paddle/gserver/layers/AgentLayer.h
index 41683ad6712d5df710737cf71c600790fcc8786f..b6dac7ae6fec2d61c60c9548d466233efe9febd5 100644
--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -35,7 +35,8 @@ public:
 
   ~AgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   // if *numSamples* set,
   // real layer output will only use first *numSamples* rows
@@ -44,8 +45,8 @@ public:
     numSamples_ = numSamples;
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 /**
@@ -56,8 +57,8 @@ public:
   explicit SequenceAgentLayer(const LayerConfig& config) : AgentLayer(config) {}
   ~SequenceAgentLayer() {}
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 /**
@@ -78,7 +79,8 @@ public:
 
   virtual ~GatherAgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   // call before addRealLayer
   void copyIdAndSequenceInfo(const Argument& input,
@@ -88,8 +90,8 @@ public:
   // add one real layer, can call many times
   void addRealLayer(LayerPtr layer) { realLayers_.push_back(layer); }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 /**
@@ -133,7 +135,8 @@ public:
 
   virtual ~ScatterAgentLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * @brief set real layer in generation
@@ -182,8 +185,8 @@ public:
     numSequences_ = numSequences;
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 /**
diff --git a/paddle/gserver/layers/AverageLayer.h b/paddle/gserver/layers/AverageLayer.h
index b3c4ecec8bc6f56b4563ee9f1ada91e4d8f2cbb5..621e1d7bb12ec5b8c7a6173bd601835d9406e814 100644
--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -38,12 +38,11 @@ public:
   explicit AverageLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
-  ~AverageLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   MatrixPtr outMtx_;
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index 75bda95de1472b08538b48072ddf9ea607b83299..230bafc31d96bbd49481a7ed135be6888688627e 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -52,7 +52,8 @@ public:
    */
   static Layer* create(const LayerConfig& config);
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * @brief Calculate feature map size. Some input uses frameHeight and
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 195acbbfc58db8368f6db1c1595dd6b04801ee26..f6115801fc6b341c0718f8851617de43bdeeec09 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -33,9 +33,10 @@ public:
 
   ~BatchNormalizationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   /// Epsilon value used in the batch normalization formula.
@@ -58,7 +59,7 @@ protected:
   /// to batch, channels* imagePixels.
   void shrinkMat(const MatrixPtr& in, MatrixPtr& out);
 
-  void onPassEnd() { firstTest_ = true; }
+  void onPassEnd() override { firstTest_ = true; }
 
   MatrixPtr tmpMat_, tmpGrad_;
   MatrixPtr expandedIn_, expandedOut_;
diff --git a/paddle/gserver/layers/BilinearInterpLayer.h b/paddle/gserver/layers/BilinearInterpLayer.h
index 4ff4b0ea793dc901d099bf73d55aa15463e62094..27c269f2781c99e4f166ef1052cbf03a773ad57e 100644
--- a/paddle/gserver/layers/BilinearInterpLayer.h
+++ b/paddle/gserver/layers/BilinearInterpLayer.h
@@ -38,9 +38,10 @@ public:
   virtual ~BilinearInterpLayer() {}
 
   size_t getSize();
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h
index cc96fdd03fcac6925a16f0fb91045f065f74e803..8f347400e60ec84fc1b5fdbc1c911a8768b306d0 100644
--- a/paddle/gserver/layers/BlockExpandLayer.h
+++ b/paddle/gserver/layers/BlockExpandLayer.h
@@ -58,10 +58,11 @@ public:
 
   ~BlockExpandLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CRFDecodingLayer.h b/paddle/gserver/layers/CRFDecodingLayer.h
index 1fd444ad10e71df2bb6d8bdb839e6f02b33d647f..3cbcac6cf62decd43844cc442fc5e4f973d0acfc 100644
--- a/paddle/gserver/layers/CRFDecodingLayer.h
+++ b/paddle/gserver/layers/CRFDecodingLayer.h
@@ -32,9 +32,10 @@ namespace paddle {
 class CRFDecodingLayer : public CRFLayer {
 public:
   explicit CRFDecodingLayer(const LayerConfig& config) : CRFLayer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   std::unique_ptr<LinearChainCRF> crf_;
diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h
index d21b32b68c1a40c814af3aa2c285612a5f938d79..de36a85083b6b293fd2d8522ec279a38cc4f8be3 100644
--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -29,9 +29,10 @@ namespace paddle {
 class CRFLayer : public Layer {
 public:
   explicit CRFLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   size_t numClasses_;
diff --git a/paddle/gserver/layers/CTCLayer.h b/paddle/gserver/layers/CTCLayer.h
index 70d429bad656ade3c05256472d799ae72e128be5..f7a515f312d075c54b4aab2557175c70fdbd9875 100644
--- a/paddle/gserver/layers/CTCLayer.h
+++ b/paddle/gserver/layers/CTCLayer.h
@@ -22,10 +22,11 @@ namespace paddle {
 class CTCLayer : public Layer {
 public:
   explicit CTCLayer(const LayerConfig& config) : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
   void forwardImp(const Argument& softmaxSeqs, const Argument& labelSeqs);
-  virtual void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
   void backwardImp(const UpdateCallback& callback,
                    const Argument& softmaxSeqs,
                    const Argument& labelSeqs);
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index d19adace7d58af16736fc2b6e536f5fd69a19863..c5fc4cf4f81a55a4c57e92dce64c06acd404badd 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -28,10 +28,11 @@ public:
 
   ~ConcatenateLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(concat, ConcatenateLayer);
@@ -101,10 +102,11 @@ public:
 
   ~ConcatenateLayer2() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   std::vector<std::unique_ptr<Projection>> projections_;
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 51c0ae5cc9523debffa4bdfe44fe0df0c56839c2..d7042af1c25e7432e5b1efbb89cd8fd3f63fb4ae 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
     CHECK_EQ(inputDim * totalPad, parameter->getSize());
     weight_.reset(new Weight(totalPad, inputDim, parameter));
   }
+  // init forward_ and backward_ functions
+  init();
+}
+
+bool ContextProjection::init() {
+  size_t context_length = config_.context_length();
+  int context_start = config_.context_start();
+  bool is_padding = config_.trainable_padding();
+  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
+
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", is_padding)
+                     .set("total_pad", total_pad));
+
+  return true;
 }
 
 void ContextProjection::resetState() {
@@ -78,25 +104,30 @@ LayerStatePtr ContextProjection::getState() {
 }
 
 void ContextProjection::forward() {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value);
   CHECK(in_->sequenceStartPositions);
 
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
-
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  // init() registers the single forward function via createFunction().
+  CHECK_EQ(forward_.size(), (size_t)1) << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  out_->value->contextProjectionForward(
-      *(in_->value),
-      state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
-      *startPositions,
-      config_.context_length(),
-      config_.context_start(),
-      beginPad_,
-      state_ ? true : isPadding);
+  bool is_padding = config_.trainable_padding();
+  // Pick state_ when set; else the padding weight (nullptr when not padding).
+  auto w_ptr =
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*in_->value, *start_pos);
+  if (w_ptr) {
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
+                  *start_pos);
+  }
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
+  forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -118,41 +149,33 @@ void ContextProjection::forward() {
 }
 
 void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value);
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  CHECK(in_->value && out_->value && out_->grad);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(static_cast<int>(backward_.size()), 1)
+      << "Only one backward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  if (!out_->grad->useGpu()) {
-    out_->grad->contextProjectionBackward(
-        in_->grad.get(),
-        isPadding ? weight_->getWGrad().get() : nullptr,
-        *startPositions,
-        config_.context_length(),
-        config_.context_start(),
-        beginPad_,
-        isPadding);
-  } else {
-    if (in_->grad) {
-      out_->grad->contextProjectionBackwardData(*(in_->grad),
-                                                *startPositions,
-                                                config_.context_length(),
-                                                config_.context_start());
-    }
-
-    if (isPadding && weight_->getWGrad()) {
-      out_->grad->contextProjectionBackwardWeight(
-          *(weight_->getWGrad()),
-          *startPositions,
-          config_.context_length(),
-          config_.context_start(),
-          weight_->getWGrad()->getHeight(),
-          beginPad_);
-    }
-  }
+  bool is_padding = config_.trainable_padding();
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
+  // in_->grad and w_ptr may be null; null data is passed through as-is.
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*out_->grad, *start_pos);
+  outputs.addArg(
+      CpuMatrix(
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      *start_pos,
+      ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
+  backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index 2df43bd04fec868924b5d45f9def231a48ee7f04..c87d6ed1d6d46b391ccf8722f6d110614be1fe78 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -61,6 +61,8 @@ public:
 
   virtual LayerStatePtr getState();
 
+  virtual bool init();
+
 protected:
   std::unique_ptr<Weight> weight_;
   /// number of extra timesteps added at the beginning
diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h
index aedf4100e32fa1294c361b6163c14eab7869b803..e9d15d94f806a5d2e6f11cbbfc29e291dfe8538f 100644
--- a/paddle/gserver/layers/ConvBaseLayer.h
+++ b/paddle/gserver/layers/ConvBaseLayer.h
@@ -80,7 +80,8 @@ protected:
 public:
   explicit ConvBaseLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * imgSizeH_ and imgSizeW_ will be set according to the previous input layers
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index e1c4b91ace21522a3bc640dfc4eaa1a42668ed02..0281170bc59855f6f4d2f4212523275a92d202d5 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -130,7 +130,8 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
   size_t width = calOutputSize();
   CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(channels_ * imageH_ * imageW_, in_->value->getWidth())
+  CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_),
+           in_->value->getWidth())
       << "Wrong input size for convolution"
       << " channels=" << channels_ << " imageH=" << imageH_
       << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index 9bfb1ab7a47b11a6793159aefcb4f9fa12b81a6b..002be415691f0b3df93835915dcbc9d455231422 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -47,10 +47,11 @@ public:
 
   ~ConvShiftLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(conv_shift, ConvShiftLayer);
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index ed57f2af3c6455fb89fd05b37bb205e8da0bf7e1..32eb3bf604acaa8f2060882b545efeeb40f8218d 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -49,10 +49,11 @@ public:
 
   ~ConvexCombinationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(convex_comb, ConvexCombinationLayer);
diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp
index 254120443dc3d41bf2422be2e88cb376d70c93d4..57ba124e40cbd098fa8b0012ff31d6935b16862a 100644
--- a/paddle/gserver/layers/CosSimLayer.cpp
+++ b/paddle/gserver/layers/CosSimLayer.cpp
@@ -26,15 +26,23 @@ bool CosSimLayer::init(const LayerMap& layerMap,
   Layer::init(layerMap, parameterMap);
 
   CHECK_EQ(inputLayers_.size(), 2LU);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
   return true;
 }
 
 void CosSimLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   int batchSize = getInputValue(0)->getHeight();
   int size = getSize();
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
 
   {
     REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str());
@@ -42,26 +50,43 @@ void CosSimLayer::forward(PassType passType) {
   }
 
   MatrixPtr outV = getOutputValue();
-
   /* activation */ {
     REGISTER_TIMER_INFO("CosFwAtvTimer", getName().c_str());
     MatrixPtr prevOut1 = getInputValue(0);
     MatrixPtr prevOut2 = getInputValue(1);
-    outV->cosSim(*prevOut1, *prevOut2, config_.cos_scale());
+
+    CHECK(outV && prevOut1 && prevOut2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*prevOut1);
+    inputs.addArg(*prevOut2);
+    outputs.addArg(*outV, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
   }
 }
 
 void CosSimLayer::backward(const UpdateCallback& callback) {
   /* activation */ {
     REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str());
-    MatrixPtr outG = this->getOutputGrad();
-
-    outG->cosSimDerivative(*this->getOutputValue(),
-                           *getInputValue(0),
-                           *getInputValue(1),
-                           *getInputGrad(0),
-                           *getInputGrad(1),
-                           config_.cos_scale());
+    CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
+
+    const auto outG = this->getOutputGrad();
+    const auto outV = this->getOutputValue();
+    const auto inV1 = this->getInputValue(0);
+    const auto inV2 = this->getInputValue(1);
+    auto inG1 = this->getInputGrad(0);
+    auto inG2 = this->getInputGrad(1);
+    CHECK(outG && outV && inV1 && inV2 && inG1 && inG2);
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*outG);
+    inputs.addArg(*outV);
+    inputs.addArg(*inV1);
+    inputs.addArg(*inV2);
+    outputs.addArg(*inG1, ADD_TO);
+    outputs.addArg(*inG2, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/CosSimLayer.h b/paddle/gserver/layers/CosSimLayer.h
index 5dcc5d8a5b4dc76cb6cea023a874049731a26516..8afaee62c2dcacba006846df0111fcbe8f7575e4 100644
--- a/paddle/gserver/layers/CosSimLayer.h
+++ b/paddle/gserver/layers/CosSimLayer.h
@@ -28,7 +28,7 @@ namespace paddle {
  *
  * - Input1: A vector (batchSize * dataDim) *
  * - Input2: A vector (batchSize * dataDim) or (1 * dataDim) *
- * - Output: A vector (dataDim * 1)
+ * - Output: A vector (batchSize * 1)
  *
  * The config file api is cos_sim.
  */
@@ -38,10 +38,11 @@ public:
 
   ~CosSimLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index ad490b0b8c4656c1eabf519233f2386b4b6e9417..0f887d8adfa053e8fe88ac4fa4e2a9ba08ac07b5 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
-
 /**
  * @brief A layer for computing cosine similarity between a vector
  * and each row of a matrix
@@ -46,10 +45,11 @@ public:
 
   ~CosSimVecMatLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(cos_vm, CosSimVecMatLayer);
@@ -97,11 +97,22 @@ bool CosSimVecMatLayer::init(const LayerMap& layerMap,
                            dataDim,
                            /* trans= */ false,
                            useGpu_);
+
+  CHECK(tmpRow0 && tmpRow1 && tmpRow2 && tmpRow3 && tmpMtx0 && tmpMtx1);
+
+  createFunction(forward_,
+                 "CosSimForward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+  createFunction(backward_,
+                 "CosSimBackward",
+                 FuncConfig().set("scale", (real)config_.cos_scale()));
+
   return true;
 }
 
 void CosSimVecMatLayer::forward(PassType passType) {
   Layer::forward(passType);
+  CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed";
 
   MatrixPtr inV0 = getInputValue(0);
   MatrixPtr inV1 = getInputValue(1);
@@ -117,17 +128,25 @@ void CosSimVecMatLayer::forward(PassType passType) {
   }
 
   MatrixPtr outV = getOutputValue();
-
+  CHECK(outV && inV0 && inV1);
   REGISTER_TIMER_INFO("FwCosVMTimer", getName().c_str());
   for (size_t i = 0; i < batchSize; i++) {
     tmpRow0->setData(inV0->rowBuf(i));
     tmpMtx0->setData(inV1->rowBuf(i));
     tmpRow2->setData(outV->rowBuf(i));
-    tmpRow2->cosSim(*(tmpMtx0), *(tmpRow0), config_.cos_scale());
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpRow2, ASSIGN_TO);
+    forward_[0]->calc(inputs, outputs);
   }
 }
 
 void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
+  // init() registers the single backward function via createFunction().
+  CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed";
   MatrixPtr inV0 = getInputValue(0);
   MatrixPtr inV1 = getInputValue(1);
   MatrixPtr inG0 = getInputGrad(0);
@@ -136,27 +155,27 @@ void CosSimVecMatLayer::backward(const UpdateCallback& callback) {
   MatrixPtr outG = getOutputGrad();
 
   size_t batchSize = inV0->getHeight();
-
+  CHECK(inV0 && inV1 && inG0 && inG1 && outV && outG);
   REGISTER_TIMER_INFO("BwCosVMTimer", getName().c_str());
 
-  if (inG0 && inG1) {
-    for (size_t i = 0; i < batchSize; i++) {
-      tmpRow0->setData(inV0->rowBuf(i));
-      tmpRow1->setData(inG0->rowBuf(i));
-      tmpMtx0->setData(inV1->rowBuf(i));
-      tmpMtx1->setData(inG1->rowBuf(i));
-      tmpRow2->setData(outV->rowBuf(i));
-      tmpRow3->setData(outG->rowBuf(i));
-
-      tmpRow3->cosSimDerivative(*(tmpRow2),
-                                *(tmpMtx0),
-                                *(tmpRow0),
-                                *(tmpMtx1),
-                                *(tmpRow1),
-                                config_.cos_scale());
-    }
-  } else {
-    CHECK(!inG0 || !inG1) << "Not supported";
+  for (size_t i = 0; i < batchSize; i++) {
+    tmpRow0->setData(inV0->rowBuf(i));
+    tmpRow1->setData(inG0->rowBuf(i));
+    tmpMtx0->setData(inV1->rowBuf(i));
+    tmpMtx1->setData(inG1->rowBuf(i));
+    tmpRow2->setData(outV->rowBuf(i));
+    tmpRow3->setData(outG->rowBuf(i));
+
+    BufferArgs inputs;
+    BufferArgs outputs;
+    inputs.addArg(*tmpRow3);
+    inputs.addArg(*tmpRow2);
+    inputs.addArg(*tmpMtx0);
+    inputs.addArg(*tmpRow0);
+    outputs.addArg(*tmpMtx1, ADD_TO);
+    outputs.addArg(*tmpRow1, ADD_TO);
+
+    backward_[0]->calc(inputs, outputs);
   }
 }
 
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 7e9519f6b3af50bf47b660b285c3593087f80271..998b8d7d3034cb18fbab242c66656092bfc50fcb 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -367,8 +367,6 @@ void LambdaCost::backward(const UpdateCallback& callback) {
   getInputGrad(0)->add(*marginGrad_);
 }
 
-void LambdaCost::onPassEnd() {}
-
 void LambdaCost::calcGrad(const real* outputScore,
                           const real* score,
                           real* gradData,
@@ -611,14 +609,15 @@ class SumCostLayer : public Layer {
 public:
   explicit SumCostLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     if (!ret) return ret;
     CHECK_EQ(inputLayers_.size(), 1UL);
     return true;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     const MatrixPtr& input = getInputValue(0);
 
@@ -629,7 +628,7 @@ public:
     output_.value->sumRows(*input, /* scaleSum= */ 1, /* scaleDest= */ 0);
   }
 
-  virtual void backward(const UpdateCallback& callback = nullptr) {
+  void backward(const UpdateCallback& callback = nullptr) override {
     getInputGrad(0)->add((real)1);
   }
 };
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index 7f73bdb3f7d63ef1c8d76deb64f40d19d20f87c7..b3045e0b31308abf2caa90cbd21f105e685ef341 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -32,15 +32,16 @@ class CostLayer : public Layer {
 public:
   explicit CostLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
   LayerPtr getLabelLayer() { return inputLayers_[1]; }
 
-  virtual void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   virtual void forwardImp(Matrix& outputValue,
                           Argument& label,
@@ -68,11 +69,14 @@ public:
   explicit MultiClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -95,11 +99,14 @@ public:
   explicit MultiClassCrossEntropyWithSelfNorm(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
 protected:
   MatrixPtr sftMaxSum_;
@@ -117,11 +124,14 @@ public:
   explicit SoftBinaryClassCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
 protected:
   MatrixPtr targetPerDim_;
@@ -139,11 +149,14 @@ public:
   explicit SumOfSquaresCostLayer(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -162,17 +175,18 @@ class RankingCost : public Layer {
 public:
   explicit RankingCost(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer(size_t i) { return inputLayers_[i]; }
 
   LayerPtr getLabelLayer() { return inputLayers_[2]; }
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
   void forwardImp(Matrix& output, Argument& label, Matrix& cost) {
     (void)output;
@@ -214,17 +228,16 @@ class LambdaCost : public Layer {
 public:
   explicit LambdaCost(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
   LayerPtr getScoreLayer() { return inputLayers_[1]; }
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback = nullptr);
-
-  void onPassEnd();
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   real calcNDCG(const real* outputScore, const real* score, int size);
   void calcGrad(const real* outputScore,
@@ -256,11 +269,14 @@ public:
   explicit MultiBinaryLabelCrossEntropy(const LayerConfig& config)
       : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 };
 
 /**
@@ -282,13 +298,16 @@ class HuberTwoClass : public CostLayer {
 public:
   explicit HuberTwoClass(const LayerConfig& config) : CostLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forwardImp(Matrix& output, Argument& label, Matrix& cost);
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
 
   void forwardImpIn(Matrix& output, Argument& label, Matrix& cost);
 
-  void backwardImp(Matrix& outputValue, Argument& label, Matrix& outputGrad);
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
 
   void backwardImpIn(Matrix& outputValue, Argument& label, Matrix& outputGrad);
 };
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index b1e7d2082f1443313bfc858a17adfd737ecff98f..413efd4d3ecd734b343efbcf8328ac0592daddda 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -35,14 +35,15 @@ public:
 
   ~CudnnBatchNormLayer();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
   /**
    * reshape tensor of ioDesc_.
    */
   void reshape(int batchSize);
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h
index b869c695bd753076c6501a1253fcad22139ccadf..919b1efc4e453219a6c2ab1a11c61ccb99404084 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -45,9 +45,10 @@ public:
 
   ~CudnnConvLayer();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
   void addBiases();
   void bpropBiases();
 };
diff --git a/paddle/gserver/layers/CudnnPoolLayer.h b/paddle/gserver/layers/CudnnPoolLayer.h
index 072b2f9513f4ef8aed03ecfa7a9014667bb2ce9e..f0aa22fe3af90c9233330c15fc56c3696a624446 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.h
+++ b/paddle/gserver/layers/CudnnPoolLayer.h
@@ -45,7 +45,8 @@ public:
                         hl_pooling_mode_t* mode = nullptr);
   explicit CudnnPoolLayer(const LayerConfig& config);
   ~CudnnPoolLayer();
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   /**
    * Reshape input and output tensor descriptor.
@@ -53,8 +54,8 @@ public:
    * So reshaping is needed.
    */
   void reshape(int batchSize);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/DataLayer.h b/paddle/gserver/layers/DataLayer.h
index d3bc97bb6cd0b8faf8ae108a0147d77854596e25..a9cf1f943c260a934564a19aecda28c24ccff43c 100644
--- a/paddle/gserver/layers/DataLayer.h
+++ b/paddle/gserver/layers/DataLayer.h
@@ -33,13 +33,13 @@ public:
   /**
    * Prefetch sparse matrix/ids only.
    */
-  void prefetch() { output_ = data_; }
+  void prefetch() override { output_ = data_; }
 
   /**
    * Forward propagation. Copy data_ (value, in, grad, ids, cpuSequenceDims,
    * sequenceStartPositions, subSequenceStartPositions, strs) to output_.
    */
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     copyDataToOutput(output_);
     if (FLAGS_show_layer_stat) {
@@ -50,9 +50,9 @@ public:
   /**
    * Data layer's backward propagation do nothing.
    */
-  virtual void backward(const UpdateCallback& callback) { (void)callback; }
+  void backward(const UpdateCallback& callback) override { (void)callback; }
 
-  virtual void copyOutputToOtherDevice() {
+  void copyOutputToOtherDevice() override {
     for (size_t i = 0; i != outputOtherDevice_.size(); i++) {
       copyDataToOutput(outputOtherDevice_[i]);
     }
diff --git a/paddle/gserver/layers/DataNormLayer.h b/paddle/gserver/layers/DataNormLayer.h
index b3043cffd210feaf9ddaed096de762aa7e2a6139..f0fd044e5b83430a4028a227c7d5a31b6fa86f20 100644
--- a/paddle/gserver/layers/DataNormLayer.h
+++ b/paddle/gserver/layers/DataNormLayer.h
@@ -44,10 +44,11 @@ public:
 
   ~DataNormLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
   int mode_;
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index fa53e2e4cfc8a220eeb2a637d7fe759f1744f9d5..686f1fa0543cb3629ac223316e595e642a9e7d76 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -27,14 +27,14 @@ class EosIdCheckLayer : public Layer {
 public:
   explicit EosIdCheckLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
     return ret;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
 
     const Argument& input = getInput(0);
@@ -42,7 +42,7 @@ public:
     output_.ids->isEqualTo(*input.ids, config_.eos_id());
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(eos_id, EosIdCheckLayer);
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
index 8445642217cf3e83441ddd9beec80f99faf946bc..aabcdfc392d3e242df84c820c336d8b32c7cb04f 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -48,7 +48,8 @@ public:
 
   ~ExpandConvBaseLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   size_t getOutputSize();
   /**
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index de81a017e1bac38a5717e8c83a028f5408c0e084..60681690e5dd55b2e9aa4e1f25758db6033665a6 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -35,10 +35,11 @@ public:
 
   ~ExpandConvLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h
index 4a527d67995e255c65fea1f310551f8de5630030..00b8f241889fdd3f423d75dedd9068aa3674f190 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -34,10 +34,11 @@ public:
 
   ~ExpandConvTransLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ExpandLayer.h b/paddle/gserver/layers/ExpandLayer.h
index 5c636144235cdb3800aa921464985616f8ee9203..c21b3350e2bc4b136eaf50f96799f479a13df6bd 100644
--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -53,10 +53,11 @@ public:
 
   ~ExpandLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/FeatureMapExpandLayer.cpp b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
index d023074c52167554358d0d4df7ec40cfba9da2a6..b3850f543af74abbddaac5bb0a32851f2d3297d0 100644
--- a/paddle/gserver/layers/FeatureMapExpandLayer.cpp
+++ b/paddle/gserver/layers/FeatureMapExpandLayer.cpp
@@ -46,10 +46,11 @@ public:
 
   ~FeatureMapExpandLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(featmap_expand, FeatureMapExpandLayer);
@@ -95,6 +96,9 @@ void FeatureMapExpandLayer::forward(PassType passType) {
 
 void FeatureMapExpandLayer::backward(const UpdateCallback& callback) {
   MatrixPtr inGrad = getInputGrad(0);
+  if (NULL == inGrad) {
+    return;
+  }
   MatrixPtr outGrad = getOutputGrad();
   size_t batchSize = getInput(0).getBatchSize();
   int imgSize = inGrad->getWidth();
diff --git a/paddle/gserver/layers/FullyConnectedLayer.h b/paddle/gserver/layers/FullyConnectedLayer.h
index ccd584585c97cb679332cbd10d6f3a1306ca5a54..64e7a050125aa92b414e58c7678bf87efd01103f 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.h
+++ b/paddle/gserver/layers/FullyConnectedLayer.h
@@ -36,13 +36,14 @@ public:
   explicit FullyConnectedLayer(const LayerConfig& config) : Layer(config) {}
   ~FullyConnectedLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
-  void prefetch();
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
index 930d9a056164e7c677adb53b7b67901364da1309..d3aeea921801da301b2829736059130aec14cef6 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -314,13 +314,13 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
 
   batchValue_->resizeOrCreate(*output_.value);
   batchValue_->copy(*inputValue, *gate_.value, /* seq2batch */ true);
-  if (bias_ && bias_->getWGrad()) {
+  if (bias_) {
     gate_.value->addBias(*(bias_->getW()), 1);
   }
 
   {
     int numBatch = batchValue_->getNumBatch();
-    int batchSize = 0;
+    int curBatchSize = 0;
     AsyncGpuBlock asyncGpuBlock;
     for (int n = 0; n < numBatch; n++) {
       MatrixPtr outputValueTmp = batchValue_->getBatchValue(n);
@@ -330,16 +330,17 @@ void GatedRecurrentLayer::forwardBatch(int batchSize,
       gruValue.resetOutputValue =
           (batchValue_->getBatchValue(*resetOutput_.value, n))->getData();
 
-      batchSize = outputValueTmp->getHeight();
+      curBatchSize = outputValueTmp->getHeight();
       gruValue.prevOutValue =
-          (n == 0 ? nullptr
-                  : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
+          (n == 0
+               ? nullptr
+               : (batchValue_->getBatchValue(n - 1, curBatchSize))->getData());
 
       {
         if (useGpu_) {
-          GruCompute::forward<1>(gruValue, getSize(), batchSize);
+          GruCompute::forward<1>(gruValue, getSize(), curBatchSize);
         } else {
-          GruCompute::forward<0>(gruValue, getSize(), batchSize);
+          GruCompute::forward<0>(gruValue, getSize(), curBatchSize);
         }
       }
     }
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index 25770ce57fbaa4d16c9454d824800f2f0c7f957d..58dd760eb870e9570f8a406f098f69c5fdf6477a 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -50,17 +50,18 @@ class GatedRecurrentLayer : public Layer, public GruCompute {
 public:
   explicit GatedRecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   void forwardSequence(int batchSize,
diff --git a/paddle/gserver/layers/GetOutputLayer.cpp b/paddle/gserver/layers/GetOutputLayer.cpp
index b77fdbb30e11b72b0c7de765df173204aa0b6851..4e29efd4612b18e655ba7674a3fd7890ce3f0e79 100644
--- a/paddle/gserver/layers/GetOutputLayer.cpp
+++ b/paddle/gserver/layers/GetOutputLayer.cpp
@@ -22,17 +22,18 @@ public:
 
   ~GetOutputLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     if (!Layer::init(layerMap, parameterMap)) return false;
     CHECK_EQ(1U, inputLayers_.size());
     CHECK_NE(inputArgument_[0], "");
     return true;
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     output_ = getPrev(0)->getOutput(inputArgument_[0]);
   }
-  void backward(const UpdateCallback& callback = nullptr) {}
+  void backward(const UpdateCallback& callback = nullptr) override {}
 };
 
 REGISTER_LAYER(get_output, GetOutputLayer);
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index a56af21317d1d43c836f7fe599a4dc614804bfec..3340e38e62cc396fd619cfa2a1fad57b0a8cf4c7 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index 4a1006aa941f396c233a0cecfc38228f1f9fafe1..5b5cb25f9269a30f79d602b342411d0e6bfa429b 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -55,10 +55,11 @@ public:
 
   ~GruStepLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(gru_step, GruStepLayer);
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 70da3ac126e147387b20c5a97d0116a5a679e044..3f6875fb9f007c0938bfcd7cad99c73b4ba1511b 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -61,9 +61,10 @@ class HierarchicalSigmoidLayer : public Layer {
 public:
   explicit HierarchicalSigmoidLayer(const LayerConfig& config)
       : Layer(config) {}
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index 44fe1fb1fea4203a4a1cac67c581b13adda65966..eac7428571980baf6b2ddb8b2cc85b9c98afa5d6 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -43,10 +43,11 @@ public:
 
   ~InterpolationLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(interpolation, InterpolationLayer);
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index c47943f81c01589eada4b825d54be5c69314b6fa..f76d41ad3e8a3b1730f9d50c0773ee4f61ddb541 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Error.h"
 #include "paddle/utils/Logging.h"
 
 #include "AddtoLayer.h"
@@ -334,7 +335,8 @@ void Layer::showOutputStats() {
 
 void Layer::forwardActivation() {
   /* activation */
-  activation_->forward(output_);
+  auto status = activation_->forward(output_);
+  status.check();
 
   /* dropout */
   if (config_.drop_rate() > 0) {
@@ -372,7 +374,8 @@ void Layer::backwardActivation() {
     oGrad->dotMul(*oGrad, *dropOutMask_);
   }
 
-  activation_->backward(output_);
+  auto status = activation_->backward(output_);
+  status.check();
 }
 
 void Layer::forwardDropOut() {
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 6dfd48fb96618102b71e9f6de79a348dc7f62647..7c4bea072157aac17787afab184b51c09ff656f2 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -311,6 +311,7 @@ public:
         return *output->second;
       } else {
         LOG(FATAL) << "No specific output " << str;
+        return *((Argument*)nullptr);
       }
     }
   }
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index 0d65b4158ebdc04f199048bbba98317c89fc8beb..2588fad2793961da2b2af889e8985f49540f1bda 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index f49df2c412f05f74da455d41cdf7c9bd4b9ec2e2..c45a52d2e9aaf41a8e02495cc2deae60ab13650a 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -74,17 +74,18 @@ class LstmLayer : public Layer, public LstmCompute {
 public:
   explicit LstmLayer(const LayerConfig &config) : Layer(config) {}
 
-  bool init(const LayerMap &layerMap, const ParameterMap &parameterMap);
+  bool init(const LayerMap &layerMap,
+            const ParameterMap &parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback &callback);
+  void backward(const UpdateCallback &callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   /**
diff --git a/paddle/gserver/layers/LstmStepLayer.cpp b/paddle/gserver/layers/LstmStepLayer.cpp
index 5fc6474b8653f4c7dac284e11d88f803405169a3..568277a90c62c73a811dcbf66782a4bdc4021b81 100644
--- a/paddle/gserver/layers/LstmStepLayer.cpp
+++ b/paddle/gserver/layers/LstmStepLayer.cpp
@@ -35,10 +35,11 @@ public:
 
   ~LstmStepLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(lstm_step, LstmStepLayer);
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index fb41af563195496a57eafcc52b49eadac697fa0a..be0f2a07d4aae253b7b18dbe406c4b94bf96bc8e 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -181,11 +181,12 @@ class MDLstmLayer : public LstmLayer {
 public:
   explicit MDLstmLayer(const LayerConfig& config) : LstmLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   void forwardOneSequence(int start, CoordIterator& coordIter);
@@ -506,9 +507,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
           *frameState_[start + preOffsetV[i]].value, *checkFgOneDim, 1.0, 1.0);
     }
   }
-  activationGate_->forward(frameInputGate_[idxCurr]);
-  activationGate_->forward(frameForgetGate_[idxCurr]);
-  activation_->forward(frameInputNode_[idxCurr]);
+  auto status = activationGate_->forward(frameInputGate_[idxCurr]);
+  status.check();
+  status = activationGate_->forward(frameForgetGate_[idxCurr]);
+  status.check();
+  status = activation_->forward(frameInputNode_[idxCurr]);
+  status.check();
 
   frameState_[idxCurr].value->zeroMem();
   for (int i = 0; i < numDims_; i++) {
@@ -530,10 +534,12 @@ void MDLstmLayer::forwardGate2OutputSequence(int start,
 
   frameOutputGate_[idxCurr].value->addDotMul(
       *frameState_[idxCurr].value, *checkOg_, 1.0, 1.0);
-  activationGate_->forward(frameOutputGate_[idxCurr]);
+  status = activationGate_->forward(frameOutputGate_[idxCurr]);
+  status.check();
 
   framePreOutput_[idxCurr].value->copyFrom(*(frameState_[idxCurr].value));
-  activationState_->forward(framePreOutput_[idxCurr]);
+  status = activationState_->forward(framePreOutput_[idxCurr]);
+  status.check();
 
   frameOutput_[idxCurr].value->dotMul(*framePreOutput_[idxCurr].value,
                                       *frameOutputGate_[idxCurr].value);
@@ -640,12 +646,12 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
 
   framePreOutput_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                         *frameOutputGate_[idxCurr].value);
-  activationState_->backward(framePreOutput_[idxCurr]);
+  activationState_->backward(framePreOutput_[idxCurr]).check();
   frameState_[idxCurr].grad->copyFrom(*(framePreOutput_[idxCurr].grad));
 
   frameOutputGate_[idxCurr].grad->dotMul(*frameOutput_[idxCurr].grad,
                                          *framePreOutput_[idxCurr].value);
-  activationGate_->backward(frameOutputGate_[idxCurr]);
+  activationGate_->backward(frameOutputGate_[idxCurr]).check();
 
   frameState_[idxCurr].grad->addDotMul(
       *frameOutputGate_[idxCurr].grad, *checkOg_, 1.0, 1.0);
@@ -702,9 +708,9 @@ void MDLstmLayer::backwardGate2OutputSequence(int start,
     }
   }
 
-  activationGate_->backward(frameInputGate_[idxCurr]);
-  activationGate_->backward(frameForgetGate_[idxCurr]);
-  activation_->backward(frameInputNode_[idxCurr]);
+  activationGate_->backward(frameInputGate_[idxCurr]).check();
+  activationGate_->backward(frameForgetGate_[idxCurr]).check();
+  activation_->backward(frameInputNode_[idxCurr]).check();
 
   if (bias_->getWGrad()) {
     for (int i = 0; i < numDims_; i++) {
diff --git a/paddle/gserver/layers/MaxIdLayer.cpp b/paddle/gserver/layers/MaxIdLayer.cpp
index 80555f3f7b324100c059c3356a4a2e462bc6face..9e72b167cd963ae4928bf85503214dd7cee31148 100644
--- a/paddle/gserver/layers/MaxIdLayer.cpp
+++ b/paddle/gserver/layers/MaxIdLayer.cpp
@@ -30,8 +30,8 @@ private:
 public:
   explicit MaxIdLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
 
@@ -40,7 +40,7 @@ public:
     return ret;
   }
 
-  virtual void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     const Argument& input = getInput(0);
     size_t batchSize = input.getBatchSize();
@@ -54,7 +54,7 @@ public:
     input.value->rowMax(*output_.ids, *output_.in);
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(maxid, MaxIdLayer);
diff --git a/paddle/gserver/layers/MaxLayer.h b/paddle/gserver/layers/MaxLayer.h
index 472ee0ccca196250f4b81fc1e921aaee5f352b7e..baa58ca2d7a6970f0d2f3ef6f8609404c82efa30 100644
--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -42,14 +42,13 @@ protected:
 public:
   explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
 
-  ~MaxLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     return SequencePoolLayer::init(layerMap, parameterMap);
   }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MaxOutLayer.h b/paddle/gserver/layers/MaxOutLayer.h
index 59c2245e0d6490d4f8e1b77b1c88267747aaa63a..73fd8536be56b2c620fbfdea1937f3acd593bf05 100644
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
@@ -45,10 +45,11 @@ public:
   explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
   virtual ~MaxOutLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index 9655a152c7bc96fb3941fcbd9db4ff71a59e4ebe..755c9deb8b1be34b6f44a7b30b107f99102a3853 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -35,21 +35,22 @@ public:
 
   ~MixedLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  virtual void prefetch();
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
-  virtual void resetState();
+  void prefetch() override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+  void resetState() override;
   /**
    * setState() should be called after getState().
    * Argument state consists of all projections states.
    */
-  virtual void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
   /**
    * Return state which consists of all projections states.
    */
-  virtual LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   std::vector<std::unique_ptr<Projection>> projections_;
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index b48073c80b6f57cd86ceb80b9d749548c3acc1ac..546ef9c1f24d1bc8abe68ba8b2fe6ab55f4b03e5 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <random>
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index d09720c5255747df11d4d7367f67a245e63e6846..297972b3cd9e4dfba94e2597053ab7c7c560c9dd 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -69,10 +69,11 @@ public:
 
   ~MultiplexLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 private:
   /**
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index 5ab765247f63dfe6e6651ca4d27dc7183a9f33e1..0bc2ef11829337d9b765ef00066289494eb984b3 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -61,7 +61,8 @@ public:
         rand_(0, config.num_classes() - 1),
         prepared_(false) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     /* Initialize the basic parent class */
     Layer::init(layerMap, parameterMap);
 
@@ -146,7 +147,7 @@ public:
     prepared_ = true;
   }
 
-  void prefetch() {
+  void prefetch() override {
     prepareSamples();
     IVector::resizeOrCreate(labelIds_, samples_.size(), useGpu_);
     int* ids = labelIds_->getData();
@@ -163,7 +164,7 @@ public:
     }
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
 
     CHECK(!useGpu_) << "GPU is not supported";
@@ -193,12 +194,13 @@ public:
       forwardOneInput(l);
     }
 
-    activation_->forward(sampleOut_);
+    auto status = activation_->forward(sampleOut_);
+    status.check();
 
     forwardCost();
   }
 
-  void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     Matrix::resizeOrCreate(sampleOut_.grad,
                            1,
                            samples_.size(),
@@ -207,7 +209,8 @@ public:
 
     backwardCost();
 
-    activation_->backward(sampleOut_);
+    auto status = activation_->backward(sampleOut_);
+    status.check();
 
     if (biases_->getWGrad()) {
       backwardBias(callback);
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index 011bab8fdedab00b336290a245b82de07496b554..e77faaa322570933b3ea2de877b7859857306432 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -30,7 +30,8 @@ class NormLayer : public Layer {
 public:
   explicit NormLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     Layer::init(layerMap, parameterMap);
     return true;
   }
@@ -56,9 +57,10 @@ protected:
 public:
   explicit ResponseNormLayer(const LayerConfig& config) : NormLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType) { LOG(FATAL) << "Not implemented"; }
-  void backward(const UpdateCallback& callback = nullptr) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override { LOG(FATAL) << "Not implemented"; }
+  void backward(const UpdateCallback& callback = nullptr) override {
     LOG(FATAL) << "Not implemented";
   }
 };
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index 262d757c67e105a8d65619eed91de65d34cfe35e..4331009de7e98d2326049e563e46a55a20366507 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -59,7 +59,6 @@ bool CMRProjectionNormLayer::init(const LayerMap& layerMap,
 
 void CMRProjectionNormLayer::forward(PassType passType) {
   Layer::forward(passType);
-
   /* malloc memory for the output_ if necessary */
   /* note: one sample correspond to one row */
   MatrixPtr input = inputLayers_[0]->getOutputValue();
@@ -67,34 +66,36 @@ void CMRProjectionNormLayer::forward(PassType passType) {
   int size = getSize();
   resetOutput(batchSize, size);
 
-  MatrixPtr outV = getOutputValue();
-
   Matrix::resizeOrCreate(denoms_, batchSize, size, /* trans */ false, useGpu_);
 
-  dims_ = {batchSize, channels_, imgSizeH_, imgSizeW_};
-  forward_[0]->calc(
-      {Tensor(input->getData(), dims_)},
-      {Tensor(outV->getData(), dims_), Tensor(denoms_->getData(), dims_)},
-      {});
+  shape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+
+  // prepare forward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  outputs.addArg(*getOutputValue(), shape_, ASSIGN_TO);
+  outputs.addArg(*denoms_, shape_, ASSIGN_TO);
+
+  forward_[0]->calc(inputs, outputs);
 }
 
 void CMRProjectionNormLayer::backward(const UpdateCallback& callback) {
   (void)callback;
 
-  if (NULL == inputLayers_[0]->getOutputGrad()) {
+  if (NULL == getInputGrad(0)) {
     return;
   }
-  /* Do derivation */
-  MatrixPtr preOutGrad = inputLayers_[0]->getOutputGrad();
-  MatrixPtr localGrad = getOutputGrad();
-  MatrixPtr localOutV = getOutputValue();
-  MatrixPtr preOutV = inputLayers_[0]->getOutputValue();
-
-  backward_[0]->calc({Tensor(preOutV->getData(), dims_),
-                      Tensor(localOutV->getData(), dims_),
-                      Tensor(localGrad->getData(), dims_),
-                      Tensor(denoms_->getData(), dims_)},
-                     {Tensor(preOutGrad->getData(), dims_)},
-                     {});
+
+  // prepare backward arguments
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), shape_);
+  inputs.addArg(*getOutputValue(), shape_);
+  inputs.addArg(*getOutputGrad(), shape_);
+  inputs.addArg(*denoms_, shape_);
+  outputs.addArg(*getInputGrad(0), shape_, ADD_TO);
+
+  backward_[0]->calc(inputs, outputs);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 6b2c5dde0d74db4b292d5006d19ce54d3194017e..2997ae8848c438fa13037ccf03c1faca9ad73224 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -36,11 +36,12 @@ public:
 
   size_t getSize();
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 protected:
-  Dims dims_;
+  TensorShape shape_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index b606e4436567eb2a8df9fd501a2af8c8aa1d2fdf..283fdb003a2bb9474eac7a379ceb2c02027cfc5f 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -38,10 +38,11 @@ public:
 
   ~OuterProdLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(out_prod, OuterProdLayer);
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bb618c09f9777785d93995fa7140dd4a5383cd1b
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -0,0 +1,134 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "PadLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(pad, PadLayer);
+
+/**
+ * Reads the padding configuration of the layer's single input and creates
+ * the "Pad" (forward) and "PadGrad" (backward) functions.
+ */
+bool PadLayer::init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // Validate the input count before reading inputs(0).
+  CHECK_EQ(config_.inputs_size(), 1);
+  auto& pad_conf = config_.inputs(0).pad_conf();
+  auto& img_conf = pad_conf.image_conf();
+  // Dim 0 (batch size) is a placeholder; setTensorDim() sets it per batch.
+  inDims_ = TensorShape(
+      {0,
+       img_conf.channels(),
+       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
+       img_conf.img_size()});
+
+  // Every pad_* field must carry exactly two values: [0] feeds the
+  // "*start" setting and [1] the "*end" setting below.
+  CHECK_EQ(2, pad_conf.pad_c_size());
+  CHECK_EQ(2, pad_conf.pad_h_size());
+  CHECK_EQ(2, pad_conf.pad_w_size());
+  padc_.push_back(pad_conf.pad_c(0));
+  padc_.push_back(pad_conf.pad_c(1));
+  padh_.push_back(pad_conf.pad_h(0));
+  padh_.push_back(pad_conf.pad_h(1));
+  padw_.push_back(pad_conf.pad_w(0));
+  padw_.push_back(pad_conf.pad_w(1));
+
+  outDims_ = TensorShape(4);
+  setOutDims(0);
+
+  // Forward and backward functions take the same settings; build them once.
+  FuncConfig padFuncConf;
+  padFuncConf.set("cstart", padc_[0])
+      .set("cend", padc_[1])
+      .set("hstart", padh_[0])
+      .set("hend", padh_[1])
+      .set("wstart", padw_[0])
+      .set("wend", padw_[1]);
+  createFunction(forward_, "Pad", padFuncConf);
+  createFunction(backward_, "PadGrad", padFuncConf);
+
+  return true;
+}
+
+/**
+ * Recomputes outDims_ from inDims_ plus the configured padding.
+ * \param batchSize, value stored in dimension 0 of outDims_
+ */
+void PadLayer::setOutDims(const size_t batchSize) {
+  outDims_.reshape({batchSize,
+                    inDims_[1] + padc_[0] + padc_[1],
+                    inDims_[2] + padh_[0] + padh_[1],
+                    inDims_[3] + padw_[0] + padw_[1]});
+}
+
+/**
+ * Updates inDims_ for the current batch, then refreshes outDims_.
+ * A frame height/width of 0 from the input keeps the configured value.
+ */
+void PadLayer::setTensorDim(const size_t batchSize) {
+  CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
+  inDims_.setDim(0, batchSize);
+  int h = inputLayers_[0]->getOutput().getFrameHeight();
+  if (h != 0) inDims_.setDim(2, h);
+  int w = inputLayers_[0]->getOutput().getFrameWidth();
+  if (w != 0) inDims_.setDim(3, w);
+  setOutDims(batchSize);
+}
+
+/**
+ * Sizes the output for the current batch and runs the "Pad" function.
+ */
+void PadLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = inputLayers_[0]->getOutputValue();
+  size_t batchSize = input->getHeight();
+  setTensorDim(batchSize);
+  int size = outDims_[1] * outDims_[2] * outDims_[3];
+  resetOutput(batchSize, size);
+  REGISTER_TIMER_INFO("PadForward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+}
+
+/**
+ * Runs "PadGrad": the output gradient is added (ADD_TO) into the input
+ * gradient. Does nothing when the input has no gradient buffer.
+ */
+void PadLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  // Nothing to propagate if the input layer holds no gradient.
+  if (nullptr == getInputGrad(0)) {
+    return;
+  }
+  REGISTER_TIMER_INFO("PadBackward", getName().c_str());
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b2bbf28082e630aeb429ee997a1d43ce7ba05d1c
--- /dev/null
+++ b/paddle/gserver/layers/PadLayer.h
@@ -0,0 +1,47 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief  This layer pads its input with zeros along the given dimensions.
+ *         The input and output are 4D tensors. Zeros are padded from the 2nd
+ *         to the 4th dimension according to padc_, padh_ and padw_.
+ */
+class PadLayer : public Layer {
+public:
+  explicit PadLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~PadLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  void setOutDims(const size_t batchSize);
+  void setTensorDim(const size_t batchSize);
+
+  std::vector<int> padc_;
+  std::vector<int> padh_;
+  std::vector<int> padw_;
+  TensorShape inDims_;
+  TensorShape outDims_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ParameterReluLayer.h b/paddle/gserver/layers/ParameterReluLayer.h
index a82497fc01ca1f63719a905c7545911a7e05289b..9a11b81ebf1f5c06355fc107b00aa69b65148ed5 100644
--- a/paddle/gserver/layers/ParameterReluLayer.h
+++ b/paddle/gserver/layers/ParameterReluLayer.h
@@ -56,9 +56,10 @@ public:
 
   ~ParameterReluLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index 318b89d7c2bce896d183eba8c48c230d962918a5..d43292ad2d4bbe1229ca59ca21bee92c9ec006a3 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -46,7 +46,8 @@ public:
    */
   static Layer* create(const LayerConfig& config);
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PoolProjectionLayer.h b/paddle/gserver/layers/PoolProjectionLayer.h
index 3dc6af2f0e9fb1a12eca7bc0c531a2e7b151fb8a..e31116de8ccb1f6b847c9fff47961bedfad1a79c 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.h
+++ b/paddle/gserver/layers/PoolProjectionLayer.h
@@ -40,7 +40,7 @@ public:
 
   size_t getSize();
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index 64fecab5b08354ceea8b290b78eede72d24a98a2..31c34b43e2995a2bf7f4d16629a8172a7e76c8e1 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -40,10 +40,11 @@ public:
 
   ~PowerLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(power, PowerLayer);
diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp
index ac7f658864fee6812ea89d1dbd84ad4db94e3035..de198af111be4200dd1b240f6de9464e3f43b06d 100644
--- a/paddle/gserver/layers/PrintLayer.cpp
+++ b/paddle/gserver/layers/PrintLayer.cpp
@@ -19,38 +19,17 @@ namespace paddle {
 class PrintLayer : public Layer {
 public:
   explicit PrintLayer(const LayerConfig& config) : Layer(config) {}
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {}
-};
 
-void PrintLayer::forward(PassType passType) {
-  Layer::forward(passType);
-  for (size_t i = 0; i != inputLayers_.size(); ++i) {
-    const auto& argu = getInput(i);
-    const std::string& name = inputLayers_[i]->getName();
-    if (argu.value) {
-      std::ostringstream os;
-      argu.value->print(os);
-      LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-    }
-    if (argu.ids) {
-      std::ostringstream os;
-      argu.ids->print(os, argu.ids->getSize());
-      LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-    }
-    if (auto startPos = argu.sequenceStartPositions) {
-      std::ostringstream os;
-      startPos->getVector(false)->print(os, startPos->getSize());
-      LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-    }
-    if (auto subStartPos = argu.subSequenceStartPositions) {
-      std::ostringstream os;
-      subStartPos->getVector(false)->print(os, subStartPos->getSize());
-      LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                << os.str();
+  void forward(PassType passType) override {
+    Layer::forward(passType);
+    for (size_t i = 0; i != inputLayers_.size(); ++i) {
+      getInput(i).printValueString(LOG(INFO),
+                                   "layer=" + inputLayers_[i]->getName() + " ");
     }
   }
-}
+
+  void backward(const UpdateCallback& callback) override {}
+};
 
 REGISTER_LAYER(print, PrintLayer);
 
diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp
index 36ace7597cd66cc2d83353ec999a75c79dd1e33e..bcf5e912a50fef2cec8ebdf1e0dad9efa43fba2f 100644
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -30,10 +30,11 @@ namespace paddle {
 class PriorBoxLayer : public Layer {
 public:
   explicit PriorBoxLayer(const LayerConfig& config) : Layer(config) {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback) {}
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override {}
 
 protected:
   int numPriors_;
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 8cd8042479eafdbd6b8dac03b63b344fcf9526b1..778a7fe13d8a2b669831396e69546446b4745e61 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -88,11 +88,37 @@ public:
    */
   virtual LayerStatePtr getState() { return nullptr; }
 
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
+
   /**
    * Get output size of projection.
    */
   size_t getOutputSize() const { return config_.output_size(); }
 
+protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, forward_ or backward_ of this projection
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
 protected:
   /// Config of projection
   ProjectionConfig config_;
@@ -106,5 +132,9 @@ protected:
   const Argument* out_;
   /// Store `passType` passed to forward()
   PassType passType_;
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 55e0fdfb9048c02b2dcd474c6887eee180328260..e4c2b483d2fa4032735858dab17647592791a9c7 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -45,17 +45,18 @@ class RecurrentLayer : public Layer {
 public:
   explicit RecurrentLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 
-  void resetState();
+  void resetState() override;
 
-  void setState(LayerStatePtr state);
+  void setState(LayerStatePtr state) override;
 
-  LayerStatePtr getState();
+  LayerStatePtr getState() override;
 
 protected:
   /**
@@ -217,21 +218,22 @@ void RecurrentLayer::forwardOneSequence(int start, int length) {
     if (prevOutput_) {
       frameOutput_[start].value->mul(*prevOutput_, *weight_->getW(), 1, 1);
     }
-    activation_->forward(frameOutput_[start]);
+    activation_->forward(frameOutput_[start]).check();
+
     for (int i = 1; i < length; ++i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i - 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
     if (prevOutput_) {
       prevOutput_->assign(*frameOutput_[start + length - 1].value);
     }
   } else {
-    activation_->forward(frameOutput_[start + length - 1]);
+    activation_->forward(frameOutput_[start + length - 1]).check();
     for (int i = length - 2; i >= 0; --i) {
       frameOutput_[start + i].value->mul(
           *frameOutput_[start + i + 1].value, *weight_->getW(), 1, 1);
-      activation_->forward(frameOutput_[start + i]);
+      activation_->forward(frameOutput_[start + i]).check();
     }
   }
 }
@@ -280,11 +282,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
   MatrixPtr weightT = weight_->getW()->getTranspose();
   if (!reversed_) {
     for (int i = length - 1; i > 0; --i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i - 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start]);
+    activation_->backward(frameOutput_[start]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start, length - 1)->getTranspose(),
@@ -294,11 +296,11 @@ void RecurrentLayer::backwardOneSequence(int start, int length) {
     }
   } else {
     for (int i = 0; i < length - 1; ++i) {
-      activation_->backward(frameOutput_[start + i]);
+      activation_->backward(frameOutput_[start + i]).check();
       frameOutput_[start + i + 1].grad->mul(
           *frameOutput_[start + i].grad, *weightT, 1, 1);
     }
-    activation_->backward(frameOutput_[start + length - 1]);
+    activation_->backward(frameOutput_[start + length - 1]).check();
     if (weight_->getWGrad()) {
       weight_->getWGrad()->mul(
           *output_.value->subMatrix(start + 1, length - 1)->getTranspose(),
@@ -333,7 +335,7 @@ void RecurrentLayer::forwardBatch(int batchSize,
       }
       Argument arg;
       arg.value = batch2;
-      activation_->forward(arg);
+      activation_->forward(arg).check();
     }
   }
   batchValue_->copyBackSeq(*output_.value);
@@ -363,7 +365,7 @@ void RecurrentLayer::backwardBatch(int batchSize,
       Argument arg;
       arg.value = batch1;
       arg.grad = batch2;
-      activation_->backward(arg);
+      activation_->backward(arg).check();
 
       if (n != 0) {
         batch1 = batchGrad_->getBatchValue(n - 1, batch2->getHeight());
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index af8dd61d84e2e53ca26dc054d0516e62ab7aa216..78a74ff19a38cd205f3a46900bf716e2e1b1e4d5 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -33,15 +33,15 @@ public:
   void initSubNetwork(NeuralNetwork* rootNetwork,
                       const ModelConfig& config,
                       const std::vector<ParameterType>& parameterTypes,
-                      bool useGpu);
+                      bool useGpu) override;
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     REGISTER_TIMER_INFO("RecurrentGroupFwTime", getName().c_str());
     const std::vector<Argument> inArgs;
     std::vector<Argument> outArgs;
     network_->forward(inArgs, &outArgs, passType);
   }
-  void backward(const UpdateCallback& callback) {
+  void backward(const UpdateCallback& callback) override {
     REGISTER_TIMER_INFO("RecurrentGroupBwTime", getName().c_str());
     network_->backward(nullptr);
 
@@ -53,7 +53,8 @@ public:
   /**
    * @see Layer.accessSubNetwork
    */
-  void accessSubNetwork(const std::function<void(NeuralNetwork&)>& callback) {
+  void accessSubNetwork(
+      const std::function<void(NeuralNetwork&)>& callback) override {
     callback(*network_);
   }
 
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index 7fcb3adea01b9d16394ee90b751b10902dc3a190..eb3b63c106901f89dd75cc2a495477b240d40e3c 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -20,18 +20,19 @@ namespace paddle {
 /**
  * @brief A layer for resizing a minibatch matrix h*w to h'*w'
  * @note
- * origin matrix height * witdth)
+ * origin matrix: height * width
  * resize matrix: (height * width / size) * size
  */
 class ResizeLayer : public Layer {
 public:
   explicit ResizeLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  void backward(const UpdateCallback& callback);
+  void backward(const UpdateCallback& callback) override;
 };
 
 REGISTER_LAYER(resize, ResizeLayer);
diff --git a/paddle/gserver/layers/RotateLayer.cpp b/paddle/gserver/layers/RotateLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7c71088d786ab218bf0f71b577985c023dd1436f
--- /dev/null
+++ b/paddle/gserver/layers/RotateLayer.cpp
@@ -0,0 +1,111 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RotateLayer.h"
+
+namespace paddle {
+
+REGISTER_LAYER(rotate, RotateLayer);
+
+/**
+ * Initializes the base layer and reads the per-channel feature map shape
+ * (height, width) from the layer config. Exactly one input is required and
+ * both dimensions must be positive.
+ */
+bool RotateLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  height_ = config_.height();
+  width_ = config_.width();
+  CHECK_GT(height_, 0);
+  CHECK_GT(width_, 0);
+  return true;
+}
+
+/**
+ * Rotates every (height_ x width_) channel of every sample in the batch by
+ * 90 degrees clockwise, writing a (width_ x height_) channel at the same
+ * offset of the output row.
+ */
+void RotateLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  MatrixPtr input = getInputValue(0);
+  batchSize_ = input->getHeight();
+  size_ = input->getWidth();
+  const int channelSize = height_ * width_;
+  CHECK_GE(size_, channelSize);
+  CHECK_EQ(size_ % channelSize, 0)
+      << "total size_ is not dividable by (height_ * width_), i.e., "
+      << "channel number should be an integer";
+  channels_ = size_ / channelSize;
+
+  resizeOutput(batchSize_, size_);
+
+  MatrixPtr outV = getOutputValue();
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      const int offset = b * size_ + c * channelSize;
+      MatrixPtr inputSample = Matrix::create(
+          input->getData() + offset, height_, width_, false, useGpu_);
+      MatrixPtr outputSample = Matrix::create(
+          outV->getData() + offset, width_, height_, false, useGpu_);
+      inputSample->rotate(outputSample, false, true /* clock-wise */);
+    }
+  }
+
+  if (getInputGrad(0)) {
+    zeroGrad();
+  }
+}
+
+/**
+ * Propagates the output gradient to the input: each channel gradient is
+ * rotated 90 degrees anti-clockwise and accumulated into the input gradient.
+ * Returns early when either the output or the input gradient is absent.
+ */
+void RotateLayer::backward(const UpdateCallback& callback) {
+  (void)callback;
+
+  MatrixPtr outputGrad = getOutputGrad();
+  MatrixPtr preGrad = getInputGrad(0);
+  // Without an output gradient there is nothing to propagate, and without
+  // an input gradient there is nowhere to accumulate it.
+  if (outputGrad == nullptr || preGrad == nullptr) {
+    return;
+  }
+
+  // the grad should be rotated in the reverse direction
+  const int channelSize = height_ * width_;
+  for (int b = 0; b < batchSize_; b++) {   // for each input feat map
+    for (int c = 0; c < channels_; c++) {  // for each feat channel
+      const int offset = b * size_ + c * channelSize;
+      MatrixPtr inputSampleGrad = Matrix::create(
+          preGrad->getData() + offset, height_, width_, false, useGpu_);
+      MatrixPtr outputSampleGrad = Matrix::create(
+          outputGrad->getData() + offset, width_, height_, false, useGpu_);
+      // tmpGrad is null on entry and dereferenced below, so rotate() is
+      // expected to populate it with the rotated gradient.
+      MatrixPtr tmpGrad = nullptr;
+      outputSampleGrad->rotate(tmpGrad, true, false /* anti clock-wise */);
+      inputSampleGrad->add(*tmpGrad);
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1a64d4d5a51d9c04df07861f02f1bb91eaec088e
--- /dev/null
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -0,0 +1,49 @@
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+/**
+ * A layer for rotating a multi-channel feature map (M x N x C) in the spatial
+ * domain. Each channel is rotated 90 degrees clockwise:
+ * \f[
+ *   y(j,i,:) = x(M-i-1,j,:)
+ * \f]
+ * where \f$x\f$ is (M x N x C) input, and \f$y\f$ is (N x M x C) output.
+ *
+ * The config file api is rotate_layer
+ */
+class RotateLayer : public Layer {
+public:
+  explicit RotateLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+private:
+  int batchSize_;  // number of rows of the input matrix, set in forward()
+  int size_;       // number of columns of the input matrix, set in forward()
+  int height_;     // per-channel map height, read from the config in init()
+  int width_;      // per-channel map width, read from the config in init()
+  int channels_;   // size_ / (height_ * width_), set in forward()
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/SamplingIdLayer.cpp b/paddle/gserver/layers/SamplingIdLayer.cpp
index 59ff5d41b529099277375cd5e1b498f3331c3b0a..2538d99bb71fa1ce6546730b817a49347fe3c5d8 100644
--- a/paddle/gserver/layers/SamplingIdLayer.cpp
+++ b/paddle/gserver/layers/SamplingIdLayer.cpp
@@ -35,8 +35,8 @@ public:
   explicit SamplingIdLayer(const LayerConfig& config)
       : Layer(config), rand1_(0, 1) {}
 
-  virtual bool init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override {
     bool ret = Layer::init(layerMap, parameterMap);
     CHECK_EQ(1UL, inputLayers_.size());
     if (useGpu_) {
@@ -48,7 +48,7 @@ public:
     return ret;
   }
 
-  void forward(PassType passType) {
+  void forward(PassType passType) override {
     Layer::forward(passType);
     if (useGpu_) {
       for (size_t i = 0; i < inputLayers_.size(); i++) {
@@ -83,7 +83,7 @@ public:
     output_.ids->copyFrom(ids.data(), batchSize);
   }
 
-  virtual void backward(const UpdateCallback& callback) {}
+  void backward(const UpdateCallback& callback) override {}
 };
 
 REGISTER_LAYER(sampling_id, SamplingIdLayer);
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 7f0084be6b57f5ce8245609e64c744c1a049a925..a38ee0857a767981eb24e79e96bf6115e9c63720 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -37,10 +37,11 @@ public:
 
   ~ScalingLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(scaling, ScalingLayer);
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 5eacff6b7143996130bea64766ef42c66f4c7310..d9a91de8a6f4daf514f089a3d63cb519223bfdd0 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -192,7 +192,8 @@ void SelectiveFullyConnectedLayer::forward(PassType passType) {
                                nnz,
                                /*trans=*/false,
                                /*useGpu=*/useGpu_);
-    activation_->forward(arg);
+    //! TODO(yuyang18): Why can't we invoke forwardActivation here?
+    activation_->forward(arg).check();
   } else /* train and test in train, not generating */ {
     // during training, this layer output value is *Matrix*, which is input of
     // eg. multi-class-cross-entropy
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
index bdf9a4652cc71710d1d33e8b085c5aec28f6f806..99126fdba542bd142341039af27c3af72b391ca7 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.h
@@ -65,9 +65,10 @@ public:
       : Layer(config), selCols_(nullptr) {}
 
   ~SelectiveFullyConnectedLayer() {}
-  void prefetch();
+  void prefetch() override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
@@ -90,8 +91,8 @@ public:
   void fillSelectiveData(
       const std::shared_ptr<std::vector<std::pair<int*, size_t>>>& candidates);
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 
 private:
   /**
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index 069bc26e602ff7d925b4115d12388b6716676b29..4b24d8f0c852e1bdc887d4ee1465b9ad05d210bb 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -21,9 +21,11 @@ namespace paddle {
 
 /**
  * A layer for concatenating the first sequence with the second sequence
- * following the first
- * Input: two sequences each containing some instances
+ * Input: two inputs with the same number of sequences (ai, bi: i-th sequence)
+ *        seq1 = [a1, a2, ..., an]
+ *        seq2 = [b1, b2, ..., bn]
  * Output: a concatenated sequence of the two input sequences
+ *        out = [a1, b1, a2, b2, ..., an, bn]
  */
 
 class SequenceConcatLayer : public Layer {
@@ -35,10 +37,11 @@ public:
 
   ~SequenceConcatLayer() {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqconcat, SequenceConcatLayer);
@@ -167,13 +170,17 @@ void SequenceConcatLayer::backward(const UpdateCallback& callback) {
     size_t rightNumIns = 0;
     for (size_t seqId = 0; seqId < numSequences1; ++seqId) {
       leftNumIns = starts1[seqId + 1] - starts1[seqId];
-      inputGrad1->subMatrix(starts1[seqId], leftNumIns)
-          ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      if (inputGrad1) {
+        inputGrad1->subMatrix(starts1[seqId], leftNumIns)
+            ->add(*(outputGrad->subMatrix(offset, leftNumIns)));
+      }
       offset += leftNumIns;
 
       rightNumIns = starts2[seqId + 1] - starts2[seqId];
-      inputGrad2->subMatrix(starts2[seqId], rightNumIns)
-          ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      if (inputGrad2) {
+        inputGrad2->subMatrix(starts2[seqId], rightNumIns)
+            ->add(*(outputGrad->subMatrix(offset, rightNumIns)));
+      }
       offset += rightNumIns;
     }
   }
diff --git a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
index 4bfce766c769f4be2e5cc7bf691d539b1d307a47..7a13cd7ad0fecf202613d8da365ea832b41ab04e 100644
--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -42,12 +42,11 @@ public:
   explicit SequenceLastInstanceLayer(const LayerConfig& config)
       : SequencePoolLayer(config) {}
 
-  ~SequenceLastInstanceLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
diff --git a/paddle/gserver/layers/SequencePoolLayer.h b/paddle/gserver/layers/SequencePoolLayer.h
index aa9c132586e55d0f6bccec1689db60145ca2d43f..85b51ccd1dc7e7eb7aa9344b0f7ec6f70a35a0b4 100644
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
@@ -46,12 +46,11 @@ protected:
 public:
   explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
 
-  virtual ~SequencePoolLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index 23924b0490851ad3c3c74d77e7abd8b0af8fc234..433592953b220eda4db4634124a57a2074cef4c0 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -20,9 +20,12 @@ limitations under the License. */
 namespace paddle {
 
 /**
- * A layer for reshaping the sequence
- * Input: a sequence
- * Output: a sequence
+ *  A layer for reshaping the sequence. Assume the input sequence has
+ *  T instances, the dimension of each instance is M, and the input
+ *  reshape_dim is N, then the output sequence has T*M/N instances,
+ *  the dimension of each instance is N.
+ *
+ *  Note that T*M/N must be an integer.
  */
 
 class SequenceReshapeLayer : public Layer {
@@ -34,12 +37,11 @@ protected:
 public:
   explicit SequenceReshapeLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SequenceReshapeLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(seqreshape, SequenceReshapeLayer);
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index b678f414b6d76fa26818cb379fb0f0fb8fc7ec09..faf98744a7fdcf9c2c1712d783f153739ccc8eca 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -39,12 +39,11 @@ class SlopeInterceptLayer : public Layer {
 public:
   explicit SlopeInterceptLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SlopeInterceptLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(slope_intercept, SlopeInterceptLayer);
diff --git a/paddle/gserver/layers/SpatialPyramidPoolLayer.h b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
index 32e88cf141a667d9dffbe7dcba46e9fde721f9e7..7d3cb80443801a947e3d529beb002561c4ac1964 100644
--- a/paddle/gserver/layers/SpatialPyramidPoolLayer.h
+++ b/paddle/gserver/layers/SpatialPyramidPoolLayer.h
@@ -43,9 +43,8 @@ protected:
 public:
   explicit SpatialPyramidPoolLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SpatialPyramidPoolLayer() {}
-
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   ProjectionConfig getConfig(size_t sizeX_,
                              size_t sizeY_,
@@ -54,7 +53,7 @@ public:
                              std::string& poolType_);
   size_t getSize();
 
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index c52fbee26232ad6eb09f84315a57c73e6aa02eb0..19b7ad1869af98e6313fe85a40203fd1e84f31d6 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -35,12 +35,11 @@ protected:
 public:
   explicit SubSequenceLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SubSequenceLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(subseq, SubSequenceLayer);
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index aa99b49380d3682ccf3d89220c0c68f22e458271..00f8519550bcff9bb706b1a28dc0dfcdc06cc54a 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -41,12 +41,11 @@ protected:
 public:
   explicit SumToOneNormLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~SumToOneNormLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 
 REGISTER_LAYER(sum_to_one_norm, SumToOneNormLayer);
diff --git a/paddle/gserver/layers/TensorLayer.h b/paddle/gserver/layers/TensorLayer.h
index ac38ffb620570320497446a6825ca2273b73facc..43992f692d3ce40fa095c8e0190bae01dc2ac3c1 100644
--- a/paddle/gserver/layers/TensorLayer.h
+++ b/paddle/gserver/layers/TensorLayer.h
@@ -44,13 +44,12 @@ protected:
 public:
   explicit TensorLayer(const LayerConfig& config) : Layer(config) {}
 
-  ~TensorLayer() {}
-
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   Weight& getWeight(int idx) { return *weights_[idx]; }
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index b43fa1ebfb003226daed724b4ede3006545e8b07..be10bb74f6b218f0b12dc9f20db9a6ee8af7a478 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -20,7 +20,7 @@ limitations under the License. */
 
 namespace paddle {
 /**
- * A layer for transposition.
+ * A layer for transposing a minibatch matrix.
  * \f[
      y = x^\mathrm{T}
  * \f]
@@ -32,9 +32,10 @@ class TransLayer : public Layer {
 public:
   explicit TransLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType);
-  void backward(const UpdateCallback& callback = nullptr);
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index 4c1de7b3b7d6975c2693eb065f7d3e19cc51a95c..c8b2634a1366ed03846f2331726d04232b5d32ee 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -26,7 +26,8 @@ class ValidationLayer : public Layer {
 public:
   explicit ValidationLayer(const LayerConfig& config) : Layer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
   LayerPtr getOutputLayer() { return inputLayers_[0]; }
 
@@ -37,13 +38,13 @@ public:
     return inputLayers_[2];
   }
 
-  virtual void forward(PassType passType);
+  void forward(PassType passType) override;
 
-  virtual void backward(const UpdateCallback& callback = nullptr);
+  void backward(const UpdateCallback& callback = nullptr) override;
 
   virtual void validationImp(MatrixPtr outputValue, IVectorPtr label) = 0;
 
-  virtual void onPassEnd() = 0;
+  void onPassEnd() override = 0;
 };
 
 /*
@@ -57,11 +58,12 @@ public:
         cpuLabel_(nullptr),
         cpuWeight_(nullptr) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void validationImp(MatrixPtr outputValue, IVectorPtr label);
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
   struct PredictionResult {
     PredictionResult(real __out, int __label) : out(__out), label(__label) {}
@@ -86,11 +88,12 @@ public:
   explicit PnpairValidation(const LayerConfig& config)
       : ValidationLayer(config) {}
 
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
 
-  void validationImp(MatrixPtr outputValue, IVectorPtr label);
+  void validationImp(MatrixPtr outputValue, IVectorPtr label) override;
 
-  void onPassEnd();
+  void onPassEnd() override;
 
 private:
   bool passBegin_;
diff --git a/paddle/gserver/layers/WarpCTCLayer.h b/paddle/gserver/layers/WarpCTCLayer.h
index 3d9ae9249af66dd085f5b6bb7a3c09d8b2276a24..7e8d7379d267886805db2eb7983a4dabbf949914 100644
--- a/paddle/gserver/layers/WarpCTCLayer.h
+++ b/paddle/gserver/layers/WarpCTCLayer.h
@@ -30,9 +30,10 @@ public:
   explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
   ~WarpCTCLayer() {}
 
-  virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
-  virtual void forward(PassType passType);
-  virtual void backward(const UpdateCallback& callback);
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
 
 protected:
   /**
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index c26a2a7f06bc16c113f1812868b5d2b8a5060635..0caa5e1e11e6d42fadfa87149814c4b77b3b6271 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -2,8 +2,7 @@
 
 ################### test_ProtoDataProvider ############
 add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp
-    TestUtil.cpp)
+    test_ProtoDataProvider.cpp)
 
 # test_ProtoDataProvider will mkdir as same name,
 # so if WORKING_DIRECTORY is default directory, then
@@ -15,53 +14,46 @@ add_test(NAME test_ProtoDataProvider
 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
     test_LayerGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_ActivationGrad
     COMMAND test_ActivationGrad)
 ################# test_ConvTrans #######################
 add_unittest_without_exec(test_ConvTrans
     test_ConvTrans.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_ConvTrans
     COMMAND test_ConvTrans)
 ################# test_PriorBox #######################
 add_unittest_without_exec(test_PriorBox
     test_PriorBox.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_PriorBox
     COMMAND test_PriorBox)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
     
 add_test(NAME test_ConvUnify
     COMMAND test_ConvUnify)
 ################# test_BatchNorm #######################
 add_unittest_without_exec(test_BatchNorm
     test_BatchNorm.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_BatchNorm
     COMMAND test_BatchNorm)
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
-    test_Evaluator.cpp
-    TestUtil.cpp)
+    test_Evaluator.cpp)
 
 ################ test_LinearChainCRF ####################
 add_simple_unittest(test_LinearChainCRF)
@@ -72,8 +64,7 @@ add_simple_unittest(test_MultinomialSampler)
 ############## test_PyDataProvider ########################
 if(WITH_PYTHON)
     add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp
-        TestUtil.cpp)
+        test_PyDataProvider.cpp)
 
     add_test(NAME test_PyDataProvider
         COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
@@ -81,18 +72,15 @@ if(WITH_PYTHON)
 endif()
 
 ############### test_RecurrentLayer #######################
-add_unittest(test_RecurrentLayer
-    test_RecurrentLayer.cpp
-    TestUtil.cpp)
+add_simple_unittest(test_RecurrentLayer)
 
 ############### test_WarpCTCLayer #######################
 if(NOT WITH_DOUBLE)
     add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp
-        TestUtil.cpp)
+        test_WarpCTCLayer.cpp)
 
     add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
         WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 endif()
 
@@ -108,8 +96,7 @@ add_test(NAME test_RecurrentGradientMachine
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 
 add_unittest_without_exec(test_NetworkCompare
-    test_NetworkCompare.cpp
-    TestUtil.cpp)
+    test_NetworkCompare.cpp)
 if(WITH_GPU)
     add_test(NAME test_NetworkCompare
         COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 57c176810fddf96828c210807673b7d1a3c739c0..7617af10ba719490d1b33dd297b070cd8c7c292c 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -24,7 +24,7 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
   if (weights) {
     outArgs[0].value->dotMul(*outArgs[0].value, *weights);
   }
-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
 }
 
 real getDiffAndPrint(real newCost1,
@@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer,
 
     std::vector<Argument> args;
     args.push_back(out);
-    EXPECT_EQ(0, Argument::sumCosts(args)) << "testBatchState failed";
+    EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed";
     for (size_t seqId = 0; seqId < numSequences; ++seqId) {
       start[seqId] += seqLens[seqId];
     }
@@ -310,7 +310,7 @@ void initDataLayer(TestConfig testConf,
         testConf.inputDefs[i].labelSeqStartPositions;
     if (labelSeqStartPositions.size() != 0) {
       CHECK(!sequenceStartPositions);
-      CHECK_GE(labelSeqStartPositions.size(), 2);
+      CHECK_GE(labelSeqStartPositions.size(), 2UL);
 
       sequenceStartPositions =
           ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
@@ -672,7 +672,7 @@ void testLayerGradKernel(TestConfig testConf,
     outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights);
   }
 
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
   LOG(INFO) << " cost " << cost;
   EXPECT_FALSE(std::isnan(cost));
 
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 4e88ac0e81ef2596f14995be53f7c5c20ddba2d7..9f68eb64d0b4ad27306d3b20387d74a7e438d910 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/trainer/Trainer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 using namespace std;  // NOLINT
 
 namespace paddle {
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index 7d7e68da5c5a9dbcba024002a988f26f7613b724..b201ba8a5a4146ab28cd96454f434f889d72a968 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 7f5fcb670b70aed9f0a04180d344556a0390122f..d07299bfe3c4147742384a45dc6f1698d9c382f4 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -114,8 +114,8 @@ TEST(Layer, batchNorm) {
   bnLayer->forward(PASS_GC);
   convLayer->forward(PASS_GC);
 
-  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
-  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
+  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100UL);
+  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576UL);
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index dd3378304b433c135881310eb89273b6bf492af2..40bb1e2d73c81280a9b12114c13de851285c276b 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index ad99b50245cf56eb7db227fa582f6e3f41b47a7a..207fc0566fcf4a0d2e971f3c169a14a64146155b 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index e07066dad84aa6326c2447fc5ee80fa496735fbf..4f5fdbb37ce024e18b8d39c5dda74c69bf82166a 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <vector>
 #include "ModelConfig.pb.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
@@ -110,6 +110,18 @@ void testEvaluator(TestConfig testConf,
   testEvaluator->finish();
   LOG(INFO) << *testEvaluator;
 
+  std::vector<std::string> names;
+  testEvaluator->getNames(&names);
+  paddle::Error err;
+  for (auto& name : names) {
+    auto value = testEvaluator->getValue(name, &err);
+    ASSERT_TRUE(err.isOK());
+    LOG(INFO) << name << " " << value;
+    auto tp = testEvaluator->getType(name, &err);
+    ASSERT_TRUE(err.isOK());
+    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
+  }
+
   double totalScore2 = 0.0;
   if (testConf.testAccumulate) {
     testEvaluator->start();
@@ -129,6 +141,7 @@ void testEvaluatorAll(TestConfig testConf,
 TEST(Evaluator, classification_error) {
   TestConfig config;
   config.evaluatorConfig.set_type("classification_error");
+  config.evaluatorConfig.set_top_k(5);
 
   config.inputDefs.push_back({INPUT_DATA, "output", 50});
   config.inputDefs.push_back({INPUT_LABEL, "label", 50});
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 2cc25f6b211e367fc82c07c30082c3e12c04e53d..14d9db52470b2828186eca04d303135910489266 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -310,7 +310,11 @@ TEST(Layer, CTCLayer) {
   config.layerConfig.add_inputs();
 
   for (auto useGpu : {false, true}) {
-    testLayerGrad(config, "ctc", 100, /* trans */ false, /* useGpu */ useGpu);
+    testLayerGrad(config,
+                  "ctc",
+                  100,
+                  /* trans */ false, /* useGpu */
+                  useGpu);
   }
 }
 
@@ -587,7 +591,11 @@ TEST(Layer, hsigmoidLayer) {
   config.layerConfig.add_inputs();
 
   // Not support GPU now
-  testLayerGrad(config, "hsigmoid", 100, /* trans */ false, /* useGpu */ false);
+  testLayerGrad(config,
+                "hsigmoid",
+                100,
+                /* trans */ false, /* useGpu */
+                false);
 }
 
 TEST(Layer, multi_cross) {
@@ -1022,8 +1030,12 @@ void testNormLayer(const string& normType, bool trans, bool useGpu) {
 }
 
 TEST(Layer, NormLayer) {
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ true);
-  testNormLayer("cmrnorm-projection", /* trans= */ false, /* useGpu= */ false);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                true);
+  testNormLayer("cmrnorm-projection",
+                /* trans= */ false, /* useGpu= */
+                false);
 }
 
 void setPoolConfig(TestConfig* config,
@@ -1304,6 +1316,25 @@ TEST(Layer, ResizeLayer) {
   }
 }
 
+TEST(Layer, RotateLayer) {  // drives testLayerGrad for layer type "rotate"
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("rotate");
+  const int CHANNEL = 2;
+  const int HEIGHT = 8;
+  const int WIDTH = 4;
+  const int INPUT_SIZE = HEIGHT * WIDTH * CHANNEL;  // layer size == input size
+  config.layerConfig.set_size(INPUT_SIZE);
+  config.layerConfig.set_height(HEIGHT);
+  config.layerConfig.set_width(WIDTH);
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", INPUT_SIZE, 0});
+  config.layerConfig.add_inputs();
+
+  for (auto useGpu : {false, true}) {  // one pass per useGpu value
+    testLayerGrad(config, "rotate", 100, false, useGpu);  // 4th arg: trans
+  }
+}
+
 TEST(Layer, NCELayer) {
   TestConfig config;
   size_t numClasses = 4;
@@ -1563,6 +1594,35 @@ TEST(Layer, MultiplexLayer) {
   }
 }
 
+TEST(Layer, PadLayer) {  // drives testLayerGrad for layer type "pad"
+  TestConfig config;
+  config.biasSize = 0;
+  config.layerConfig.set_type("pad");
+
+  int c = 4;
+  int h = 31;
+  int w = 36;
+  size_t size = c * h * w;  // int product, converted to size_t afterwards
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  PadConfig* pad = input->mutable_pad_conf();
+  ImageConfig* image = pad->mutable_image_conf();
+
+  image->set_channels(c);
+  image->set_img_size(h);    // NOTE(review): h -> img_size, w -> img_size_y;
+  image->set_img_size_y(w);  // confirm which field is meant to hold height
+  pad->add_pad_c(1);  // each of pad_c / pad_h / pad_w receives two entries;
+  pad->add_pad_c(2);  // presumably (before, after) amounts -- TODO confirm
+  pad->add_pad_h(2);
+  pad->add_pad_h(3);
+  pad->add_pad_w(3);
+  pad->add_pad_w(5);
+
+  for (auto useGpu : {false, true}) {  // one pass per useGpu value
+    testLayerGrad(config, "pad", 10, false, useGpu);  // 4th arg: trans
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index 330adee8f77f495dab6a13190aaca6a3a5f86b2c..f046cb0b289c9ce22b98f3200bf0a3f7d48d77f5 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 0d261059555c971cd509e64802d6c70abc9d2fef..4db30f37a5bc92d4348caed0aebdd8a589b55712 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <algorithm>
 #include <cstdlib>
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/Stat.h"
 
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index a6d6a242696633e66a05bf9fc9eee81a468ed056..ae0e3bc3d24c54eb84c7b5f5053e629607ef4310 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <vector>
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index d421b6e2f2536e266883508ff29cbec731c9d7e3..e11bf402c27898b8fdbd3fceeb8aeff8906352db 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/ProtoDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;  // NOLINT
 
@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
     }        // end for (int numIdSlots : numSlotsArray)
   }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index 0f264ecf91837f6681f0577b93be7e35be268c04..db883543c306c1938eb9da188ce20ed768018efb 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/PyDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;     // NOLINT
 using namespace paddle;  // NOLINT
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 5f8bc5ecd0f77efc6dcda0330f124ca6cab7f277..7e193eb31a03e6a6b8b0b02e89608a0e02b9e248 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -293,7 +293,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
   while (true) {
     int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
-      CHECK_LE(realBatchSize, batchSize);
+      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
     } else {
       break;
     }
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index f91c788863b6963df92b735dbfef2bacee1fff45..16ab0e6aecb6a895b20389992a44dc542eb3b00a 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/Layer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 0a4a814d5247410248f7418e1ef2c79a2da42507..55427e2f12fd7b77c6eea1f65b3229e6fd29d71d 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/WarpCTCLayer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -148,11 +148,11 @@ LayerPtr createCTCLayer(string name,
 
   ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");
 
-  softmaxActivation->forward(dataLayer->getOutput());
+  softmaxActivation->forward(dataLayer->getOutput()).check();
   layer->forward(PASS_GC);
 
   layer->backward();
-  softmaxActivation->backward(dataLayer->getOutput());
+  softmaxActivation->backward(dataLayer->getOutput()).check();
 
   return layer;
 }
@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 8f9bc9e823eb8062535920361899ce3cc06ec3a7..8691c87ac3b88499a9676d59af533e0f4713dfc3 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "TensorExpression.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index 82a482f701481267e564c7ad8179689deb65a75b..bf62229c03bb1d6e2bdf86d8c56a8157938fb832 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -372,7 +372,7 @@ MatrixPtr CpuSparseMatrix::subMatrix(size_t startRow, size_t numRows) {
 }
 
 /* mem MUST be alloced outside (memAlloc=false) */
-void CpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   CHECK(!memAlloc);
   CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(matTrans.get());
   if (format_ == SPARSE_CSR) {
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index d3e8871cb5b320ce420d601bde7f18d85398dde7..860cad1047fc343b13efa901186ea218d0855151 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -201,7 +201,7 @@ public:
   void zeroMem();
 
   /// mem MUST be alloced outside (memAlloc=false)
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
 
   void mul(const Matrix& A, const Matrix& B, real alpha, real beta);
 
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 50d2e3eb671028c8169321fcd85fe25735c11a14..07450bfb0ef709840f7e8253e87c227276529a2a 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -274,6 +274,18 @@ real GpuMatrix::getSum() {
   return sum;
 }
 
+real GpuMatrix::getMin() {  // delegates to GpuVector::getMin() over data_
+  CHECK(isContiguous());  // the buffer is treated as one flat run below
+  auto vec = GpuVector(height_ * width_, data_);  // height_ * width_ entries
+  return vec.getMin();
+}
+
+real GpuMatrix::getMax() {  // delegates to GpuVector::getMax() over data_
+  CHECK(isContiguous());  // the buffer is treated as one flat run below
+  auto vec = GpuVector(height_ * width_, data_);  // height_ * width_ entries
+  return vec.getMax();
+}
+
 void GpuMatrix::accumulateColSum(Matrix& src) {
   CHECK_EQ(getWidth(), src.getWidth());
   CHECK_EQ(getHeight(), (size_t)1);
@@ -371,11 +383,13 @@ MatrixPtr GpuMatrix::getTranspose() {
   }
 }
 
-void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   if (memAlloc) {
     matTrans = std::make_shared<GpuMatrix>(width_, height_);
   } else {
     CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
   }
   real* dataTrans = matTrans->getData();
   real* data = getData();
@@ -385,13 +399,27 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
 }
 
+void GpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {  // rebind matRot to a fresh width_ x height_ GpuMatrix
+    matRot = std::make_shared<GpuMatrix>(width_, height_);
+  } else {  // caller-supplied output must already be width_ x height_
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+
+  real* dataRot = matRot->getData();  // NOTE(review): only height_/width_ (no
+  real* data = getData();  // stride) reach the kernel; confirm contiguity
+  hl_matrix_rotate(data, dataRot, height_, width_, clockWise);
+}
+
 MatrixPtr GpuMatrix::getInverse() {
   MatrixPtr matInv;
   inverse(matInv, true);
   return matInv;
 }
 
-void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void GpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(height_, width_);
 
   if (memAlloc) {
@@ -704,6 +732,7 @@ void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   size_t beam = maxVal.getWidth();
   CHECK_EQ(maxIds.getSize(), numSamples * beam);
   CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
 
   hl_matrix_top_k(maxVal.getData(),
                   maxVal.getStride(),
@@ -764,19 +793,32 @@ void GpuMatrix::maxoutBackward(Matrix& a,
 }
 
 /*calulate the error of classification */
-void GpuMatrix::classificationError(Matrix& output, IVector& label) {
-  auto output_ptr = dynamic_cast<const GpuMatrix*>(&output);
-  auto label_ptr = dynamic_cast<const GpuIVector*>(&label);
-  CHECK(output_ptr && label_ptr) << "Invalid argument pointer";
-
-  CHECK(height_ == output_ptr->height_ && width_ == 1)
+void GpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {  // k of the top-k
+  auto gpuOutput = dynamic_cast<GpuMatrix*>(&output);
+  auto gpuLabel = dynamic_cast<GpuIVector*>(&label);
+  size_t numSamples = this->getHeight();
+  GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize);
+  GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize);
+  // NOTE(review): make_shared cannot return null; 2nd CHECK never fires.
+  CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer";
+  CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed";
+  CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1)
       << "Matrix dimensions are not equal";
 
-  hl_matrix_classification_error((real*)output_ptr->data_,
-                                 (int*)label_ptr->getData(),
-                                 data_,
-                                 height_,
-                                 output_ptr->width_);
+  size_t dim = gpuOutput->getWidth();  // column count of output
+  hl_matrix_classification_error(gpuTopVal->getData(),
+                                 gpuTopVal->getStride(),
+                                 gpuTopIds->getData(),
+                                 gpuOutput->getData(),
+                                 gpuOutput->getStride(),
+                                 dim,
+                                 topkSize,
+                                 numSamples,
+                                 gpuLabel->getData(),
+                                 this->getData());  // presumably the result
 }
 
 /* copy -log(output[i * width + label]) to this->data[i] */
@@ -913,59 +955,6 @@ void GpuMatrix::softreluDerivative(Matrix& output) {
 void GpuMatrix::scaledTanh(Matrix& output, real p1, real p2) {
   BaseMatrix::scaledTanh(output, p1, p2);
 }
-void GpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
-  CHECK(output1.useGpu_ == true && output2.useGpu_ == true)
-      << "Matrix type are not equal";
-  size_t numSamples = getHeight();
-  size_t dim = output1.getWidth();
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output1.getHeight(), numSamples);
-  CHECK_EQ(output1.getWidth(), output2.getWidth());
-  real* out = getData();
-  real* x = output1.getData();
-  real* y = output2.getData();
-  hl_cossim(out, x, y, dim, output1.getHeight(), output2.getHeight(), scale);
-}
-void GpuMatrix::cosSimDerivative(Matrix& output,
-                                 Matrix& prevOut1,
-                                 Matrix& prevOut2,
-                                 Matrix& prevGrad1,
-                                 Matrix& prevGrad2,
-                                 real scale) {
-  CHECK(output.useGpu_ == true && prevOut1.useGpu_ == true &&
-        prevOut2.useGpu_ == true && prevGrad1.useGpu_ == true &&
-        prevGrad2.useGpu_ == true)
-      << "Matrix type are not equal";
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(prevOut1.getHeight(), numSamples);
-  CHECK_EQ(prevGrad1.getHeight(), numSamples);
-
-  size_t dim = prevOut1.getWidth();
-  CHECK_EQ(prevOut2.getWidth(), dim);
-  CHECK_EQ(prevGrad1.getWidth(), dim);
-  CHECK_EQ(prevGrad2.getWidth(), dim);
-
-  real* grad = getData();
-  real* out = output.getData();
-  real* prevOutX = prevOut1.getData();
-  real* prevOutY = prevOut2.getData();
-  real* prevGradX = prevGrad1.getData();
-  real* prevGradY = prevGrad2.getData();
-  hl_cossim_derivative(grad,
-                       out,
-                       prevOutX,
-                       prevOutY,
-                       prevGradX,
-                       prevGradY,
-                       dim,
-                       prevOut1.getHeight(),
-                       prevOut2.getHeight(),
-                       scale);
-}
 
 void GpuMatrix::randomizeUniform() {
   CHECK(isContiguous());
@@ -1304,68 +1293,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
 }
 
-void GpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input.getWidth() * contextLength);
-
-  hl_context_projection_forward(input.getData(),
-                                sequence.getData(),
-                                isPadding ? weight->getData() : NULL,
-                                getData(),
-                                sequence.getSize() - 1,
-                                input.getWidth(),
-                                contextLength,
-                                contextStart,
-                                beginPad,
-                                isPadding);
-}
-
-void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
-                                              const IVector& sequence,
-                                              int contextLength,
-                                              int contextStart) {
-  CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
-
-  hl_context_projection_backward_data(getData(),
-                                      sequence.getData(),
-                                      inputGrad.getData(),
-                                      sequence.getSize() - 1,
-                                      inputGrad.getWidth(),
-                                      contextLength,
-                                      contextStart);
-}
-
-void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
-                                                const IVector& sequence,
-                                                int contextLength,
-                                                int contextStart,
-                                                int totalPad,
-                                                size_t beginPad) {
-  CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
-
-  hl_context_projection_backward_weight(getData(),
-                                        sequence.getData(),
-                                        weightGrad.getData(),
-                                        sequence.getSize() - 1,
-                                        weightGrad.getWidth(),
-                                        totalPad,
-                                        contextLength,
-                                        contextStart,
-                                        beginPad);
-}
-
 void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   CHECK(data.useGpu_ == true && W.useGpu_ == true)
       << "Matrix type are not equal";
@@ -1373,7 +1300,9 @@ void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   real* output = getData();
   hl_param_relu_forward(output, input, w, numElements, numSamples, partial_sum);
 }
@@ -1386,7 +1315,9 @@ void GpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
   real* wgrad = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   hl_param_relu_backward_w(
       wgrad, ograd, input, numElements, numSamples, partial_sum);
 }
@@ -1398,7 +1329,9 @@ void GpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   hl_param_relu_backward_diff(
       ograd, input, w, diff, numElements, numSamples, partial_sum);
 }
@@ -1746,11 +1679,13 @@ MatrixPtr CpuMatrix::getTranspose() {
   }
 }
 
-void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void CpuMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   if (memAlloc) {
     matTrans = std::make_shared<CpuMatrix>(width_, height_);
   } else {
     CHECK(matTrans != NULL);
+    CHECK_EQ(matTrans->getHeight(), width_);
+    CHECK_EQ(matTrans->getWidth(), height_);
   }
   real* dataTrans = matTrans->getData();
   real* data = getData();
@@ -1764,13 +1699,35 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
   }
 }
 
+void CpuMatrix::rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
+  if (memAlloc) {  // rebind matRot to a fresh width_ x height_ CpuMatrix
+    matRot = std::make_shared<CpuMatrix>(width_, height_);
+  } else {  // caller-supplied output must already be width_ x height_
+    CHECK(matRot != NULL);
+    CHECK_EQ(matRot->getHeight(), width_);
+    CHECK_EQ(matRot->getWidth(), height_);
+  }
+  real* dataRot = matRot->getData();  // indexed below with row pitch height_
+  real* data = getData();             // indexed below with row pitch width_
+
+  for (size_t i = 0; i < height_; i++) {
+    for (size_t j = 0; j < width_; j++) {
+      if (clockWise) {  // dst(j, i) = src(height_ - 1 - i, j)
+        dataRot[j * height_ + i] = data[(height_ - i - 1) * width_ + j];
+      } else {  // dst(j, i) = src(i, width_ - 1 - j)
+        dataRot[j * height_ + i] = data[i * width_ + (width_ - j - 1)];
+      }
+    }
+  }
+}
+
 MatrixPtr CpuMatrix::getInverse() {
   MatrixPtr matInv;
   inverse(matInv, true);
   return matInv;
 }
 
-void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) {
   CHECK_EQ(height_, width_);
 
   if (memAlloc) {
@@ -2203,113 +2160,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   }
 }
 
-void CpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
-  auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
-  CHECK(input_ptr && seq_ptr);
-  if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
-
-  const int* starts = seq_ptr->getData();
-  size_t numSequences = seq_ptr->getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i], padSize);
-        if (isPadding) {
-          MatrixPtr sub = weight->subMatrix(j, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-        if (isPadding) {
-          MatrixPtr sub =
-              weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      dst->addAtOffset(*src, j * input_ptr->getWidth());
-    }
-  }
-}
-
-void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
-                                          Matrix* weightGrad,
-                                          const IVector& sequence,
-                                          int contextLength,
-                                          int contextStart,
-                                          size_t beginPad,
-                                          bool isPadding) {
-  if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
-  if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-
-  int64_t inputDim = inputGrad ? inputGrad->getWidth()
-                               : weightGrad ? weightGrad->getWidth() : 0;
-  CHECK_EQ(getWidth(), inputDim * contextLength);
-
-  const int* starts = sequence.getData();
-  size_t numSequences = sequence.getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i], padSize);
-          MatrixPtr sub = weightGrad->subMatrix(j, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-          MatrixPtr sub = weightGrad->subMatrix(
-              beginPad + contextStart + j - padSize, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!inputGrad) continue;
-      MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      src->addAtOffset(*dst, j * inputDim);
-    }
-  }
-}
-
 inline void vecAddTo(real* a, const real* b, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
     a[i] += b[i];
@@ -3203,7 +3053,7 @@ void CpuMatrix::rowMax(Matrix& max) {
   max.maxRows(*this);
 }
 
-/* get beam size of max ids and values */
+/* Get the top k elements of each row of this matrix */
 void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   CHECK(isContiguous());
   CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
@@ -3211,6 +3061,7 @@ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) {
   size_t beam = maxVal.getWidth();
   CHECK_EQ(maxIds.getSize(), numSamples * beam);
   CHECK_EQ(maxVal.getHeight(), numSamples);
+  CHECK_EQ(maxVal.getWidth(), beam);
 
   real* a = getData();
   int* s = maxIds.getData();
@@ -3362,32 +3213,39 @@ void CpuMatrix::rowNormalizeL1(Matrix& out) {
 }
 
 /* calulate classification error */
-void CpuMatrix::classificationError(Matrix& output, IVector& label) {
-  CHECK(dynamic_cast<const CpuMatrix*>(&output));
-  CHECK(dynamic_cast<const CpuIVector*>(&label));
+void CpuMatrix::classificationError(Matrix& output,
+                                    IVector& label,
+                                    size_t topkSize) {  // k of the top-k
+  size_t numSamples = this->getHeight();
+  auto cpuOutput = dynamic_cast<CpuMatrix*>(&output);
+  auto cpuLabel = dynamic_cast<CpuIVector*>(&label);
+  IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize);
+  MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize);
+  // NOTE(review): make_shared cannot return null; 2nd CHECK never fires.
+  CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer";
+  CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed";
+  CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal";
+  CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1)
+      << "Matrix dimensions are not equal";
 
-  CHECK_EQ(getWidth(), (size_t)1);
-  size_t numSamples = getHeight();
-  CHECK_EQ(label.getSize(), numSamples);
-  CHECK_EQ(output.getHeight(), numSamples);
+  // Collect per-row top-k ids/values of output (k = cpuTopVal's width).
+  cpuOutput->rowMax(*cpuTopIds, *cpuTopVal);
 
-  size_t dim = output.getWidth();
-  real* out = output.getData();
-  int* lbl = label.getData();
-  real maxData = 0.0;
-  int maxIndex = -1;
+  size_t dim = cpuOutput->getWidth();  // upper bound for valid labels
+  real* result = this->getData();
+  int* ids = cpuTopIds->getData();  // topkSize consecutive ids per sample
+  int* lbl = cpuLabel->getData();
   for (size_t i = 0; i < numSamples; ++i) {
     CHECK_GE(lbl[i], 0);
     CHECK_LT((size_t)lbl[i], dim);
-    maxData = out[i * dim];
-    maxIndex = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      if (maxData < out[i * dim + j]) {
-        maxIndex = j;
-        maxData = out[i * dim + j];
+    // result[i] ends as 0 if lbl[i] is among the row's ids, otherwise 1.
+    for (size_t j = 0; j < topkSize; ++j) {  // no write if topkSize == 0
+      if (ids[j + i * topkSize] == lbl[i]) {
+        result[i] = 0;
+        break;
       }
+      result[i] = 1.0f;  // provisional miss; overwritten by a later match
     }
-    getData()[i] = (maxIndex != lbl[i]);
   }
 }
 
@@ -3581,105 +3439,6 @@ void CpuMatrix::softmaxDerivative(Matrix& output, Matrix& sftmaxSum) {
   }
 }
 
-void CpuMatrix::cosSim(Matrix& output1, Matrix& output2, real scale) {
-  size_t numSamples = getHeight();
-  size_t dim = output1.getWidth();
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output1.getHeight(), numSamples);
-  CHECK_EQ(output1.getWidth(), output2.getWidth());
-
-  real* out = getData();
-  const real* x = output1.getData();
-  const real* y = output2.getData();
-  size_t yInc = dim;
-  if (output2.getHeight() == 1LU) {
-    yInc = 0;
-  } else {
-    CHECK_EQ(output2.getHeight(), numSamples);
-  }
-  for (size_t i = 0; i < numSamples; ++i, x += dim, y += yInc) {
-    real squareSumX = 0;
-    real squareSumY = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      squareSumX += _square(x[j]);
-      squareSumY += _square(y[j]);
-      xy += x[j] * y[j];
-    }
-    CHECK(squareSumX > 0 && squareSumY > 0);
-    out[i] = scale * xy / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
-  }
-}
-
-void CpuMatrix::cosSimDerivative(Matrix& output,
-                                 Matrix& prevOut1,
-                                 Matrix& prevOut2,
-                                 Matrix& prevGrad1,
-                                 Matrix& prevGrad2,
-                                 real scale) {
-  CHECK(output.useGpu_ == false) << "Matrix type are not equal";
-
-  CHECK_EQ(getWidth(), 1UL);
-  CHECK_EQ(output.getWidth(), 1UL);
-
-  size_t numSamples = getHeight();
-  CHECK_EQ(output.getHeight(), numSamples);
-  CHECK_EQ(prevOut1.getHeight(), numSamples);
-  CHECK_EQ(prevGrad1.getHeight(), numSamples);
-
-  size_t dim = prevOut1.getWidth();
-  CHECK_EQ(prevOut2.getWidth(), dim);
-  CHECK_EQ(prevGrad1.getWidth(), dim);
-  CHECK_EQ(prevGrad2.getWidth(), dim);
-
-  const real* grad = getData();
-  const real* out = output.getData();
-  const real* prevOutX = prevOut1.getData();
-  const real* prevOutY = prevOut2.getData();
-  real* prevGradX = prevGrad1.getData();
-  real* prevGradY = prevGrad2.getData();
-  size_t yInc = dim;
-  if (prevOut2.getHeight() == 1LU) {
-    yInc = 0;
-    CHECK_EQ(prevGrad2.getHeight(), 1LU);
-  } else {
-    CHECK_EQ(prevOut2.getHeight(), numSamples);
-    CHECK_EQ(prevGrad2.getHeight(), numSamples);
-  }
-  for (size_t i = 0; i < numSamples; ++i,
-              prevOutX += dim,
-              prevOutY += yInc,
-              prevGradX += dim,
-              prevGradY += yInc) {
-    real squareSumX = 0;
-    real squareSumY = 0;
-    real xy = 0;
-    for (size_t j = 0; j < dim; ++j) {
-      squareSumX += _square(prevOutX[j]);
-      squareSumY += _square(prevOutY[j]);
-      xy += prevOutX[j] * prevOutY[j];
-    }
-    CHECK(squareSumX > 0 && squareSumY > 0);
-    if (xy == 0) {
-      real reciprocal = 1.0f / (std::sqrt(squareSumX) * std::sqrt(squareSumY));
-      for (size_t j = 0; j < dim; ++j) {
-        prevGradX[j] += scale * grad[i] * prevOutY[j] * reciprocal;
-        prevGradY[j] += scale * grad[i] * prevOutX[j] * reciprocal;
-      }
-    } else {
-      real reciprocalXY = 1.0f / xy;
-      real reciprocalSquareSumX = 1.0f / squareSumX;
-      real reciprocalSquareSumY = 1.0f / squareSumY;
-      for (size_t j = 0; j < dim; ++j) {
-        prevGradX[j] += out[i] * grad[i] * (prevOutY[j] * reciprocalXY -
-                                            prevOutX[j] * reciprocalSquareSumX);
-        prevGradY[j] += out[i] * grad[i] * (prevOutX[j] * reciprocalXY -
-                                            prevOutY[j] * reciprocalSquareSumY);
-      }
-    }
-  }
-}
-
 void CpuMatrix::sumOfSquares(Matrix& output, Matrix& label) {
   CHECK(output.useGpu_ == false && label.useGpu_ == false)
       << "Matrix type are not equal";
@@ -3933,7 +3692,9 @@ void CpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       data_[k] = input[k] > 0 ? input[k] : input[k] * w[i / partial_sum];
@@ -3947,7 +3708,9 @@ void CpuMatrix::paramReluBackwardW(Matrix& oGrad, Matrix& data) {
   real* wgrad = data_;
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (this->getHeight() * this->getWidth());
+  size_t paraSize = this->getHeight() * this->getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       wgrad[i / partial_sum] += ograd[k] * (input[k] > 0 ? 0 : input[k]);
@@ -3962,7 +3725,9 @@ void CpuMatrix::paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
   real* w = W.getData();
   size_t numElements = data.getWidth();
   size_t numSamples = data.getHeight();
-  size_t partial_sum = numElements / (W.getHeight() * W.getWidth());
+  size_t paraSize = W.getHeight() * W.getWidth();
+  CHECK(!(numElements % paraSize));  // this check from ParameterReluLayer::init
+  size_t partial_sum = numElements / paraSize;
   for (size_t n = 0, k = 0; n < numSamples; ++n) {
     for (size_t i = 0; i < numElements; ++i, ++k) {
       diff[k] += ograd[k] * (input[k] > 0 ? 1 : w[i / partial_sum]);
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index bda863de38675fe481544a7e82b69f445df361bd..d0ba2e93feabfcc11ac1d261bc40c9c6973a8c29 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -26,11 +26,12 @@ limitations under the License. */
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
 #include "Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
+/// TODO(tianbing), move to paddle/function/TensorType.h
 enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
 
 /**
@@ -56,6 +57,7 @@ enum SparseValueType { NO_VALUE = 0, FLOAT_VALUE = 1 };
  *            value [1, 1, 2, 2, 5]
  * @endcode
  */
+/// TODO(tianbing), move to paddle/function/TensorType.h
 enum SparseFormat { SPARSE_CSR = 0, SPARSE_CSC = 1 };
 
 class Matrix;
@@ -370,7 +372,27 @@ public:
    * allocate matTrans' memory outside, then set memAlloc as false;
    * else set as true.
    */
-  virtual void transpose(MatrixPtr matTrans, bool memAlloc) {
+  virtual void transpose(MatrixPtr& matTrans, bool memAlloc) {
+    LOG(FATAL) << "Not implemented";
+  }
+
+  /**
+   * @brief  rotate 90 degrees clockwise if clockWise=true;
+   *         otherwise rotate 90 degrees anti-clockwise
+   * clock-wise:
+   * \f[
+   *   y(j,i) = x(M-i-1,j)
+   * \f]
+   * anti clock-wise:
+   * \f[
+   *   y(j,i) = x(i, N-1-j)
+   * \f]
+   * where \f$x\f$ is (M x N) input, and \f$y\f$ is (N x M) output.
+   *
+   * If matRot's memory is allocated outside, set memAlloc to false;
+   * otherwise set it to true.
+   */
+  virtual void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -385,7 +407,7 @@ public:
    * if allocate matInv's memory outside, then set memAlloc as false;
    * else set as true.
    */
-  virtual void inverse(MatrixPtr matInv, bool memAlloc) {
+  virtual void inverse(MatrixPtr& matInv, bool memAlloc) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -777,26 +799,6 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  /**
-   * cosine similarity, for each row i,
-   *   this[i] = cos(output1[i], output2[i])
-   *
-   * output2 can only have one row, then for each row i,
-   *   this[i] = cos(output1[i], output2[0])
-   */
-  virtual void cosSim(Matrix& output1, Matrix& output2, real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
-  virtual void cosSimDerivative(Matrix& output,
-                                Matrix& prevOut1,
-                                Matrix& prevOut2,
-                                Matrix& prevGrad1,
-                                Matrix& prevGrad2,
-                                real scale = 1.0f) {
-    LOG(FATAL) << "Not implemented";
-  }
-
   /// print out the values of elements to os
   virtual void print(std::ostream& os) const {
     LOG(FATAL) << "Not implemented";
@@ -834,8 +836,11 @@ public:
    * output[i] = 1 if row i is an error.
    *
    * output[i] = 0 if row i is correct.
+   *
    */
-  virtual void classificationError(Matrix& output, IVector& label) {
+  virtual void classificationError(Matrix& output,
+                                   IVector& label,
+                                   size_t topkSize = 1) {
     LOG(FATAL) << "Not implemented";
   }
 
@@ -972,42 +977,6 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  virtual void contextProjectionForward(Matrix& input,
-                                        Matrix* weight,
-                                        const IVector& sequence,
-                                        int contextLength,
-                                        int contextStart,
-                                        size_t beginPad,
-                                        bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackward(Matrix* inputGrad,
-                                         Matrix* weightGrad,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardData(Matrix& inputGrad,
-                                             const IVector& sequence,
-                                             int contextLength,
-                                             int contextStart) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                               const IVector& sequence,
-                                               int contextLength,
-                                               int contextStart,
-                                               int totalPad,
-                                               size_t beginPad) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * @code
    * this.row[i] += table.row[ids[i]]
@@ -1127,6 +1096,10 @@ public:
       TensorCpuApply<real>(*this, expr);
     }
   }
+
+  bool isEmpty() const { return data_ == nullptr; }
+
+  explicit operator bool() const { return !isEmpty(); }
 };
 
 inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
@@ -1199,11 +1172,15 @@ public:
   void accumulateColSum(Matrix& src);
   real getAbsSum();
 
+  real getMin();
+  real getMax();
+
   MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
 
   MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
 
   /// add b to each sample of this.
   void addBias(Matrix& b, real scale);
@@ -1330,14 +1307,6 @@ public:
   void softreluDerivative(Matrix& output);
   void scaledTanh(Matrix& output, real p1, real p2);
 
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
   virtual void print(std::ostream& os) const;
   virtual void print(std::ostream& os, size_t height, size_t width) const;
 
@@ -1348,7 +1317,7 @@ public:
   void check(std::ostream& os, Matrix& refMat, bool printDiff = true);
   void randomizeUniform();
 
-  void classificationError(Matrix& output, IVector& label);
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
   void convExpand(Matrix& feature,
                   int feaImgHeight,
@@ -1442,26 +1411,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackwardData(Matrix& inputGrad,
-                                     const IVector& sequence,
-                                     int contextLength,
-                                     int contextStart);
-
-  void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                       const IVector& sequence,
-                                       int contextLength,
-                                       int contextStart,
-                                       int totalPad,
-                                       size_t beginPad);
-
   void bilinearForward(const Matrix& in,
                        const size_t inImgH,
                        const size_t inImgW,
@@ -1535,10 +1484,11 @@ public:
   real getAbsSum();
 
   MatrixPtr getTranspose();
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
+  void rotate(MatrixPtr& matRot, bool memAlloc, bool clockWise);
 
   MatrixPtr getInverse();
-  void inverse(MatrixPtr matInv, bool memAlloc);
+  void inverse(MatrixPtr& matInv, bool memAlloc);
 
   void copyFrom(const Matrix& src);
 
@@ -1648,22 +1598,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackward(Matrix* inputGrad,
-                                 Matrix* weightGrad,
-                                 const IVector& sequence,
-                                 int contextLength,
-                                 int contextStart,
-                                 size_t beginPad,
-                                 bool isPadding);
-
   real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
   virtual real* getRowBuf(size_t row) { return getRow(row); }
 
@@ -1793,14 +1727,6 @@ public:
   void softreluDerivative(Matrix& output);
   void scaledTanh(Matrix& output, real p1, real p2);
 
-  void cosSim(Matrix& output1, Matrix& output2, real scale);
-  void cosSimDerivative(Matrix& output,
-                        Matrix& prevOut1,
-                        Matrix& prevOut2,
-                        Matrix& prevGrad1,
-                        Matrix& prevGrad2,
-                        real scale);
-
   void print(std::ostream& os) const;
   void print(std::ostream& os, size_t height, size_t width) const;
   void printOneRow(std::ostream& os, size_t idx) const;
@@ -1816,7 +1742,7 @@ public:
 
   void randomizeUniform();
 
-  void classificationError(Matrix& output, IVector& label);
+  void classificationError(Matrix& output, IVector& label, size_t topkSize = 1);
 
   void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
 
diff --git a/paddle/math/RowBuffer.h b/paddle/math/RowBuffer.h
new file mode 100644
index 0000000000000000000000000000000000000000..dbb829c4e24a659e4a97c0a3ba4c5c78b68815d3
--- /dev/null
+++ b/paddle/math/RowBuffer.h
@@ -0,0 +1,141 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <vector>
+#include "MemoryHandle.h"
+#include "paddle/utils/Util.h"
+
+namespace paddle {
+
+/**
+ * @brief The RowBuffer class
+ * Represents the SparseRow Matrix Data.
+ *
+ * If no memory handle is set, the buffer can grow automatically.
+ */
+class RowBuffer {
+public:
+  /**
+   * @brief RowBuffer creates an auto-growth row buffer. Row length is width.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  explicit RowBuffer(size_t width) : width_(width) {}
+
+  /**
+   * @brief RowBuffer creates a fixed-size row buffer on pre-allocated memory.
+   * @param mem the pre-allocated memory; a null handle gives auto-growth.
+   * @param width the length of each row, a.k.a matrix width.
+   */
+  RowBuffer(const CpuMemHandlePtr& mem, size_t width)
+      : preallocatedBuf_(mem), width_(width) {}
+
+  /**
+   * @brief resize the buffer to rowCnt rows. A pre-allocated buffer is only
+   *        checked to be large enough.
+   * @param rowCnt number of rows (matrix height). Must not be negative.
+   */
+  inline void resize(int rowCnt) {
+    CHECK_GE(rowCnt, 0);  // a negative count would wrap to a huge size_t
+    if (preallocatedBuf_) {
+      CHECK(preallocatedBuf_->getSize() >= rowCnt * width_ * sizeof(real));
+    } else {
+      rowStore_.resize(rowCnt * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index.
+   * @param row the index of row. Must not be negative.
+   * @return row buffer.
+   */
+  inline real* get(int row) const {
+    // row == -1 would pass the size checks below and then index before the
+    // start of the buffer, so reject negative rows first.
+    CHECK_GE(row, 0);
+    if (preallocatedBuf_) {
+      CHECK_LE((row + 1) * width_ * sizeof(real), preallocatedBuf_->getSize());
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf()) + row * width_;
+    } else {
+      CHECK_LE((row + 1) * width_, rowStore_.size());
+      return const_cast<real*>(rowStore_.data() + row * width_);
+    }
+  }
+
+  /**
+   * @brief get a row buffer with row index. If row index is larger than local
+   *        buffer, the size of local buffer will grow.
+   * @param row the index of row. Must not be negative.
+   * @return row buffer.
+   */
+  inline real* getWithAutoGrowth(int row) {
+    if (preallocatedBuf_) {
+      return get(row);
+    } else {
+      CHECK_GE(row, 0);
+      if (rowStore_.size() <= row * width_) {
+        rowStore_.resize((row + 1) * width_);
+      }
+      return rowStore_.data() + row * width_;
+    }
+  }
+
+  /**
+   * @return raw data buffer.
+   */
+  inline real* data() {
+    if (preallocatedBuf_) {
+      return reinterpret_cast<real*>(preallocatedBuf_->getBuf());
+    } else {
+      return rowStore_.data();
+    }
+  }
+
+  /**
+   * @brief clear local buffer. It only affects an auto-growth buffer.
+   */
+  inline void clear() { rowStore_.clear(); }
+
+  /**
+   * @brief get current number of rows.
+   * @return number of rows.
+   */
+  inline size_t getRowCount() const {
+    if (preallocatedBuf_) {
+      return preallocatedBuf_->getSize() / sizeof(real) / width_;
+    } else {
+      return rowStore_.size() / width_;
+    }
+  }
+
+  /**
+   * @brief whether this buffer can grow automatically.
+   * @return true if it can grow automatically.
+   */
+  inline bool isAutoGrowth() const { return !preallocatedBuf_; }
+
+  /**
+   * @brief return the width of matrix. a.k.a length of row.
+   * @return width of matrix
+   */
+  inline size_t getWidth() const { return width_; }
+
+private:
+  //! TODO(yuyang18): Add resize method to CpuMemHandlePtr, then we can get rid
+  //! of std::vector here.
+  CpuMemHandlePtr preallocatedBuf_;
+  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  size_t width_;
+};
+}  // namespace paddle
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index 720a035ecbd26df01fe24c991982bbf7965ccbdc..6370c77386688a334fa0de8b4e2b272882e9e2b0 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -177,7 +177,6 @@ GpuSparseMatrix::GpuSparseMatrix(real* value,
       hl_sparse_matrix_s_ptr tmp2(tmp, hl_destruct_sparse_matrix);
       sMatrix_ = tmp2;
     }
-    LOG(INFO) << "weight to matrix ";
   }
 }
 
@@ -498,7 +497,7 @@ void GpuSparseMatrix::setRow(size_t row,
 
 SparseValueType GpuSparseMatrix::getValueType() const { return valueType_; }
 
-void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
+void GpuSparseMatrix::transpose(MatrixPtr& matTrans, bool memAlloc) {
   CHECK_EQ(format_, SPARSE_CSC);
   int nnz = sMatrix_->nnz;
   if (memAlloc) {
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index 1d3801548e03a6ae679afb15bf7f620172d61c57..f6cd5df338965b55ca17636de097d2401dc057f9 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -109,7 +109,7 @@ public:
   MatrixPtr getTranspose();
 
   /// B = A'
-  void transpose(MatrixPtr matTrans, bool memAlloc);
+  void transpose(MatrixPtr& matTrans, bool memAlloc);
 
   void copyFrom(const Matrix& src);
   void copyFrom(const Matrix& src, hl_stream_t stream);
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index b61c6b2d49ccead5e9cfdf595a8bebae0e5b87b5..b8c781ca1fd46c9840817abe26a20eec005c37e9 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -24,10 +24,6 @@ limitations under the License. */
 #include "paddle/utils/Thread.h"
 #include "paddle/utils/Util.h"
 
-DEFINE_bool(allow_inefficient_sparse_update,
-            false,
-            "Whether to allow inefficient sparse update");
-
 namespace paddle {
 
 const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 778a9bd845661849261b52dcbeb519809d0c6306..1ccbf97b25922ae52377d7048da3a07012d21003 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -18,10 +18,9 @@ limitations under the License. */
 #include <string.h>
 #include <algorithm>
 #include "Matrix.h"
+#include "RowBuffer.h"
 #include "paddle/utils/Util.h"
 
-DECLARE_bool(allow_inefficient_sparse_update);
-
 namespace paddle {
 
 /**
@@ -45,12 +44,9 @@ public:
                      IndexDictPtr indexDictHandle = nullptr,
                      bool trans = false)
       : CpuMatrix(nullptr, height, width, trans),
-        storeMat_(dataHandle,
-                  dataHandle ? dataHandle->getSize() / sizeof(real) / width : 0,
-                  width,
-                  trans),
         indexDictHandle_(indexDictHandle) {
     init(height, width);
+    buf_.reset(new RowBuffer(dataHandle, width));
   }
 
   virtual ~SparseRowCpuMatrix() {}
@@ -71,25 +67,16 @@ public:
    *
    *  @param row row id in local storage
    */
-  real* getLocalRow(size_t row) {
-    if (storeMat_.getData()) return storeMat_.rowBuf(row);
-    if (rowStore_.size() <= row * width_) {
-      rowStore_.resize((row + 1) * width_);
-    }
-    return rowStore_.data() + row * width_;
-  }
+  real* getLocalRow(size_t row) { return buf_->getWithAutoGrowth(row); }
 
   /**
-   *  reserve the storage for rows according to current size of indexDictHandle.
+   *  reserve the storage for rows according to current size of
+   * indexDictHandle.
    *
    *  This is only used when SparseRowCpuMatrix is constructed with
    *  indexDictHandle.
    */
-  void reserveStore() {
-    if (!storeMat_.getData() && !localIndices_->empty()) {
-      rowStore_.resize(localIndices_->size() * width_);
-    }
-  }
+  void reserveStore() { buf_->resize(localIndices_->size()); }
 
   // row is the row id in the original matrix
   virtual real* getRowBuf(size_t row) { return getRow(row); }
@@ -117,7 +104,8 @@ public:
    *
    * If L1 decay set use L1, else if L2 set use L2, otherwise no decay atall.
    *
-   * t0 is a int vector used by L1/L2 decay, size = height of parameter matrix,
+   * t0 is a int vector used by L1/L2 decay, size = height of parameter
+   * matrix,
    * store the time that each weight row last updated.
    *
    * Time is batchId, currentTime is current batchId.
@@ -176,8 +164,7 @@ public:
 protected:
   template <typename Func>
   void apply(Func f) {
-    real* data = storeMat_.getData() ? storeMat_.getData() : rowStore_.data();
-    f(data, localIndices_->size() * width_);
+    f(buf_->data(), localIndices_->size() * width_);
   }
 
   void init(size_t height, size_t width);
@@ -188,25 +175,23 @@ protected:
       globalIndices_[id] = kUnusedId_;
     }
     localIndices_->clear();
-    rowStore_.clear();
+    buf_->clear();
   }
 
   inline void checkStoreSize() {
-    if (storeMat_.getData()) {
-      CHECK_LE(localIndices_->size(), storeMat_.getHeight());
-    } else if (!FLAGS_allow_inefficient_sparse_update) {
-      if (localIndices_->size() > 0.5 * height_) {
-        LOG(WARNING)
-            << "There are more than 0.5*height (" << localIndices_->size()
-            << ") rows are used for sparse "
-            << "update, which is not efficient. Considering not use "
-            << "sparse_update or set --allow_inefficient_sparse_update=true";
+    if (buf_->isAutoGrowth()) {
+      if (localIndices_->size() > 0.5 * height_) {  // rows actually in use
+        LOG(WARNING) << "There are more than 0.5*height ("
+                     << localIndices_->size() << ") rows are used for sparse "
+                     << "update, which is not efficient. Considering not use "
+                     << "sparse_update.";
       }
+    } else {  // preallocated: used rows must fit the fixed capacity
+      CHECK_LE(localIndices_->size(), buf_->getRowCount());
     }
   }
 
-  CpuMatrix storeMat_;
-  std::vector<real, AlignedAllocator<real, 32>> rowStore_;
+  std::unique_ptr<RowBuffer> buf_;
   IndexDictPtr indexDictHandle_;
   std::vector<unsigned int>* localIndices_;  // =&indexDictHandle_->localIndices
   unsigned int* globalIndices_;  // =indexDictHandle_->globalIndices.data();
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index f3d60e400380f7d7d645559318837b0d7706661d..6fd60e7f3c65ea8e31fd1aaaa61b6ad8956ff1cd 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "hl_tensor_ops.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Logging.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index b4347a70f874a2a1bf933bbea4d1b15385f36090..9af6e30c9e13895ad95653a787ec1c1ad77a248f 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Thread.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index a3ea078509704f305672d0b02d272de0f6c97f51..ceb96b2e250d8e04ffb2b1d8c77ad498dca91cf3 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -4,11 +4,11 @@ add_simple_unittest(test_ExecViaCpu)
 add_simple_unittest(test_SIMDFunctions)
 add_simple_unittest(test_TrainingAlgorithm)
 add_simple_unittest(test_SparseMatrix)
+add_simple_unittest(test_RowBuffer)
 
 # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
-    test_matrixCompare.cpp
-    ../../gserver/tests/TestUtil.cpp)
+    test_matrixCompare.cpp)
 
 add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 33e0952efedddec16acf6153209e14f18fd48134..1ca70ea84c867b83013625eaee141f5b75fad4ae 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
   }
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index cc7c1e7eb2734605cb278a4b97cab22bdba1594e..21918b86e1ad98766ceaf09dea3020d6e8592191 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 624fa20ca58bca3f16fa567487bbaa5d9656e1b1..58bc43a38ba9465a832fcd0652e6309c403577e3 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
   checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 27216ddb58eccd7fd52e121e795baf463ea69f51..04c856453d2ec4ad764e37ae430e3e30ac0dea0b 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
   testWrapper(functor);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index 6aa5891bce922c00cbb4f69a511fb3c42d53f319..3836f7fc0fe577c463c9a476d49b21f2967043e5 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -28,10 +28,10 @@ limitations under the License. */
  * so we can add some tricks to prevent exp calculate an excessive value.
  *
  */
-#include <fenv.h>
+
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index d490078d909e7940e83a6f461f9386eeda02f53c..e6b5dba446b5a0022ade76b188895c4e0e2a22b4 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 
 #include <gtest/gtest.h>
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index adb5fbd9fa30d810a25a2eb11f6d57474c1304c7..1c21da5b76e95603258a5006d0c57b00126e65b9 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -181,28 +181,6 @@ TEST(Matrix, copyByRowIndex) {
   }
 }
 
-void testCosSim(int heightX, int heightY, int width, real scale) {
-  AutoCompare test(heightX, 1);
-  CpuMatrix arg1(heightX, width);
-  CpuMatrix arg2(heightY, width);
-  arg1.randomizeUniform();
-  arg2.randomizeUniform();
-  arg2.add(-0.5);
-  test.cmpWithArg(&Matrix::cosSim, arg1, arg2, scale);
-}
-
-TEST(Matrix, cosSim) {
-  for (auto heightX : {10, 100, 1000}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {10, 100, 1000}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSim(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluForward(int height, int width, int w_height, int w_width) {
   AutoCompare test(height, width);
   CpuMatrix arg1(height, width);
@@ -224,10 +202,11 @@ void testParamReluBackwardW(int height, int width, int w_height, int w_width) {
 }
 
 TEST(Matrix, paramRelu) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
       for (auto w_height : {1, 2}) {
         for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
           testParamReluForward(height, width, w_height, w_width);
           testParamReluBackwardW(height, width, w_height, w_width);
         }
@@ -291,10 +270,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8cc4c69a1a4d8afec08bf7fb13408e135a06c09c
--- /dev/null
+++ b/paddle/math/tests/test_RowBuffer.cpp
@@ -0,0 +1,65 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/math/RowBuffer.h"
+
+// Auto-growing RowBuffer: resize(), flat data() access, getWithAutoGrowth().
+TEST(RowBuffer, testAutoGrow) {
+  paddle::RowBuffer buf(128);
+  ASSERT_EQ(128UL, buf.getWidth());
+  ASSERT_TRUE(buf.isAutoGrowth());
+  buf.resize(2);
+  ASSERT_EQ(2UL, buf.getRowCount());
+  // Fill both rows through the flat pointer with their linear index.
+  for (size_t pos = 0; pos < buf.getWidth() * 2; ++pos) buf.data()[pos] = pos;
+  for (size_t row = 0; row < buf.getRowCount(); ++row) {
+    for (size_t col = 0; col < buf.getWidth(); ++col) {
+      ASSERT_NEAR(row * buf.getWidth() + col, buf.get(row)[col], 1e-5);
+    }
+  }
+
+  // Row 2 does not exist yet; the test expects the buffer to grow to 3 rows.
+  auto grown = buf.getWithAutoGrowth(2);
+  for (size_t col = 0; col < buf.getWidth(); ++col) grown[col] = col;
+
+  ASSERT_EQ(3UL, buf.getRowCount());
+  // Rows 0 and 1 must still hold the values written before the growth.
+  for (size_t row = 0; row < buf.getRowCount() - 1; ++row) {
+    for (size_t col = 0; col < buf.getWidth(); ++col) {
+      ASSERT_NEAR(row * buf.getWidth() + col, buf.get(row)[col], 1e-5);
+    }
+  }
+  for (size_t col = 0; col < buf.getWidth(); ++col) {
+    ASSERT_NEAR(col, buf.get(2)[col], 1e-5);
+  }
+}
+
+// RowBuffer built on a caller-provided memory handle: no auto growth.
+TEST(RowBuffer, testWithMemBuf) {
+  paddle::CpuMemHandlePtr handle =
+      std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real));
+  paddle::RowBuffer buf(handle, 128);
+  ASSERT_TRUE(!buf.isAutoGrowth());
+  ASSERT_EQ(2UL, buf.getRowCount());
+  for (size_t pos = 0; pos < buf.getWidth() * 2; ++pos) buf.data()[pos] = pos;
+  for (size_t row = 0; row < buf.getRowCount(); ++row) {
+    for (size_t col = 0; col < buf.getWidth(); ++col) {
+      ASSERT_NEAR(
+          row * buf.getWidth() + col, buf.getWithAutoGrowth(row)[col], 1e-5);
+    }
+  }
+  // Asking the non-growing two-row buffer for row 3 is expected to abort.
+  ASSERT_DEATH_IF_SUPPORTED(buf.getWithAutoGrowth(3), ".*");
+}
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index f62843310d886ba7d449e793066b19a7cc7bd5a9..e8f9b26ff240f9c339404a919c14eb3e3704c1de 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
     ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
   }
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 0949ab7ffba423daedd47876bc055a21c5c3f016..c0572dfdbf738a4dfad04811b3a3e1b65487ff6d 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -248,11 +248,13 @@ TEST(Matrix, SparseMatrixTranspose) {
             /*dense matrix transpose*/
             CpuMatrixPtr matC(new CpuMatrix(height, width));
             matC->copyFrom(*matA);
-            CpuMatrixPtr matD(new CpuMatrix(width, height));
+            MatrixPtr matD(new CpuMatrix(width, height));
             matC->transpose(matD, false);
+
             /*check result*/
             checkSMatrixEqual2Dense(
-                std::dynamic_pointer_cast<CpuSparseMatrix>(matB), matD);
+                std::dynamic_pointer_cast<CpuSparseMatrix>(matB),
+                std::dynamic_pointer_cast<CpuMatrix>(matD));
           }
         }
       }
@@ -561,9 +563,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   checkSMatrixEqual2(matA, matD);
 #endif
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 1859b9fc13576b6f1d0bc13b43f7e7a2ef6030c9..40e38434fa328bba8be6e1b8e509023d615899c1 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 2c458cba9ca11e9af8a98b88a6392978c2a9be77..4a88844b43ef40af988d2b391d2bef4568dea9b7 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
 }
 
 TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 9925e24dc14294ec70806ffd9cc496ea01beaa43..4eb9837909ffaaf0f483ab65ece7a0b29fd49319 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   checkMatrixEqual(cBatchTransMat, cMat_d2h);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 16541edb54b807d4e1690d4ae63fd44459e2d726..786d863a533b58ea9856300aaa0cd8f5a10a4dd9 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
   testMatrixCase(testSgdUpdate<GpuMatrix>);
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index c6fc849ba0328dae62c9da0bd721d86fd8b6881e..08b64c1bb6f5d359a2d2164e723a76c5360168ee 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
@@ -29,148 +29,6 @@ using namespace std;     // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
 
-void testMatrixProjectionForward(int contextStart,
-                                 int contextLength,
-                                 bool padding,
-                                 int batchSize,
-                                 int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeight = nullptr;
-  MatrixPtr gpuWeight = nullptr;
-  if (padding) {
-    cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeight->randomizeUniform();
-    gpuWeight->copyFrom(*cpuWeight);
-  }
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuOutput =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutput =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutput->contextProjectionForward(*cpuInput,
-                                      cpuWeight.get(),
-                                      *cpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  gpuOutput->contextProjectionForward(*gpuInput,
-                                      gpuWeight.get(),
-                                      *gpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-}
-
-void testMatrixProjectionBackward(int contextStart,
-                                  int contextLength,
-                                  bool padding,
-                                  int batchSize,
-                                  int inputDim) {
-  MatrixPtr cpuOutputGrad =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutputGrad =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeightGrad = nullptr;
-  MatrixPtr gpuWeightGrad = nullptr;
-  if (padding) {
-    cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeightGrad->randomizeUniform();
-    gpuWeightGrad->copyFrom(*cpuWeightGrad);
-  }
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(),
-                                           cpuWeightGrad.get(),
-                                           *cpuSequence,
-                                           contextLength,
-                                           contextStart,
-                                           beginPad,
-                                           padding);
-  gpuOutputGrad->contextProjectionBackwardData(
-      *gpuInputGrad, *gpuSequence, contextLength, contextStart);
-  if (padding) {
-    gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad,
-                                                   *gpuSequence,
-                                                   contextLength,
-                                                   contextStart,
-                                                   pad,
-                                                   beginPad);
-  }
-
-  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
-  if (padding) {
-    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
-  }
-}
-
-TEST(Matrix, projection) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto trainablePadding : {false, true}) {
-        for (auto batchSize : {1, 2, 5, 20, 100}) {
-          for (auto inputDim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " trainablePadding=" << trainablePadding
-                    << " batchSize=" << batchSize << " inputDim=" << inputDim;
-            testMatrixProjectionForward(contextStart,
-                                        contextLength,
-                                        trainablePadding,
-                                        batchSize,
-                                        inputDim);
-            testMatrixProjectionBackward(contextStart,
-                                         contextLength,
-                                         trainablePadding,
-                                         batchSize,
-                                         inputDim);
-          }
-        }
-      }
-    }
-  }
-}
-
 void testMatrixMaxSequence(int batchSize, int inputDim) {
   // forward
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -247,6 +105,21 @@ void testMatrixGetSum(int height, int width) {
   EXPECT_LE(fabs(cpuSum - gpuSum), err);
 }
 
+// Checks that CpuMatrix and GpuMatrix report the same getMin()/getMax()
+// for identical random contents of the given shape.
+void testMatrixGetMinMax(int height, int width) {
+  MatrixPtr host = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr device = std::make_shared<GpuMatrix>(height, width);
+  host->randomizeUniform();
+  device->copyFrom(*host);
+
+  const real hostMin = host->getMin(), deviceMin = device->getMin();
+  const real hostMax = host->getMax(), deviceMax = device->getMax();
+
+  EXPECT_EQ(hostMin, deviceMin);
+  EXPECT_EQ(hostMax, deviceMax);
+}
+
 void testMatrixZeroAtOffset(int height, int width) {
   MatrixPtr cpuA = std::make_shared<CpuMatrix>(height, width);
   MatrixPtr gpuA = std::make_shared<GpuMatrix>(height, width);
@@ -303,11 +176,29 @@ void testMatrixTranspose(int height, int width) {
   cpu->randomizeUniform();
   gpu->copyFrom(*cpu);
   cpu->transpose(cpuT, false);
-  gpu->transpose(gpuT, false);
+  gpu->transpose(gpuT, true);
 
   TensorCheckEqual(*cpuT, *gpuT);
 }
 
+// Rotates identical CPU and GPU matrices with the last rotate() flag set
+// to true and then to false, comparing both results after each call.
+void testMatrixRotate(int height, int width) {
+  MatrixPtr hostSrc = std::make_shared<CpuMatrix>(height, width);
+  MatrixPtr devSrc = std::make_shared<GpuMatrix>(height, width);
+  MatrixPtr hostDst = std::make_shared<CpuMatrix>(width, height);
+  MatrixPtr devDst = std::make_shared<GpuMatrix>(width, height);
+  hostSrc->randomizeUniform();
+  devSrc->copyFrom(*hostSrc);
+
+  hostSrc->rotate(hostDst, false, true);
+  devSrc->rotate(devDst, true, true);
+  TensorCheckEqual(*hostDst, *devDst);
+  hostSrc->rotate(hostDst, true, false);
+  devSrc->rotate(devDst, false, false);
+  TensorCheckEqual(*hostDst, *devDst);
+}
+
 void testMatrixInverse(int height) {
   MatrixPtr cpu = std::make_shared<CpuMatrix>(height, height);
   MatrixPtr gpu = std::make_shared<GpuMatrix>(height, height);
@@ -323,7 +214,7 @@ void testMatrixInverse(int height) {
   cpu->add(*outputCheck);
 
   gpu->copyFrom(*cpu);
-  cpu->inverse(cpuI, false);
+  cpu->inverse(cpuI, true);
   gpu->inverse(gpuI, false);
 
   TensorCheckErr(*cpuI, *gpuI);
@@ -342,6 +233,7 @@ TEST(Matrix, unary) {
       testMatrixZeroAtOffset(height, width);
       testMatrixGetSum(height, width);
       testMatrixTranspose(height, width);
+      testMatrixRotate(height, width);
     }
     // inverse
     testMatrixInverse(height);
@@ -828,61 +720,6 @@ TEST(Matrix, sequenceAvgForward) {
   }
 }
 
-void testCosSimDerivate(int heightX, int heightY, int width, real scale) {
-  MatrixPtr prevOutX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevOutY = CpuMatrix::create(heightY, width, false, false);
-  MatrixPtr grad = CpuMatrix::create(heightX, 1, false, false);
-  MatrixPtr output = CpuMatrix::create(heightX, 1, false, false);
-  MatrixPtr prevGradX = CpuMatrix::create(heightX, width, false, false);
-  MatrixPtr prevGradY = CpuMatrix::create(heightY, width, false, false);
-
-  prevOutX->randomizeUniform();
-  prevOutY->randomizeUniform();
-  grad->randomizeUniform();
-  output->randomizeUniform();
-  prevGradX->randomizeUniform();
-  prevGradY->randomizeUniform();
-
-  MatrixPtr prevOutXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevOutYGpu = GpuMatrix::create(heightY, width, false, true);
-  MatrixPtr gradGpu = GpuMatrix::create(heightX, 1, false, true);
-  MatrixPtr outputGpu = GpuMatrix::create(heightX, 1, false, true);
-  MatrixPtr prevGradXGpu = GpuMatrix::create(heightX, width, false, true);
-  MatrixPtr prevGradYGpu = GpuMatrix::create(heightY, width, false, true);
-
-  prevOutXGpu->copyFrom(*prevOutX);
-  prevOutYGpu->copyFrom(*prevOutY);
-  gradGpu->copyFrom(*grad);
-  outputGpu->copyFrom(*output);
-  prevGradXGpu->copyFrom(*prevGradX);
-  prevGradYGpu->copyFrom(*prevGradY);
-
-  grad->cosSimDerivative(
-      *output, *prevOutX, *prevOutY, *prevGradX, *prevGradY, scale);
-
-  gradGpu->cosSimDerivative(*outputGpu,
-                            *prevOutXGpu,
-                            *prevOutYGpu,
-                            *prevGradXGpu,
-                            *prevGradYGpu,
-                            scale);
-
-  TensorCheckErr(*prevGradX, *prevGradXGpu);
-  TensorCheckErr(*prevGradY, *prevGradYGpu);
-}
-
-TEST(Matrix, cosSimDerivate) {
-  for (auto heightX : {1, 10, 100}) {
-    for (auto heightY : {1, heightX}) {
-      for (auto width : {1, 10, 100}) {
-        for (auto scale : {1.0, 2.0}) {
-          testCosSimDerivate(heightX, heightY, width, scale);
-        }
-      }
-    }
-  }
-}
-
 void testParamReluBackwardDiff(int height,
                                int width,
                                int w_height,
@@ -915,10 +752,11 @@ void testParamReluBackwardDiff(int height,
 }
 
 TEST(Matrix, paramReluBackwardDiff) {
-  for (auto height : {10, 100}) {
-    for (auto width : {10, 100}) {
+  for (auto height : {10, 40, 100}) {
+    for (auto width : {10, 40, 100}) {
       for (auto w_height : {1, 2}) {
         for (auto w_width : {1, 2}) {
+          if (width % (w_height * w_width)) continue;
           testParamReluBackwardDiff(height, width, w_height, w_width);
         }
       }
@@ -926,7 +764,7 @@ TEST(Matrix, paramReluBackwardDiff) {
   }
 }
 
-void testClassificationError(int numSamples, int dim) {
+void testClassificationError(int numSamples, int dim, int topkSize) {
   MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1);
   MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1);
   MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim);
@@ -939,17 +777,22 @@ void testClassificationError(int numSamples, int dim) {
   gpuOutput->copyFrom(*cpuOutput);
   gpuLabel->copyFrom(*cpuLabel);
 
-  cpuError->classificationError(*cpuOutput, *cpuLabel);
-  gpuError->classificationError(*gpuOutput, *gpuLabel);
+  cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize);
+  gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize);
 
   TensorCheckEqual(*cpuError, *gpuError);
 }
 
 TEST(Matrix, classificationError) {
-  for (auto numSamples : {1, 10, 100, 1000, 70000}) {
-    for (auto dim : {1, 10, 100, 1000}) {
-      VLOG(3) << " numSamples=" << numSamples << " dim=" << dim;
-      testClassificationError(numSamples, dim);
+  for (auto numSamples : {1, 5, 31, 90, 150, 300}) {
+    for (auto dim :
+         {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) {
+      for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) {
+        if (topkSize > dim) continue;
+        VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize
+                << " dim= " << dim;
+        testClassificationError(numSamples, dim, topkSize);
+      }
     }
   }
 }
@@ -1262,10 +1105,4 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_matrixUtil.h b/paddle/math/tests/test_matrixUtil.h
index 9aa74b15193723970d80b5d1a4e0ac95341cd45a..47f461474622d13ea2f922a77348c78b450ec37f 100644
--- a/paddle/math/tests/test_matrixUtil.h
+++ b/paddle/math/tests/test_matrixUtil.h
@@ -30,6 +30,17 @@ void checkMatrixEqual(const MatrixPtr& a, const MatrixPtr& b) {
   }
 }
 
+// Asserts two sparse matrices agree in metadata and in every stored value.
+void checkSMatrixEqual(const CpuSparseMatrix& a, const CpuSparseMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+  ASSERT_EQ(a.getFormat(), b.getFormat());
+  ASSERT_EQ(a.getElementCnt(), b.getElementCnt());
+  for (size_t k = 0; k < a.getElementCnt(); ++k)
+    ASSERT_FLOAT_EQ(a.getValue()[k], b.getValue()[k]);
+}
+
 void checkSMatrixEqual(const CpuSparseMatrixPtr& a,
                        const CpuSparseMatrixPtr& b) {
   ASSERT_EQ(a->getWidth(), b->getWidth());
@@ -73,6 +84,36 @@ void checkSMatrixEqual2(const CpuSparseMatrixPtr& a,
   }
 }
 
+// Asserts that each entry stored in sparse `a` matches dense `b` at the same
+// position; when a's value type is not FLOAT_VALUE the entry must be 1.0.
+void checkSMatrixEqual2Dense(const CpuSparseMatrix& a, const CpuMatrix& b) {
+  ASSERT_EQ(a.getWidth(), b.getWidth());
+  ASSERT_EQ(a.getHeight(), b.getHeight());
+  ASSERT_EQ(a.isTransposed(), b.isTransposed());
+  const bool hasValue = a.getValueType() == FLOAT_VALUE;
+  if (a.getFormat() != SPARSE_CSC) {
+    // Walk each row's index range; cols[k] is used as the column in `b`.
+    int* cols = a.getCols();
+    for (size_t row = 0; row < a.getHeight(); row++) {
+      const size_t end = a.getRowStartIdx(row + 1);
+      for (size_t k = a.getRowStartIdx(row); k < end; k++) {
+        const real expected = hasValue ? a.getValue()[k] : real(1.0);
+        ASSERT_FLOAT_EQ(expected, b.getElement(row, cols[k]));
+      }
+    }
+  } else {
+    // Walk each column's index range; rows[k] is used as the row in `b`.
+    int* rows = a.getRows();
+    for (size_t col = 0; col < a.getWidth(); col++) {
+      const size_t end = a.getColStartIdx(col + 1);
+      for (size_t k = a.getColStartIdx(col); k < end; k++) {
+        const real expected = hasValue ? a.getValue()[k] : real(1.0);
+        ASSERT_FLOAT_EQ(expected, b.getElement(rows[k], col));
+      }
+    }
+  }
+}
+
 void checkSMatrixEqual2Dense(const CpuSparseMatrixPtr& a,
                              const CpuMatrixPtr& b) {
   ASSERT_EQ(a->getWidth(), b->getWidth());
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index dcdbccffc3a19faa177c9867fe7ab142612f5209..a9185a4b24b13ca0287b0f67375c4599e8b9ac78 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 65d01a15718ae2bebd4869eff0e5407524bc0e7c..7a343cca33f5b420be6192231ac73ca1c2da5fb9 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) {
   tgtBuf[numSequences] = numSubSequences;
 }
 
+// Prints every member that is set into a string and stores it in *out under
+// the keys "value", "ids", "sequence pos" and "sub-sequence pos".
+void Argument::getValueString(
+    std::unordered_map<std::string, std::string>* out) const {
+  if (value) {
+    std::ostringstream text;
+    value->print(text);
+    out->emplace("value", text.str());
+  }
+  if (ids) {
+    std::ostringstream text;
+    ids->print(text, ids->getSize());
+    out->emplace("ids", text.str());
+  }
+  if (const auto& seq = sequenceStartPositions) {
+    std::ostringstream text;
+    seq->getVector(false)->print(text, seq->getSize());
+    out->emplace("sequence pos", text.str());
+  }
+  if (const auto& subSeq = subSequenceStartPositions) {
+    std::ostringstream text;
+    subSeq->getVector(false)->print(text, subSeq->getSize());
+    out->emplace("sub-sequence pos", text.str());
+  }
+}
+
+// Writes the fields collected by getValueString() to stream in a fixed order.
+void Argument::printValueString(std::ostream& stream,
+                                const std::string& prefix) const {
+  std::unordered_map<std::string, std::string> out;
+  getValueString(&out);
+  // Names must match the keys getValueString inserts ("ids", not "id").
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
+    auto it = out.find(field);
+    if (it != out.end()) stream << prefix << field << ":\n" << it->second;
+  }
+}
+
 void Argument::subArgFrom(const Argument& input,
                           size_t offset,
                           size_t height,
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index afd2de0202bf0f14ec3d4c5b856455a3488e41f6..9ef44be0cb3b960db1e789f3f26bb66d1fe63c81 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -163,7 +163,7 @@ struct Argument {
                        : sequenceStartPositions->getData(false);
   }
 
-  static inline real sumCosts(const std::vector<Argument>& arguments) {
+  static inline real sum(const std::vector<Argument>& arguments) {
     real cost = 0;
     for (auto& arg : arguments) {
       if (arg.value) {
@@ -297,6 +297,23 @@ struct Argument {
    sequence has sub-sequence degrades to a sequence.
    */
   void degradeSequence(const Argument& input, bool useGpu);
+
+  /**
+   * @brief getValueString will return the argument's output in string. There
+   * are several kinds of output. The keys of output dictionary are 'value',
+   * 'ids', 'sequence pos', 'sub-sequence pos'.
+   * @param out [out]: map that receives one entry per field that is set.
+   */
+  void getValueString(std::unordered_map<std::string, std::string>* out) const;
+
+  /**
+   * @brief printValueString will print the argument's output in order of
+   * 'value', 'id', 'sequence pos', 'sub-sequence pos'.
+   * @param stream: Output stream
+   * @param prefix: line prefix for printing.
+   */
+  void printValueString(std::ostream& stream,
+                        const std::string& prefix = "") const;
 };
 
 }  // namespace paddle
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
index 1ee220d2dc1a26b3f394ca673975cc827f450206..2e7c18b8084dc25b9f2f7630390bb4553ac703c9 100644
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
-#include "paddle/utils/common.h"
 
 #include "ParameterConfig.pb.h"
 
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 29d6e20dc16968cdda3e79b66b0c81aaaf303ef4..1ccded818796798105a889df978618688b56ed36 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -375,10 +375,6 @@ bool Parameter::load(const std::string& filename) {
   std::ifstream fs(filename, std::ios_base::binary);
   if (!fs) {
     LOG(INFO) << "missing parameters [" << filename << "] while loading model.";
-    if (isStatic()) {
-      LOG(FATAL) << getName() << " is static but missing, not allowed.";
-      return false;
-    }
     if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) {
       LOG(FATAL) << getName() << " missing, not allowed.";
       return false;
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index e05137b315f254795de26a5ff0ac977e7968f4d8..72c8336799133ad3f5855b0c1aa06639179ff70a 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -26,11 +26,11 @@ limitations under the License. */
 #include "ParameterUpdaterHook.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
index 2cb379871716ffd9e75eede607276b6b3f200e6b..0fca280149c30f0241ec988dfd6719a5519808f4 100644
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/math/Vector.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdaterBase.h b/paddle/parameter/ParameterUpdaterBase.h
index b230e170c15f1b004c5357fb7d0ad2204d01f44b..6265c828a1a254d01dc975b0155e7ac69df49a31 100644
--- a/paddle/parameter/ParameterUpdaterBase.h
+++ b/paddle/parameter/ParameterUpdaterBase.h
@@ -55,7 +55,7 @@ public:
   // between startBatch() and finishBatch(), update() will be called
   // by the trainer multiple times, each time for updating one Parameter
   // with its gradient in PARAMETER_GRADIENT
-  virtual void update(Parameter* para) {
+  void update(Parameter* para) {
     SetDevice setDevice(para->getDeviceId());
     para->updateHook();
     this->updateImpl(para);
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index aa57a6346917b259dbb89f6ad2340fb8db28f3e3..8bab5a6289e2bb9f634e8cce4557de55f7704447 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -23,15 +23,6 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  int ret = RUN_ALL_TESTS();
-
-  return ret;
-}
-
 class CommonTest : public ::testing::Test {
 protected:
   CommonTest() : testStat_("test") {}
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index ccf05ae1ca3ab76fbe9d36237969207768de4dd2..11d7a147bf749ba2de0772b5efd5f73ab0ccdb1a 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "ParameterService.pb.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/pserver/ProtoServer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/CMakeLists.txt b/paddle/pserver/CMakeLists.txt
index 1c1e1964b8d3fd83c801f3988760a72dfc032e7f..b7f85ea1a6dfda2a37c315ba15c6ca1979cf4131 100644
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -24,13 +24,15 @@ set(PSERVER_SOURCES
     BaseClient.cpp
     ParameterClient2.cpp
     ParameterServer2.cpp
-    SparseParameterDistribution.cpp)
+    SparseParameterDistribution.cpp
+    ParameterServerController.cpp)
 
 set(PSERVER_HEADERS
     BaseClient.h
     ParameterClient2.h
     ParameterServer2.h
-    SparseParameterDistribution.h)
+    SparseParameterDistribution.h
+    ParameterServerController.h)
 
 add_library(paddle_pserver STATIC
     ${PSERVER_SOURCES})
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index 70cfc6d70072f399ef97eef1a0e6111a127cbd9f..89b3ddd502151e537b81bdbb09f171dd6e13ba26 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -23,11 +23,11 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/pserver/BaseClient.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/common.h"
 
 #include "ParameterService.pb.h"
 
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 79d1eb97ff149f4f5ca9a924c1b0b7ba629f1e33..0f5a5895907b20a0cf882b6fa6fb74bd52dce058 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -29,10 +29,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/common.h"
 
 #include "ParameterService.pb.h"
 
diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp
index ffc521f2c143d95ff07c3825e0a746cb31743d9b..845a2c27e242cfbe31679fea6eae13d2b400ec81 100644
--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -13,66 +13,17 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fstream>
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Util.h"
-
-#include "ParameterServer2.h"
-#include "RDMANetwork.h"
-#include "paddle/utils/Flags.h"
+#include "ParameterServerController.h"
 
 using namespace paddle;  // NOLINT
 
 int main(int argc, char** argv) {
   initMain(argc, argv);
 
-  std::vector<std::string> devices;
-  std::vector<std::shared_ptr<ParameterServer2>> pservers;
-
-  // round robin to loadbalance RDMA server ENGINE
-  int rdmaCpu = 0;
-  int onlineCpus = rdma::numCpus();
-  int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-  if (FLAGS_nics.empty()) {
-    pservers.resize(numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      if (FLAGS_rdma_tcp == "rdma") {
-        pservers[i].reset(
-            new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-        rdmaCpu = rdmaCpu % onlineCpus;
-      } else {
-        pservers[i].reset(new ParameterServer2(std::string(), FLAGS_port + i));
-      }
-      CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                 << FLAGS_port + i;
-      LOG(INFO) << "pserver started : " << FLAGS_port + i;
-      pservers[i]->start();
-    }
-  } else {
-    str::split(FLAGS_nics, ',', &devices);
-    pservers.resize(devices.size() * numPorts);
-    for (int i = 0; i < numPorts; ++i) {
-      for (size_t j = 0; j < devices.size(); ++j) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i * devices.size() + j].reset(new ParameterServer2(
-              getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i * devices.size() + j].reset(
-              new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-        }
-        CHECK(pservers[i * devices.size() + j]->init())
-            << "Fail to initialize parameter server" << devices[j]
-            << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << devices[j] << ":"
-                  << FLAGS_port + i;
-        pservers[i * devices.size() + j]->start();
-      }
-    }
-  }
-
-  for (auto& pserver : pservers) {
-    pserver->join();
-  }
+  std::unique_ptr<ParameterServerController> parameterServerPtr(
+      paddle::ParameterServerController::createFromGflags());
+  parameterServerPtr->start();
+  parameterServerPtr->wait();
 
   return 0;
 }
diff --git a/paddle/pserver/ParameterServerController.cpp b/paddle/pserver/ParameterServerController.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1d11a2e1acbc0f091901f3854ca99490d89ebe36
--- /dev/null
+++ b/paddle/pserver/ParameterServerController.cpp
@@ -0,0 +1,102 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ParameterServerController.h"
+
+namespace paddle {
+
+ParameterServerController::ParameterServerController(
+    const ParameterServerConfig& config) {
+  // Create and init one ParameterServer2 per port, and per NIC if nics is set.
+  std::vector<std::string> devices;
+  int rdmaCpu = 0;
+  int onlineCpus = rdma::numCpus();
+  int numPorts = config.ports_num() + config.ports_num_for_sparse();
+  // In "rdma" mode, servers are assigned to the online CPUs round-robin.
+  if (config.nics().empty()) {
+    parameterServers_.resize(numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      if (config.rdma_tcp() == "rdma") {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i, rdmaCpu++));
+        rdmaCpu = rdmaCpu % onlineCpus;
+      } else {
+        parameterServers_[i].reset(
+            new ParameterServer2(std::string(), config.port() + i));
+      }
+      CHECK(parameterServers_[i]->init()) << "Fail to initialize parameter "
+                                             "server on port "
+                                          << config.port() + i;
+    }
+  } else {
+    str::split(config.nics(), ',', &devices);
+    parameterServers_.resize(devices.size() * numPorts);
+    for (int i = 0; i < numPorts; ++i) {
+      for (size_t j = 0; j < devices.size(); ++j) {
+        if (config.rdma_tcp() == "rdma") {
+          parameterServers_[i * devices.size() + j].reset(new ParameterServer2(
+              getIpAddr(devices[j]), config.port() + i, rdmaCpu++));
+          rdmaCpu = rdmaCpu % onlineCpus;
+        } else {
+          parameterServers_[i * devices.size() + j].reset(
+              new ParameterServer2(getIpAddr(devices[j]), config.port() + i));
+        }
+        CHECK(parameterServers_[i * devices.size() + j]->init())
+            << "Fail to initialize parameter server with device " << devices[j]
+            << ":" << config.port() + i;
+      }
+    }
+  }
+}
+
+ParameterServerController::~ParameterServerController() { this->wait(); }
+
+ParameterServerController* ParameterServerController::createFromGflags() {
+  // Mirror each legacy gflags value into the protobuf configuration.
+  ParameterServerConfig flagConfig;
+  flagConfig.set_port(FLAGS_port);
+  flagConfig.set_ports_num(FLAGS_ports_num);
+  flagConfig.set_ports_num_for_sparse(FLAGS_ports_num_for_sparse);
+  flagConfig.set_rdma_tcp(FLAGS_rdma_tcp);
+  flagConfig.set_nics(FLAGS_nics);
+  // create() returns a heap-allocated controller that the caller must own.
+  return create(flagConfig);
+}
+
+ParameterServerController* ParameterServerController::create(
+    const ParameterServerConfig& config) {
+  return new ParameterServerController(config);
+}
+
+// Starts every ParameterServer2 instance in creation order, logging the total
+// instance count first and then the index of each server as it is started.
+void ParameterServerController::start() {
+  const size_t serverCount = parameterServers_.size();
+  LOG(INFO) << "number of parameterServer instances: " << serverCount;
+  for (size_t idx = 0; idx < serverCount; ++idx) {
+    LOG(INFO) << "Starting parameterServer[" << idx << "]";
+    parameterServers_[idx]->start();
+  }
+}
+
+// Joins every ParameterServer2 instance in creation order, logging the index
+// of each server before calling join() on it.
+void ParameterServerController::wait() {
+  for (size_t idx = 0; idx < parameterServers_.size(); ++idx) {
+    LOG(INFO) << "Waiting parameterServer[" << idx << "]";
+    parameterServers_[idx]->join();
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/pserver/ParameterServerController.h b/paddle/pserver/ParameterServerController.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe9bb0b4d02339d0d31d5bc2942932e1f876098b
--- /dev/null
+++ b/paddle/pserver/ParameterServerController.h
@@ -0,0 +1,74 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "ParameterServer2.h"
+#include "ParameterServerConfig.pb.h"
+#include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+
+namespace paddle {
+
+/**
+ * @brief ParameterServerController creates, initializes and manages multiple
+ * parameter server instances. The number of instances is determined by the
+ * number of ports (used for sending parameters) and the network devices
+ * configured through gflags or proto.
+ */
+class ParameterServerController final {
+public:
+  DISABLE_COPY(ParameterServerController);
+
+  /**
+   * @brief Ctor. Creates and inits the ParameterServer2 instances from config.
+   */
+  explicit ParameterServerController(const ParameterServerConfig& config);
+
+  /**
+   * @brief Dtor. Calls wait() so every managed server is joined first.
+   */
+  ~ParameterServerController();
+
+  /**
+   * @brief Create a ParameterServerController from gflags. Kept for
+   * compatibility with the old gflags-based configuration.
+   */
+  static ParameterServerController* createFromGflags();
+
+  /**
+   * @brief Create a ParameterServerController from a ParameterServerConfig,
+   * so that callers do not need gflags. All ParameterServer2 instances are
+   * initialized according to the config.
+   * @return a heap-allocated controller; the caller must take ownership.
+   */
+  static ParameterServerController* create(const ParameterServerConfig& config);
+
+  /**
+   * @brief Start all ParameterServer2 instances held by this
+   * ParameterServerController.
+   */
+  void start();
+
+  /**
+   * @brief Join every ParameterServer2 instance held by this
+   * ParameterServerController, in the order they were created.
+   */
+  void wait();
+
+private:
+  std::vector<std::unique_ptr<ParameterServer2>> parameterServers_;
+};
+
+}  // namespace paddle
diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt
index 64654f67d0c2c82f05a5038fb33b220f3cff0f39..6e8f9c37f64b70921e09241089a5a480fd8ca47f 100644
--- a/paddle/pserver/test/CMakeLists.txt
+++ b/paddle/pserver/test/CMakeLists.txt
@@ -10,9 +10,11 @@ add_test(NAME socket_test
 add_unittest_without_exec(test_ProtoServer
     test_ProtoServer.cpp)
 
-add_test(NAME test_ProtoServer
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
-        ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
+# Register test_ProtoServer (run through .set_port.sh) except on Travis CI.
+if(NOT ON_TRAVIS)
+    add_test(NAME test_ProtoServer COMMAND ${PROJ_ROOT}/paddle/.set_port.sh
+        -p port ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
+endif()
 
 # TODO(yuyang18): Run test_ProtoServer when with rdma
 # add_test(NAME test_ProtoServerRDMA
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index 9f86ee80f4e5cc99ea3597b3ed37a387578f032a..04236fda2fb62b928b5c06ff38acfd3eb7217b08 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -12,14 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-
 #include <gtest/gtest.h>
-
+#include <memory>
 #include "ParameterService.pb.h"
 #include "paddle/math/Vector.h"
 #include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
 DEFINE_string(server_addr, "127.0.0.1", "Server address");
 DEFINE_int64(dim, 50000000, "Data size");
@@ -162,18 +161,9 @@ TEST(ProtoServer, extended) {
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
   testing::InitGoogleTest(&argc, argv);
-
-  MyServer* server;
-  if (FLAGS_rdma_tcp == "rdma") {
-    server = new MyServer(FLAGS_port, 0);
-  } else {
-    server = new MyServer(FLAGS_port);
-  }
-
-  server->start();
+  MyServer server(FLAGS_port, FLAGS_rdma_tcp == "rdma" ? 0 : -1);
+  server.start();
   usleep(10000);
 
-  int ret = RUN_ALL_TESTS();
-
-  exit(ret);
+  return RUN_ALL_TESTS();
 }
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 981d10afda2671be9e8f0da1a4bee755f7aa9d61..c009b05cdeeb9dbe2dc70048e6827a12445f677e 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -23,8 +23,18 @@ __all__ = ['DataProviderConverter']
 class IScanner(object):
     def __init__(self, input_type, pos):
         self.input_type = input_type
-        assert isinstance(self.input_type, dp2.InputType)
+        if not isinstance(self.input_type, dp2.InputType):
+            raise ValueError("input type should be dataprovider2.InputType")
         self.pos = pos
+        # data_in_gpu is used to indicate whether to create argument on GPU
+        # or not in GPU mode. Now if using one thread (trainer_count=1),
+        # trainer uses NeuralNetwork which needs to create argument on GPU
+        # before calling forward function. So, set data_in_gpu to True.
+        # Otherwise, trainer uses MultiGradientMachine which will transfer
+        # data from CPU to GPU in the forward function, set data_in_gpu to
+        # False in this case.
+        self.data_in_gpu = swig_paddle.isUsingGpu(
+        ) and swig_paddle.getTrainerCount() == 1
 
     def scan(self, dat):
         pass
@@ -34,6 +44,10 @@ class IScanner(object):
 
 
 class DenseScanner(IScanner):
+    """
+    :type __mat__: numpy.ndarray
+    """
+
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
@@ -46,8 +60,10 @@ class DenseScanner(IScanner):
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
+        if self.__mat__.dtype != numpy.float32:
+            self.__mat__ = self.__mat__.astype(numpy.float32)
+        m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True,
+                                                    self.data_in_gpu)
         argument.setSlotValue(self.pos, m)
 
 
@@ -57,7 +73,6 @@ class SparseBinaryScanner(IScanner):
         self.__rows__ = [0]
         self.__cols__ = []
         self.__height__ = 0
-        self.__nnz__ = 0
         self.__value__ = []
 
     def scan(self, dat):
@@ -70,11 +85,13 @@ class SparseBinaryScanner(IScanner):
 
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
-        assert isinstance(self.input_type, dp2.InputType)
-        m = swig_paddle.Matrix.createSparse(self.__height__,
-                                            self.input_type.dim,
-                                            len(self.__cols__),
-                                            len(self.__value__) == 0)
+        m = swig_paddle.Matrix.createSparse(
+            self.__height__,
+            self.input_type.dim,
+            len(self.__cols__),
+            len(self.__value__) == 0,
+            False,  # trans
+            False)  # TODO support GPU
         assert isinstance(m, swig_paddle.Matrix)
         m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__)
         argument.setSlotValue(self.pos, m)
@@ -98,7 +115,7 @@ class IndexScanner(IScanner):
         self.__ids__.append(dat)
 
     def finish_scan(self, argument):
-        ids = swig_paddle.IVector.create(self.__ids__)
+        ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu)
         assert isinstance(argument, swig_paddle.Arguments)
         argument.setSlotIds(self.pos, ids)
 
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
index ce105d249aaf3e838443d3e0cf5996fe8c783a22..1c9455fab5f9c1179bddffb100cd53fe8adfb6b1 100644
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
@@ -195,6 +195,12 @@ def __monkeypatch_gradient_machine__():
 
     swig_paddle.GradientMachine.getParameters = getParameters
 
+    def getNonStaticParameters(self):
+        return (self.getNonStaticParameter(i)
+                for i in xrange(self.getNonStaticParameterSize()))
+
+    swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters
+
     def getLayerOutputs(self, layerNames):
         """
         getLayerOutputs. get outputs of layers and return a numpy matrix dict.
@@ -208,7 +214,7 @@ def __monkeypatch_gradient_machine__():
 
         output = dict()
         for name in layerNames:
-            output[name] = __matrix_to_numpy__(self.getLayerOutput(name))
+            output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name))
         return output
 
     swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs
diff --git a/paddle/scripts/CMakeLists.txt b/paddle/scripts/CMakeLists.txt
index 1bae396a18688cd53e164774df07660ccc2451d7..66a46e1883a49d491f0cb3056a7039407d72e337 100644
--- a/paddle/scripts/CMakeLists.txt
+++ b/paddle/scripts/CMakeLists.txt
@@ -2,8 +2,16 @@ configure_file(submit_local.sh.in
     submit_local.sh
     @ONLY)
 
-
 install(FILES ${CMAKE_CURRENT_BINARY_DIR}/submit_local.sh DESTINATION bin
         PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
             GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ
         RENAME paddle)
+
+# Copy tools/usage_stat/usage.sh into the build tree (@VAR@ substitution only)
+# and install the result as opt/paddle/bin/paddle_usage.
+configure_file(${CMAKE_CURRENT_SOURCE_DIR}/tools/usage_stat/usage.sh
+    ${CMAKE_CURRENT_BINARY_DIR}/usage.sh @ONLY)
+install(FILES ${CMAKE_CURRENT_BINARY_DIR}/usage.sh
+    DESTINATION opt/paddle/bin RENAME paddle_usage
+    PERMISSIONS OWNER_EXECUTE OWNER_WRITE OWNER_READ
+        GROUP_EXECUTE GROUP_READ WORLD_EXECUTE WORLD_READ)
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index b01de499bd1fbcfff1f655535f574ae2caa17707..98eaa15a0fdf206c0a0f23a98ee718d70b34ff4b 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -4,43 +4,67 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
 
+# ENV variables
+ARG BUILD_WOBOQ
+ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 
+ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
+ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=OFF
 ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
-RUN mkdir /paddle
+ENV HOME /root
+
+# Add bash enhancements
+COPY ./paddle/scripts/docker/root/ /root/
+
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y automake locales clang-format-3.8 && \
+    apt-get clean -y
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip install -U pre-commit 'requests==2.9.2' jupyter
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
+    cd .. && rm -rf cmake-3.4.1
+
 COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 
-RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
-RUN paddle version  # print version after build
-
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index a68cc79b84271c63d41a89494150381d96748b67..4d30ccdd2b5de27c4dfd110537e69c269028ef31 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -4,43 +4,67 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
-RUN apt-get update \
-    && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libgtest-dev \
-    libatlas-dev libatlas3-base g++ m4 python-pip \
-    python-protobuf python-numpy python-dev swig openssh-server \
-    wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
-    sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
-    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    && apt-get clean -y
-RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
-RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
 
+# ENV variables
+ARG BUILD_WOBOQ
+ARG BUILD_AND_INSTALL
 ARG WITH_AVX
 ARG WITH_DOC
-ARG WITH_SWIG_PY
 ARG WITH_STYLE_CHECK
 
+ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
+ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=ON
 ENV WITH_AVX=${WITH_AVX:-ON}
-ENV WITH_DOC=${WITH_DOC:-ON}
-ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
+ENV WITH_DOC=${WITH_DOC:-OFF}
 ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
 
-RUN mkdir /paddle
+ENV HOME /root
+
+# Add bash enhancements
+COPY ./paddle/scripts/docker/root/ /root/
+
+RUN apt-get update && \
+    apt-get install -y git python-pip python-dev openssh-server bison && \
+    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
+    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
+    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
+    apt-get install -y automake locales clang-format-3.8 && \
+    apt-get clean -y
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel pillow BeautifulSoup && \
+    pip install -U docopt PyYAML sphinx && \
+    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip install -U pre-commit 'requests==2.9.2' jupyter
+
+RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
+    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
+    cd .. && rm -rf cmake-3.4.1
+
 COPY . /paddle/
 RUN /paddle/paddle/scripts/docker/build.sh
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 
-RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
-RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
-RUN paddle version  # print version after build
-
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
 RUN mkdir /var/run/sshd
 RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ca3f1c3f1896feaae657f47c121ce6cd858dc2c9..d9c44f42340323afb4930dab35114f2adc5fbb3a 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -8,40 +8,54 @@ function abort(){
 trap 'abort' 0
 set -e
 
-if [ ${WITH_GPU} == 'ON' ]; then
-  ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
-fi
+# If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied
+# source tree to /paddle, and this scripts should build it into
+# /paddle/build.
+if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then
+    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
+	ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
+    fi
+
+    mkdir -p /paddle/build # -p means no error if exists
+    cd /paddle/build
+    cmake .. \
+	  -DWITH_DOC=${WITH_DOC:-OFF} \
+	  -DWITH_GPU=${WITH_GPU:-OFF} \
+	  -DWITH_AVX=${WITH_AVX:-OFF} \
+	  -DWITH_SWIG_PY=ON \
+	  -DCUDNN_ROOT=/usr/ \
+	  -DWITH_STYLE_CHECK=OFF \
+	  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+    make -j `nproc`
+    make install
+
+    if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
+        apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev 
+        # Install woboq_codebrowser.
+        git clone https://github.com/woboq/woboq_codebrowser /woboq
+        cd /woboq
+        cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+        -DCMAKE_BUILD_TYPE=Release \
+        .
+        make
 
-mkdir -p /paddle/build # -p means no error if exists
-cd /paddle/build
-cmake .. \
-      -DWITH_DOC=ON \
-      -DWITH_GPU=${WITH_GPU} \
-      -DWITH_AVX=${WITH_AVX} \
-      -DWITH_SWIG_PY=ON \
-      -DCUDNN_ROOT=/usr/ \
-      -DWITH_STYLE_CHECK=OFF \
-      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-make -j `nproc`
-make install
-
-# Install woboq_codebrowser.
-git clone https://github.com/woboq/woboq_codebrowser /woboq
-cd /woboq
-cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-      -DCMAKE_BUILD_TYPE=Release \
-      .
-make
-
-export WOBOQ_OUT=/usr/share/nginx/html/paddle
-export BUILD_DIR=/paddle/build
-mkdir -p $WOBOQ_OUT
-cp -rv /woboq/data $WOBOQ_OUT/../data
-/woboq/generator/codebrowser_generator \
-    -b /paddle/build \
-    -a \
-    -o $WOBOQ_OUT \
-    -p paddle:/paddle
-/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+        export WOBOQ_OUT=/usr/share/nginx/html/paddle
+        export BUILD_DIR=/paddle/build
+        mkdir -p $WOBOQ_OUT
+        cp -rv /woboq/data $WOBOQ_OUT/../data
+        /woboq/generator/codebrowser_generator \
+        -b /paddle/build \
+        -a \
+        -o $WOBOQ_OUT \
+        -p paddle:/paddle
+        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+        cd /woboq
+        make clean
+    fi
+
+    pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl
+    pip install /usr/local/opt/paddle/share/wheels/paddle*.whl
+    paddle version
+fi
 
 trap : 0
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
new file mode 100755
index 0000000000000000000000000000000000000000..87083467f50acd689ce57b86951f5f7a03c6a58b
--- /dev/null
+++ b/paddle/scripts/docker/entrypoint
@@ -0,0 +1,8 @@
+#!/bin/bash
+LOG=/var/log/all
+# Run sshd and Jupyter in the background and follow their shared log file.
+touch "$LOG"
+# Redirect stderr as well, so diagnostics written there also reach the log.
+/usr/sbin/sshd -D >> "$LOG" 2>&1 &
+jupyter notebook --ip=0.0.0.0 /notes/ >> "$LOG" 2>&1 &
+tail -f "$LOG"
diff --git a/paddle/scripts/docker/root/.bashrc b/paddle/scripts/docker/root/.bashrc
new file mode 100755
index 0000000000000000000000000000000000000000..4b3024e4e81a0fa206a796c12a8b9d72f1a8f5d9
--- /dev/null
+++ b/paddle/scripts/docker/root/.bashrc
@@ -0,0 +1,46 @@
+# Locales
+
+export LC_ALL=en_US.UTF-8
+export LANG=en_US.UTF-8
+export LANGUAGE=en_US.UTF-8
+
+# Aliases
+
+alias rm='rm -i'
+alias cp='cp -i'
+alias mv='mv -i'
+
+alias ls='ls -hFG'
+alias l='ls -lF'
+alias ll='ls -alF'
+alias lt='ls -ltrF'
+alias ll='ls -alF'
+alias lls='ls -alSrF'
+alias llt='ls -altrF'
+
+# Colorize directory listing
+
+alias ls="ls -ph --color=auto"
+
+# Colorize grep
+
+if echo hello|grep --color=auto l >/dev/null 2>&1; then
+  export GREP_OPTIONS="--color=auto" GREP_COLOR="1;31"
+fi
+
+# Shell
+
+export CLICOLOR="1"
+
+YELLOW="\[\033[1;33m\]"
+NO_COLOUR="\[\033[0m\]"
+GREEN="\[\033[1;32m\]"
+WHITE="\[\033[1;37m\]"
+
+source ~/.scripts/git-prompt.sh
+
+export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w$YELLOW\$(__git_ps1 \" \[\033[35m\]{\[\033[36m\]%s\[\033[35m\]}\")$NO_COLOUR "
+
+# Git
+
+source ~/.scripts/git-completion.sh
diff --git a/paddle/scripts/docker/root/.gitconfig b/paddle/scripts/docker/root/.gitconfig
new file mode 100755
index 0000000000000000000000000000000000000000..6c249803a50403b9b79e36a13abe7fe88a35729d
--- /dev/null
+++ b/paddle/scripts/docker/root/.gitconfig
@@ -0,0 +1,43 @@
+[user]
+  name =
+  email =
+
+[alias]
+  st = status --branch --short
+  ci = commit
+  br = branch
+  co = checkout
+  df = diff
+  l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short
+  ll = log --stat
+
+[merge]
+  tool = vimdiff
+
+[core]
+  excludesfile = ~/.gitignore
+  editor = vim
+
+[color]
+  branch = auto
+  diff = auto
+  status = auto
+
+[color "branch"]
+  current = yellow reverse
+  local = yellow
+  remote = green
+
+[color "diff"]
+  meta = yellow bold
+  frag = magenta bold
+  old = red bold
+  new = green bold
+
+[color "status"]
+  added = yellow
+  changed = green
+  untracked = cyan
+
+[push]
+  default = matching
\ No newline at end of file
diff --git a/paddle/scripts/docker/root/.scripts/git-completion.sh b/paddle/scripts/docker/root/.scripts/git-completion.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bdddef5ac2faf50b47dd03539dae8912bec8a16c
--- /dev/null
+++ b/paddle/scripts/docker/root/.scripts/git-completion.sh
@@ -0,0 +1,2663 @@
+#!bash
+#
+# bash/zsh completion support for core Git.
+#
+# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
+# Conceptually based on gitcompletion (http://gitweb.hawaga.org.uk/).
+# Distributed under the GNU General Public License, version 2.0.
+#
+# The contained completion routines provide support for completing:
+#
+#    *) local and remote branch names
+#    *) local and remote tag names
+#    *) .git/remotes file names
+#    *) git 'subcommands'
+#    *) tree paths within 'ref:path/to/file' expressions
+#    *) file paths within current working directory and index
+#    *) common --long-options
+#
+# To use these routines:
+#
+#    1) Copy this file to somewhere (e.g. ~/.git-completion.sh).
+#    2) Add the following line to your .bashrc/.zshrc:
+#        source ~/.git-completion.sh
+#    3) Consider changing your PS1 to also show the current branch,
+#       see git-prompt.sh for details.
+
+case "$COMP_WORDBREAKS" in
+*:*) : great ;;  # ':' is already one of the word-break characters; nothing to do
+*)   COMP_WORDBREAKS="$COMP_WORDBREAKS:"  # otherwise append ':' to the word-break set
+esac
+
+# __gitdir accepts 0 or 1 arguments (i.e., location)
+# returns location of .git repo
+__gitdir ()  # print the git directory; optional $1 is a location to inspect instead of the cwd
+{
+  if [ -z "${1-}" ]; then
+    if [ -n "${__git_dir-}" ]; then  # an explicitly provided __git_dir wins
+      echo "$__git_dir"
+    elif [ -n "${GIT_DIR-}" ]; then
+      test -d "${GIT_DIR-}" || return 1  # GIT_DIR is set but is not a directory: fail
+      echo "$GIT_DIR"
+    elif [ -d .git ]; then
+      echo .git
+    else
+      git rev-parse --git-dir 2>/dev/null  # let git locate the repository itself
+    fi
+  elif [ -d "$1/.git" ]; then
+    echo "$1/.git"
+  else
+    echo "$1"  # no .git below $1: echo the argument back unchanged
+  fi
+}
+
+# The following function is based on code from:
+#
+#   bash_completion - programmable completion functions for bash 3.2+
+#
+#   Copyright © 2006-2008, Ian Macdonald <ian@caliban.org>
+#             © 2009-2010, Bash Completion Maintainers
+#                     <bash-completion-devel@lists.alioth.debian.org>
+#
+#   This program is free software; you can redistribute it and/or modify
+#   it under the terms of the GNU General Public License as published by
+#   the Free Software Foundation; either version 2, or (at your option)
+#   any later version.
+#
+#   This program is distributed in the hope that it will be useful,
+#   but WITHOUT ANY WARRANTY; without even the implied warranty of
+#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+#   GNU General Public License for more details.
+#
+#   You should have received a copy of the GNU General Public License
+#   along with this program; if not, write to the Free Software Foundation,
+#   Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+#
+#   The latest version of this software can be obtained here:
+#
+#   http://bash-completion.alioth.debian.org/
+#
+#   RELEASE: 2.x
+
+# This function can be used to access a tokenized list of words
+# on the command line:
+#
+# __git_reassemble_comp_words_by_ref '=:'
+# if test "${words_[cword_-1]}" = -w
+# then
+#   ...
+# fi
+#
+# The argument should be a collection of characters from the list of
+# word completion separators (COMP_WORDBREAKS) to treat as ordinary
+# characters.
+#
+# This is roughly equivalent to going back in time and setting
+# COMP_WORDBREAKS to exclude those characters.  The intent is to
+# make option types like --date=<type> and <rev>:<path> easy to
+# recognize by treating each shell word as a single token.
+#
+# It is best not to set COMP_WORDBREAKS directly because the value is
+# shared with other completion scripts.  By the time the completion
+# function gets called, COMP_WORDS has already been populated so local
+# changes to COMP_WORDBREAKS have no effect.
+#
+# Output: words_, cword_, cur_.
+
+__git_reassemble_comp_words_by_ref()  # $1: separator characters to treat as ordinary; sets words_ and cword_
+{
+  local exclude i j first
+  # Which word separators to exclude?
+  exclude="${1//[^$COMP_WORDBREAKS]}"  # keep only the characters of $1 that really are word breaks
+  cword_=$COMP_CWORD
+  if [ -z "$exclude" ]; then
+    words_=("${COMP_WORDS[@]}")  # nothing to merge: copy the words through unchanged
+    return
+  fi
+  # List of word completion separators has shrunk;
+  # re-assemble words to complete.
+  for ((i=0, j=0; i < ${#COMP_WORDS[@]}; i++, j++)); do  # i indexes COMP_WORDS, j indexes words_
+    # Append each nonempty word consisting of just
+    # word separator characters to the current word.
+    first=t
+    while
+      [ $i -gt 0 ] &&
+      [ -n "${COMP_WORDS[$i]}" ] &&
+      # word consists of excluded word separators
+      [ "${COMP_WORDS[$i]//[^$exclude]}" = "${COMP_WORDS[$i]}" ]
+    do
+      # Attach to the previous token,
+      # unless the previous token is the command name.
+      if [ $j -ge 2 ] && [ -n "$first" ]; then
+        ((j--))
+      fi
+      first=
+      words_[$j]=${words_[j]}${COMP_WORDS[i]}
+      if [ $i = $COMP_CWORD ]; then
+        cword_=$j  # the cursor word now lives at the merged index
+      fi
+      if (($i < ${#COMP_WORDS[@]} - 1)); then
+        ((i++))
+      else
+        # Done.
+        return
+      fi
+    done
+    words_[$j]=${words_[j]}${COMP_WORDS[i]}  # append the ordinary word following the separator(s)
+    if [ $i = $COMP_CWORD ]; then
+      cword_=$j
+    fi
+  done
+}
+
+if ! type _get_comp_words_by_ref >/dev/null 2>&1; then  # define this fallback only when no command of that name exists yet
+_get_comp_words_by_ref ()  # usage: [-n EXCLUDE] followed by any of: cur prev words cword
+{
+  local exclude cur_ words_ cword_
+  if [ "$1" = "-n" ]; then
+    exclude=$2  # separator characters to treat as ordinary
+    shift 2
+  fi
+  __git_reassemble_comp_words_by_ref "$exclude"
+  cur_=${words_[cword_]}
+  while [ $# -gt 0 ]; do  # each remaining argument names a caller variable to fill in
+    case "$1" in
+    cur)
+      cur=$cur_
+      ;;
+    prev)
+      prev=${words_[$cword_-1]}
+      ;;
+    words)
+      words=("${words_[@]}")
+      ;;
+    cword)
+      cword=$cword_
+      ;;
+    esac
+    shift
+  done
+}
+fi
+
+__gitcompadd ()  # store in COMPREPLY each word of $1 that starts with $3, as "$2<word>$4"
+{
+  local x i=0  # x is local so a completion run cannot overwrite a variable named x in the user's shell
+  for x in $1; do  # $1 is split on the IFS currently in effect (set by the caller)
+    if [[ "$x" == "$3"* ]]; then
+      COMPREPLY[i++]="$2$x$4"
+    fi
+  done
+}
+
+# Generates completion reply, appending a space to possible completion words,
+# if necessary.
+# It accepts 1 to 4 arguments:
+# 1: List of possible completion words.
+# 2: A prefix to be added to each possible completion word (optional).
+# 3: Generate possible completion matches for this word (optional).
+# 4: A suffix to be appended to each possible completion word (optional).
+__gitcomp ()
+{
+  local cur_="${3-$cur}"  # word to match: $3 when given, else the caller's $cur
+
+  case "$cur_" in
+  --*=)  # word is a long option ending in '=': offer nothing
+    ;;
+  *)
+    local c i=0 IFS=$' \t\n'
+    for c in $1; do
+      c="$c${4-}"  # append the optional suffix before matching
+      if [[ $c == "$cur_"* ]]; then
+        case $c in
+        --*=*|*.) ;;  # no trailing space after "--opt=..." or a word ending in '.'
+        *) c="$c " ;;
+        esac
+        COMPREPLY[i++]="${2-}$c"
+      fi
+    done
+    ;;
+  esac
+}
+
+# Generates completion reply from newline-separated possible completion words
+# by appending a space to all of them.
+# It accepts 1 to 4 arguments:
+# 1: List of possible completion words, separated by a single newline.
+# 2: A prefix to be added to each possible completion word (optional).
+# 3: Generate possible completion matches for this word (optional).
+# 4: A suffix to be appended to each possible completion word instead of
+#    the default space (optional).  If specified but empty, nothing is
+#    appended.
+__gitcomp_nl ()
+{
+  local IFS=$'\n'  # makes __gitcompadd split its word list on newlines only
+  __gitcompadd "$1" "${2-}" "${3-$cur}" "${4- }"  # suffix defaults to one space when $4 is unset
+}
+
+# Generates completion reply with compgen from newline-separated possible
+# completion filenames.
+# It accepts 1 to 3 arguments:
+# 1: List of possible completion filenames, separated by a single newline.
+# 2: A directory prefix to be added to each possible completion filename
+#    (optional).
+# 3: Generate possible completion matches for this word (optional).
+__gitcomp_file ()
+{
+  local IFS=$'\n'  # file names are newline-separated and may contain spaces
+
+  # XXX does not work when the directory prefix contains a tilde,
+  # since tilde expansion is not applied.
+  # This means that COMPREPLY will be empty and Bash default
+  # completion will be used.
+  __gitcompadd "$1" "${2-}" "${3-$cur}" ""
+
+  # use a hack to enable file mode in bash < 4
+  compopt -o filenames +o nospace 2>/dev/null ||
+  compgen -f /non-existing-dir/ > /dev/null  # runs only when the compopt call above fails
+}
+
+# Execute 'git ls-files', unless the --committable option is specified, in
+# which case it runs 'git diff-index' to find out the files that can be
+# committed.  It returns paths relative to the directory specified in the first
+# argument, and using the options specified in the second argument.
+__git_ls_files_helper ()  # $1: directory to list from; $2: ls-files options, or --committable
+{
+  (  # subshell: the CDPATH and cd changes below never reach the caller
+    test -n "${CDPATH+set}" && unset CDPATH
+    cd "$1" || exit  # stop here rather than list the current directory when $1 cannot be entered
+    if [ "$2" == "--committable" ]; then
+      git diff-index --name-only --relative HEAD
+    else
+      # NOTE: $2 is not quoted in order to support multiple options
+      git ls-files --exclude-standard $2
+    fi
+  ) 2>/dev/null
+}
+
+
+# __git_index_files accepts 1 or 2 arguments:
+# 1: Options to pass to ls-files (required).
+# 2: A directory path (optional).
+#    If provided, only files within the specified directory are listed.
+#    Sub directories are never recursed.  Path must have a trailing
+#    slash.
+__git_index_files ()
+{
+  local dir="$(__gitdir)" root="${2-.}" file  # root defaults to the current directory
+
+  if [ -d "$dir" ]; then  # only when a git directory was found
+    __git_ls_files_helper "$root" "$1" |
+    while read -r file; do
+      case "$file" in
+      ?*/*) echo "${file%%/*}" ;;  # path with a directory part: print only its first component
+      *) echo "$file" ;;
+      esac
+    done | sort | uniq  # collapse the repeated first components
+  fi
+}
+
+__git_heads ()  # print the short names of all refs under refs/heads
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir" ]; then  # prints nothing when no git directory is found
+    git --git-dir="$dir" for-each-ref --format='%(refname:short)' \
+      refs/heads
+    return
+  fi
+}
+
+__git_tags ()  # print the short names of all refs under refs/tags
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir" ]; then  # prints nothing when no git directory is found
+    git --git-dir="$dir" for-each-ref --format='%(refname:short)' \
+      refs/tags
+    return
+  fi
+}
+
+# __git_refs accepts 0, 1 (to pass to __gitdir), or 2 arguments
+# presence of 2nd argument means use the guess heuristic employed
+# by checkout for tracking branches
+__git_refs ()
+{
+  local i hash dir="$(__gitdir "${1-}")" track="${2-}"
+  local format refs
+  if [ -d "$dir" ]; then  # $dir is a directory: read refs from it directly
+    case "$cur" in
+    refs|refs/*)
+      format="refname"  # user is typing a full ref name: list full names
+      refs="${cur%/*}"
+      track=""  # the tracking-branch guess is disabled for full ref names
+      ;;
+    *)
+      for i in HEAD FETCH_HEAD ORIG_HEAD MERGE_HEAD; do
+        if [ -e "$dir/$i" ]; then echo $i; fi  # offer each pseudo-ref that exists
+      done
+      format="refname:short"
+      refs="refs/tags refs/heads refs/remotes"
+      ;;
+    esac
+    git --git-dir="$dir" for-each-ref --format="%($format)" \
+      $refs
+    if [ -n "$track" ]; then
+      # employ the heuristic used by git checkout
+      # Try to find a remote branch that matches the completion word
+      # but only output if the branch name is unique
+      local ref entry
+      git --git-dir="$dir" for-each-ref --shell --format="ref=%(refname:short)" \
+        "refs/remotes/" | \
+      while read -r entry; do
+        eval "$entry"  # entry is a shell-quoted "ref=..." assignment produced by --shell
+        ref="${ref#*/}"  # strip the leading "<remote>/" component
+        if [[ "$ref" == "$cur"* ]]; then
+          echo "$ref"
+        fi
+      done | sort | uniq -u  # uniq -u keeps only names that occur exactly once
+    fi
+    return
+  fi
+  case "$cur" in  # reached only when $dir is not a directory; it is then passed to git as a remote
+  refs|refs/*)
+    git ls-remote "$dir" "$cur*" 2>/dev/null | \
+    while read -r hash i; do
+      case "$i" in
+      *^{}) ;;  # skip the peeled "^{}" entries
+      *) echo "$i" ;;
+      esac
+    done
+    ;;
+  *)
+    echo "HEAD"
+    git for-each-ref --format="%(refname:short)" -- "refs/remotes/$dir/" | sed -e "s#^$dir/##"  # strip the "<remote>/" prefix
+    ;;
+  esac
+}
+
+# __git_refs2 requires 1 argument (to pass to __git_refs)
+__git_refs2 ()  # print every ref reported by __git_refs "$1" as a "<ref>:<ref>" pair
+{
+  local i
+  for i in $(__git_refs "$1"); do
+    echo "$i:$i"  # same name on both sides of the colon
+  done
+}
+
+# __git_refs_remotes requires 1 argument (to pass to ls-remote)
+__git_refs_remotes ()  # print "<ref>:refs/remotes/$1/<branch>" for each head that ls-remote reports for $1
+{
+  local i hash
+  git ls-remote "$1" 'refs/heads/*' 2>/dev/null | \
+  while read -r hash i; do  # each line is "<hash> <refname>"; only the refname is used
+    echo "$i:refs/remotes/$1/${i#refs/heads/}"
+  done
+}
+
+__git_remotes ()  # print remote names: entries of <gitdir>/remotes plus every remote.<name>.url key
+{
+  local i IFS=$'\n' d="$(__gitdir)"
+  test -d "$d/remotes" && command ls -1 "$d/remotes"  # "command" keeps an interactive alias for ls out of this call
+  for i in $(git --git-dir="$d" config --get-regexp 'remote\..*\.url' 2>/dev/null); do
+    i="${i#remote.}"  # drop the leading "remote."
+    echo "${i/.url*/}"  # drop ".url" and the value that follows it
+  done
+}
+
+__git_list_merge_strategies ()  # extract the strategy names from the output of "git merge -s help"
+{
+  git merge -s help 2>&1 |
+  sed -n -e '/[Aa]vailable strategies are: /,/^$/{
+    s/\.$//
+    s/.*://
+    s/^[  ]*//
+    s/[   ]*$//
+    p
+  }'
+}
+
+__git_merge_strategies=
+# 'git merge -s help' (and thus detection of the merge strategy
+# list) fails, unfortunately, if run outside of any git working
+# tree.  __git_merge_strategies is set to the empty string in
+# that case, and the detection will be repeated the next time it
+# is needed.
+__git_compute_merge_strategies ()  # fill the __git_merge_strategies cache if it is still empty
+{
+  test -n "$__git_merge_strategies" ||
+  __git_merge_strategies=$(__git_list_merge_strategies)  # runs only while the cache is empty
+}
+
+__git_complete_revlist_file ()  # complete revisions, rev ranges, or <rev>:<path> expressions from $cur
+{
+  local pfx ls ref cur_="$cur"
+  case "$cur_" in
+  *..?*:*)  # a range followed by ':' is not completed
+    return
+    ;;
+  ?*:*)  # <ref>:<path> form: complete the path from the tree of <ref>
+    ref="${cur_%%:*}"
+    cur_="${cur_#*:}"
+    case "$cur_" in
+    ?*/*)
+      pfx="${cur_%/*}"
+      cur_="${cur_##*/}"
+      ls="$ref:$pfx"  # list the sub-tree named by the directory part
+      pfx="$pfx/"
+      ;;
+    *)
+      ls="$ref"
+      ;;
+    esac
+
+    case "$COMP_WORDBREAKS" in
+    *:*) : great ;;
+    *)   pfx="$ref:$pfx" ;;  # ':' is not a word break, so the reply must repeat "<ref>:"
+    esac
+
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" ls-tree "$ls" 2>/dev/null \
+        | sed '/^100... blob /{
+                   s,^.*  ,,
+                   s,$, ,
+               }
+               /^120000 blob /{
+                   s,^.*  ,,
+                   s,$, ,
+               }
+               /^040000 tree /{
+                   s,^.*  ,,
+                   s,$,/,
+               }
+               s/^.*  //')" \
+      "$pfx" "$cur_" ""
+    ;;
+  *...*)  # three-dot range: complete the part after "..."
+    pfx="${cur_%...*}..."
+    cur_="${cur_#*...}"
+    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    ;;
+  *..*)  # two-dot range: complete the part after ".."
+    pfx="${cur_%..*}.."
+    cur_="${cur_#*..}"
+    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"  # plain word: complete as a ref name
+    ;;
+  esac
+}
+
+
+# __git_complete_index_file requires 1 argument:
+# 1: the options to pass to ls-files
+#
+# The exception is --committable, which finds the files appropriate for commit.
+__git_complete_index_file ()
+{
+  local pfx="" cur_="$cur"
+
+  case "$cur_" in
+  ?*/*)  # word has a directory part: split it into "dir/" and the last component
+    pfx="${cur_%/*}"
+    cur_="${cur_##*/}"
+    pfx="${pfx}/"
+    ;;
+  esac
+
+  __gitcomp_file "$(__git_index_files "$1" "$pfx")" "$pfx" "$cur_"  # list within pfx, match on the last component
+}
+
+__git_complete_file ()  # thin wrapper: delegates to __git_complete_revlist_file
+{
+  __git_complete_revlist_file
+}
+
+__git_complete_revlist ()  # thin wrapper: delegates to __git_complete_revlist_file
+{
+  __git_complete_revlist_file
+}
+
+__git_complete_remote_or_refspec ()  # complete a remote name first, then refspecs for fetch/pull/push/remote
+{
+  local cur_="$cur" cmd="${words[1]}"
+  local i c=2 remote="" pfx="" lhs=1 no_complete_refspec=0
+  if [ "$cmd" = "remote" ]; then
+    ((c++))  # start scanning one word later for "remote"
+  fi
+  while [ $c -lt $cword ]; do  # scan the words before the cursor for the remote name
+    i="${words[c]}"
+    case "$i" in
+    --mirror) [ "$cmd" = "push" ] && no_complete_refspec=1 ;;
+    --all)
+      case "$cmd" in
+      push) no_complete_refspec=1 ;;
+      fetch)
+        return
+        ;;
+      *) ;;
+      esac
+      ;;
+    -*) ;;  # any other option is skipped
+    *) remote="$i"; break ;;  # first non-option word is taken as the remote
+    esac
+    ((c++))
+  done
+  if [ -z "$remote" ]; then
+    __gitcomp_nl "$(__git_remotes)"  # no remote typed yet: offer remote names
+    return
+  fi
+  if [ $no_complete_refspec = 1 ]; then
+    return
+  fi
+  [ "$remote" = "." ] && remote=  # "." is replaced by the empty string before the ref lookups below
+  case "$cur_" in
+  *:*)
+    case "$COMP_WORDBREAKS" in
+    *:*) : great ;;
+    *)   pfx="${cur_%%:*}:" ;;
+    esac
+    cur_="${cur_#*:}"
+    lhs=0  # completing the part after ':' of a refspec
+    ;;
+  +*)
+    pfx="+"  # keep the leading '+' in the reply
+    cur_="${cur_#+}"
+    ;;
+  esac
+  case "$cmd" in
+  fetch)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs2 "$remote")" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    fi
+    ;;
+  pull|remote)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    fi
+    ;;
+  push)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"  # left side of a push refspec: local refs
+    else
+      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"  # right side: refs of the remote
+    fi
+    ;;
+  esac
+}
+
+__git_complete_strategy ()  # returns 0 after completing a merge strategy name, 1 when not applicable
+{
+  __git_compute_merge_strategies
+  case "$prev" in
+  -s|--strategy)  # the strategy is given as a separate word
+    __gitcomp "$__git_merge_strategies"
+    return 0
+  esac
+  case "$cur" in
+  --strategy=*)  # the strategy is attached with '='
+    __gitcomp "$__git_merge_strategies" "" "${cur##--strategy=}"
+    return 0
+    ;;
+  esac
+  return 1
+}
+
+__git_commands () {  # print the raw command list; GIT_TESTING_COMMAND_COMPLETION overrides it when non-empty
+  if test -n "${GIT_TESTING_COMMAND_COMPLETION:-}"
+  then
+    printf "%s" "${GIT_TESTING_COMMAND_COMPLETION}"
+  else
+    git help -a|grep -E '^  [a-zA-Z0-9]'  # grep -E instead of the obsolescent egrep; keeps the two-space-indented lines
+  fi
+}
+
+__git_list_all_commands ()  # print each command from __git_commands on its own line, minus names containing "--"
+{
+  local i IFS=" "$'\n'  # split the command list on spaces and newlines
+  for i in $(__git_commands)
+  do
+    case $i in
+    *--*)             : helper pattern;;
+    *) echo $i;;
+    esac
+  done
+}
+
+__git_all_commands=
+__git_compute_all_commands ()  # fill the __git_all_commands cache if it is still empty
+{
+  test -n "$__git_all_commands" ||
+  __git_all_commands=$(__git_list_all_commands)  # runs only while the cache is empty
+}
+
+__git_list_porcelain_commands ()  # print every known command that is not matched by the exclusion list below
+{
+  local i IFS=" "$'\n'
+  __git_compute_all_commands
+  for i in $__git_all_commands
+  do
+    case $i in  # ':' is a no-op; the words after it only label why the command is left out
+    *--*)             : helper pattern;;
+    applymbox)        : ask gittus;;
+    applypatch)       : ask gittus;;
+    archimport)       : import;;
+    cat-file)         : plumbing;;
+    check-attr)       : plumbing;;
+    check-ignore)     : plumbing;;
+    check-mailmap)    : plumbing;;
+    check-ref-format) : plumbing;;
+    checkout-index)   : plumbing;;
+    commit-tree)      : plumbing;;
+    count-objects)    : infrequent;;
+    credential-cache) : credentials helper;;
+    credential-store) : credentials helper;;
+    cvsexportcommit)  : export;;
+    cvsimport)        : import;;
+    cvsserver)        : daemon;;
+    daemon)           : daemon;;
+    diff-files)       : plumbing;;
+    diff-index)       : plumbing;;
+    diff-tree)        : plumbing;;
+    fast-import)      : import;;
+    fast-export)      : export;;
+    fsck-objects)     : plumbing;;
+    fetch-pack)       : plumbing;;
+    fmt-merge-msg)    : plumbing;;
+    for-each-ref)     : plumbing;;
+    hash-object)      : plumbing;;
+    http-*)           : transport;;
+    index-pack)       : plumbing;;
+    init-db)          : deprecated;;
+    local-fetch)      : plumbing;;
+    lost-found)       : infrequent;;
+    ls-files)         : plumbing;;
+    ls-remote)        : plumbing;;
+    ls-tree)          : plumbing;;
+    mailinfo)         : plumbing;;
+    mailsplit)        : plumbing;;
+    merge-*)          : plumbing;;
+    mktree)           : plumbing;;
+    mktag)            : plumbing;;
+    pack-objects)     : plumbing;;
+    pack-redundant)   : plumbing;;
+    pack-refs)        : plumbing;;
+    parse-remote)     : plumbing;;
+    patch-id)         : plumbing;;
+    peek-remote)      : plumbing;;
+    prune)            : plumbing;;
+    prune-packed)     : plumbing;;
+    quiltimport)      : import;;
+    read-tree)        : plumbing;;
+    receive-pack)     : plumbing;;
+    remote-*)         : transport;;
+    repo-config)      : deprecated;;
+    rerere)           : plumbing;;
+    rev-list)         : plumbing;;
+    rev-parse)        : plumbing;;
+    runstatus)        : plumbing;;
+    sh-setup)         : internal;;
+    shell)            : daemon;;
+    show-ref)         : plumbing;;
+    send-pack)        : plumbing;;
+    show-index)       : plumbing;;
+    ssh-*)            : transport;;
+    stripspace)       : plumbing;;
+    symbolic-ref)     : plumbing;;
+    tar-tree)         : deprecated;;
+    unpack-file)      : plumbing;;
+    unpack-objects)   : plumbing;;
+    update-index)     : plumbing;;
+    update-ref)       : plumbing;;
+    update-server-info) : daemon;;
+    upload-archive)   : plumbing;;
+    upload-pack)      : plumbing;;
+    write-tree)       : plumbing;;
+    var)              : infrequent;;
+    verify-pack)      : infrequent;;
+    verify-tag)       : plumbing;;
+    *) echo $i;;  # everything not excluded above is printed
+    esac
+  done
+}
+
+__git_porcelain_commands=
+__git_compute_porcelain_commands ()  # fill the __git_porcelain_commands cache if it is still empty
+{
+  __git_compute_all_commands
+  test -n "$__git_porcelain_commands" ||
+  __git_porcelain_commands=$(__git_list_porcelain_commands)  # runs only while the cache is empty
+}
+
+__git_pretty_aliases ()  # print the names of all "pretty.<name>" config keys
+{
+  local i IFS=$'\n'  # one "key value" line per loop iteration
+  for i in $(git --git-dir="$(__gitdir)" config --get-regexp "pretty\..*" 2>/dev/null); do
+    case "$i" in
+    pretty.*)
+      i="${i#pretty.}"  # drop the "pretty." prefix
+      echo "${i/ */}"  # drop everything from the first space on (the value)
+      ;;
+    esac
+  done
+}
+
+__git_aliases ()  # print the names of all "alias.<name>" config keys
+{
+  local i IFS=$'\n'  # one "key value" line per loop iteration
+  for i in $(git --git-dir="$(__gitdir)" config --get-regexp "alias\..*" 2>/dev/null); do
+    case "$i" in
+    alias.*)
+      i="${i#alias.}"  # drop the "alias." prefix
+      echo "${i/ */}"  # drop everything from the first space on (the value)
+      ;;
+    esac
+  done
+}
+
+# __git_aliased_command requires 1 argument
+__git_aliased_command ()  # print the first word of alias $1 that looks like a git command name
+{
+  local word cmdline=$(git --git-dir="$(__gitdir)" \
+    config --get "alias.$1")
+  for word in $cmdline; do  # skip words until one is not a shell escape, option, assignment or "git"
+    case "$word" in
+    \!gitk|gitk)
+      echo "gitk"
+      return
+      ;;
+    \!*)  : shell command alias ;;
+    -*) : option ;;
+    *=*)  : setting env ;;
+    git)  : git itself ;;
+    *)
+      echo "$word"
+      return
+    esac
+  done
+}
+
+# __git_find_on_cmdline requires 1 argument
+__git_find_on_cmdline ()
+{
+  # Print the first word located before the cursor that equals one of the
+  # candidates listed in $1; print nothing when none of them is present.
+  local candidate current pos
+  for ((pos = 1; pos < cword; pos++)); do
+    current="${words[pos]}"
+    for candidate in $1; do
+      [ "$candidate" = "$current" ] || continue
+      echo "$candidate"
+      return
+    done
+  done
+}
+
+__git_has_doubledash ()
+{
+  # Succeed when a bare "--" appears among the words before the cursor.
+  local idx
+  for ((idx = 1; idx < cword; idx++)); do
+    if [[ ${words[idx]} == "--" ]]; then
+      return 0
+    fi
+  done
+  return 1
+}
+
+# Try to count non option arguments passed on the command line for the
+# specified git command.
+# When options are used, it is necessary to use the special -- option to
+# tell the implementation where non option arguments begin.
+# XXX this can not be improved, since options can appear everywhere, as
+# an example:
+# git mv x -n y
+#
+# __git_count_arguments requires 1 argument: the git command executed.
+__git_count_arguments ()  # print the count of non-empty words after the last "--" or the last occurrence of $1
+{
+  local word i c=0
+
+  # Skip "git" (first argument)
+  for ((i=1; i < ${#words[@]}; i++)); do
+    word="${words[i]}"
+
+    case "$word" in
+      --)
+        # Good; we can assume that the following are only non
+        # option arguments.
+        ((c = 0))
+        ;;
+      "$1")
+        # Skip the specified git command and discard git
+        # main options
+        ((c = 0))
+        ;;
+      ?*)
+        ((c++))  # any other non-empty word is counted, options included
+        ;;
+    esac
+  done
+
+  printf "%d" $c
+}
+
+__git_whitespacelist="nowarn warn error error-all fix"  # values offered after --whitespace=
+
+_git_am ()
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir"/rebase-apply ]; then  # a rebase-apply directory exists: offer only these four options
+    __gitcomp "--skip --continue --resolved --abort"
+    return
+  fi
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"  # match on the text after '='
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --3way --committer-date-is-author-date --ignore-date
+      --ignore-whitespace --ignore-space-change
+      --interactive --keep --no-utf8 --signoff --utf8
+      --whitespace= --scissors
+      "
+    return
+  esac
+}
+
+_git_apply ()  # offers --whitespace= values or the long option names listed below
+{
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"  # match on the text after '='
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --stat --numstat --summary --check --index
+      --cached --index-info --reverse --reject --unidiff-zero
+      --apply --no-add --exclude=
+      --ignore-whitespace --ignore-space-change
+      --whitespace= --inaccurate-eof --verbose
+      "
+    return
+  esac
+}
+
+_git_add ()  # offers long options for a "--" word, otherwise file names
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --interactive --refresh --patch --update --dry-run
+      --ignore-errors --intent-to-add
+      "
+    return
+  esac
+
+  # XXX should we check for --update and --all options ?
+  __git_complete_index_file "--others --modified"  # these options are handed to git ls-files
+}
+
+_git_archive ()  # offers --format= / --remote= values, long options, or a revision/path
+{
+  case "$cur" in
+  --format=*)
+    __gitcomp "$(git archive --list)" "" "${cur##--format=}"  # formats come from git itself
+    return
+    ;;
+  --remote=*)
+    __gitcomp_nl "$(__git_remotes)" "" "${cur##--remote=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --format= --list --verbose
+      --prefix= --remote= --exec=
+      "
+    return
+    ;;
+  esac
+  __git_complete_file
+}
+
+_git_bisect ()  # offers bisect subcommands, then refs for the subcommands that take one
+{
+  __git_has_doubledash && return
+
+  local subcommands="start bad good skip reset visualize replay log run"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    if [ -f "$(__gitdir)"/BISECT_START ]; then  # BISECT_START exists: offer every subcommand
+      __gitcomp "$subcommands"
+    else
+      __gitcomp "replay start"  # otherwise only these two are offered
+    fi
+    return
+  fi
+
+  case "$subcommand" in
+  bad|good|reset|skip|start)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  *)
+    ;;
+  esac
+}
+
+_git_branch ()  # offers long options, --set-upstream-to= values, or branch/ref names
+{
+  local i c=1 only_local_ref="n" has_r="n"
+
+  while [ $c -lt $cword ]; do  # look for -d/-m and -r among the words before the cursor
+    i="${words[c]}"
+    case "$i" in
+    -d|-m)  only_local_ref="y" ;;
+    -r) has_r="y" ;;
+    esac
+    ((c++))
+  done
+
+  case "$cur" in
+  --set-upstream-to=*)
+    __gitcomp "$(__git_refs)" "" "${cur##--set-upstream-to=}"
+    ;;
+  --*)
+    __gitcomp "
+      --color --no-color --verbose --abbrev= --no-abbrev
+      --track --no-track --contains --merged --no-merged
+      --set-upstream-to= --edit-description --list
+      --unset-upstream
+      "
+    ;;
+  *)
+    if [ $only_local_ref = "y" -a $has_r = "n" ]; then  # -d or -m without -r: local heads only
+      __gitcomp_nl "$(__git_heads)"
+    else
+      __gitcomp_nl "$(__git_refs)"
+    fi
+    ;;
+  esac
+}
+
+_git_bundle ()  # completion depends on the position of the word being completed
+{
+  local cmd="${words[2]}"
+  case "$cword" in
+  2)  # word index 2: the bundle subcommand
+    __gitcomp "create list-heads verify unbundle"
+    ;;
+  3)
+    # looking for a file
+    ;;
+  *)
+    case "$cmd" in
+      create)
+        __git_complete_revlist  # only "create" gets revision completion for later words
+      ;;
+    esac
+    ;;
+  esac
+}
+
+_git_checkout ()  # offers --conflict= values, long options, or ref names
+{
+  __git_has_doubledash && return  # nothing is offered after a "--"
+
+  case "$cur" in
+  --conflict=*)
+    __gitcomp "diff3 merge" "" "${cur##--conflict=}"
+    ;;
+  --*)
+    __gitcomp "
+      --quiet --ours --theirs --track --no-track --merge
+      --conflict= --orphan --patch
+      "
+    ;;
+  *)
+    # check if --track, --no-track, or --no-guess was specified
+    # if so, disable DWIM mode
+    local flags="--track --no-track --no-guess" track=1
+    if [ -n "$(__git_find_on_cmdline "$flags")" ]; then
+      track=''
+    fi
+    __gitcomp_nl "$(__git_refs '' $track)"  # a non-empty second argument enables the tracking-branch guess
+    ;;
+  esac
+}
+
+_git_cherry ()  # offers ref names
+{
+  __gitcomp "$(__git_refs)"  # uses __gitcomp, so the ref list is split on spaces, tabs and newlines
+}
+
+_git_cherry_pick ()  # offers sequencer options while CHERRY_PICK_HEAD exists, else options or refs
+{
+  local dir="$(__gitdir)"
+  if [ -f "$dir"/CHERRY_PICK_HEAD ]; then  # CHERRY_PICK_HEAD exists: offer only these three options
+    __gitcomp "--continue --quit --abort"
+    return
+  fi
+  case "$cur" in
+  --*)
+    __gitcomp "--edit --no-commit --signoff --strategy= --mainline"
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+_git_clean ()  # offers long options for a "--" word, otherwise file names
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--dry-run --quiet"
+    return
+    ;;
+  esac
+
+  # XXX should we check for -x option ?
+  __git_complete_index_file "--others"  # this option is handed to git ls-files
+}
+
+_git_clone ()  # offers long options only; any other word gets no reply from this function
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --local
+      --no-hardlinks
+      --shared
+      --reference
+      --quiet
+      --no-checkout
+      --bare
+      --mirror
+      --origin
+      --upload-pack
+      --template=
+      --depth
+      --single-branch
+      --branch
+      "
+    return
+    ;;
+  esac
+}
+
+_git_commit ()  # offers refs after -c/-C, option values, long options, or file names
+{
+  case "$prev" in
+  -c|-C)
+    __gitcomp_nl "$(__git_refs)" "" "${cur}"  # the word after -c/-C is completed as a ref
+    return
+    ;;
+  esac
+
+  case "$cur" in
+  --cleanup=*)
+    __gitcomp "default strip verbatim whitespace
+      " "" "${cur##--cleanup=}"
+    return
+    ;;
+  --reuse-message=*|--reedit-message=*|\
+  --fixup=*|--squash=*)
+    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"  # the value after '=' is completed as a ref
+    return
+    ;;
+  --untracked-files=*)
+    __gitcomp "all no normal" "" "${cur##--untracked-files=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --all --author= --signoff --verify --no-verify
+      --edit --no-edit
+      --amend --include --only --interactive
+      --dry-run --reuse-message= --reedit-message=
+      --reset-author --file= --message= --template=
+      --cleanup= --untracked-files --untracked-files=
+      --verbose --quiet --fixup= --squash=
+      "
+    return
+  esac
+
+  if git rev-parse --verify --quiet HEAD >/dev/null; then  # HEAD resolves: a commit already exists
+    __git_complete_index_file "--committable"
+  else
+    # This is the first commit
+    __git_complete_index_file "--cached"
+  fi
+}
+
+_git_describe ()  # offers long options for a "--" word, otherwise ref names
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --all --tags --contains --abbrev= --candidates=
+      --exact-match --debug --long --match --always
+      "
+    return
+  esac
+  __gitcomp_nl "$(__git_refs)"  # reached only when the word does not start with "--"
+}
+
+__git_diff_algorithms="myers minimal patience histogram"  # values offered after --diff-algorithm=
+
+__git_diff_common_options="--stat --numstat --shortstat --summary
+      --patch-with-stat --name-only --name-status --color
+      --no-color --color-words --no-renames --check
+      --full-index --binary --abbrev --diff-filter=
+      --find-copies-harder
+      --text --ignore-space-at-eol --ignore-space-change
+      --ignore-all-space --exit-code --quiet --ext-diff
+      --no-ext-diff
+      --no-prefix --src-prefix= --dst-prefix=
+      --inter-hunk-context=
+      --patience --histogram --minimal
+      --raw --word-diff
+      --dirstat --dirstat= --dirstat-by-file
+      --dirstat-by-file= --cumulative
+      --diff-algorithm=
+"
+
+_git_diff ()  # offers --diff-algorithm= values, long options, or revisions/paths
+{
+  __git_has_doubledash && return  # nothing is offered after a "--"
+
+  case "$cur" in
+  --diff-algorithm=*)
+    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
+      --base --ours --theirs --no-index
+      $__git_diff_common_options
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+__git_mergetools_common="diffuse ecmerge emerge kdiff3 meld opendiff
+      tkdiff vimdiff gvimdiff xxdiff araxis p4merge bc3 codecompare
+"
+
+_git_difftool ()  # offers --tool= values, long options, or revisions/paths
+{
+  __git_has_doubledash && return  # nothing is offered after a "--"
+
+  case "$cur" in
+  --tool=*)
+    __gitcomp "$__git_mergetools_common kompare" "" "${cur##--tool=}"  # the common tool list plus kompare
+    return
+    ;;
+  --*)
+    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
+      --base --ours --theirs
+      --no-renames --diff-filter= --find-copies-harder
+      --relative --ignore-submodules
+      --tool="
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+__git_fetch_options="
+  --quiet --verbose --append --upload-pack --force --keep --depth=
+  --tags --no-tags --all --prune --dry-run
+"
+
+_git_fetch ()  # offers long options for a "--" word, otherwise a remote and then refspecs
+{
+  case "$cur" in
+  --*)
+    __gitcomp "$__git_fetch_options"
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec  # reached only when the word does not start with "--"
+}
+
+__git_format_patch_options="
+  --stdout --attach --no-attach --thread --thread= --no-thread
+  --numbered --start-number --numbered-files --keep-subject --signoff
+  --signature --no-signature --in-reply-to= --cc= --full-index --binary
+  --not --all --cover-letter --no-prefix --src-prefix= --dst-prefix=
+  --inline --suffix= --ignore-if-in-upstream --subject-prefix=
+  --output-directory --reroll-count --to= --quiet --notes
+"
+
+_git_format_patch ()  # offers --thread= values, long options, or revisions
+{
+  case "$cur" in
+  --thread=*)
+    __gitcomp "
+      deep shallow
+      " "" "${cur##--thread=}"
+    return
+    ;;
+  --*)
+    __gitcomp "$__git_format_patch_options"
+    return
+    ;;
+  esac
+  __git_complete_revlist  # reached only when the word does not start with "--"
+}
+
+_git_fsck ()  # offers long options only; any other word gets no reply from this function
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --tags --root --unreachable --cache --no-reflogs --full
+      --strict --verbose --lost-found
+      "
+    return
+    ;;
+  esac
+}
+
+_git_gc ()  # offers long options only; any other word gets no reply from this function
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--prune --aggressive"
+    return
+    ;;
+  esac
+}
+
+_git_gitk ()  # thin wrapper around _gitk, which is not defined in this part of the file
+{
+  _gitk
+}
+
+__git_match_ctag() {  # $1: symbol prefix, matched literally; $2: tags file; prints the first field of each matching line
+  awk -v pfx="${1//\\/\\\\}" 'substr($1, 1, length(pfx)) == pfx { print $1 }' "$2"  # backslashes doubled because awk -v processes escapes
+}
+
+_git_grep ()  # offers long options, symbols from a ./tags file, or ref names
+{
+  __git_has_doubledash && return  # nothing is offered after a "--"
+
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --cached
+      --text --ignore-case --word-regexp --invert-match
+      --full-name --line-number
+      --extended-regexp --basic-regexp --fixed-strings
+      --perl-regexp
+      --files-with-matches --name-only
+      --files-without-match
+      --max-depth
+      --count
+      --and --or --not --all-match
+      "
+    return
+    ;;
+  esac
+
+  case "$cword,$prev" in
+  2,*|*,-*)  # first argument, or the previous word was an option
+    if test -r tags; then  # a readable "tags" file in the current directory
+      __gitcomp_nl "$(__git_match_ctag "$cur" tags)"
+      return
+    fi
+    ;;
+  esac
+
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_help ()  # offers long options, or command names, aliases and the extra topics listed below
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--all --info --man --web"
+    return
+    ;;
+  esac
+  __git_compute_all_commands  # make sure __git_all_commands is filled before it is used below
+  __gitcomp "$__git_all_commands $(__git_aliases)
+    attributes cli core-tutorial cvs-migration
+    diffcore gitk glossary hooks ignore modules
+    namespaces repository-layout tutorial tutorial-2
+    workflows
+    "
+}
+
+# Completion for "git init": values of --shared= and the long options.
+_git_init ()
+{
+  if [[ "$cur" == --shared=* ]]; then
+    # Match the candidate values against the text after "--shared=".
+    __gitcomp "
+      false true umask group all world everybody
+      " "" "${cur##--shared=}"
+    return
+  elif [[ "$cur" == --* ]]; then
+    __gitcomp "--quiet --bare --template= --shared --shared="
+    return
+  fi
+}
+
+# Completion for "git ls-files": long options, otherwise paths known to the
+# index.
+_git_ls_files ()
+{
+  case "$cur" in
+  --*)
+    # Each option is listed exactly once.  --exclude-per-directory requires
+    # a value, so only its "=" form is offered.
+    __gitcomp "--cached --deleted --modified --others --ignored
+      --stage --directory --no-empty-directory --unmerged
+      --killed --exclude= --exclude-from=
+      --exclude-per-directory= --exclude-standard
+      --error-unmatch --with-tree= --full-name
+      --abbrev
+      "
+    return
+    ;;
+  esac
+
+  # XXX ignore options like --modified and always suggest all cached
+  # files.
+  __git_complete_index_file "--cached"
+}
+
+# Completion for "git ls-remote": offers the output of __git_remotes, one
+# candidate per line.
+_git_ls_remote ()
+{
+  __gitcomp_nl "$(__git_remotes)"
+}
+
+# Completion for "git ls-tree": delegates entirely to __git_complete_file.
+_git_ls_tree ()
+{
+  __git_complete_file
+}
+
+# Options that go well for log, shortlog and gitk
+__git_log_common_options="
+  --not --all
+  --branches --tags --remotes
+  --first-parent --merges --no-merges
+  --max-count=
+  --max-age= --since= --after=
+  --min-age= --until= --before=
+  --min-parents= --max-parents=
+  --no-min-parents --no-max-parents
+"
+# Options that go well for log and gitk (not shortlog)
+__git_log_gitk_options="
+  --dense --sparse --full-history
+  --simplify-merges --simplify-by-decoration
+  --left-right --notes --no-notes
+"
+# Options that go well for log and shortlog (not gitk)
+__git_log_shortlog_options="
+  --author= --committer= --grep=
+  --all-match
+"
+
+# Candidate values offered after --pretty= and --format=.
+__git_log_pretty_formats="oneline short medium full fuller email raw format:"
+# Candidate values offered after --date= and for the log.date config key.
+__git_log_date_formats="relative iso8601 rfc2822 short local default raw"
+
+# Completion for "git log": values of --pretty=/--format=, --date= and
+# --decorate=, the long options, and otherwise whatever
+# __git_complete_revlist offers.
+_git_log ()
+{
+  __git_has_doubledash && return
+
+  # Locate the repository through __gitdir, as __gitk_main and the other
+  # completions in this file do, rather than invoking "git rev-parse"
+  # directly.  NOTE(review): __gitdir is defined outside this chunk; it
+  # presumably honours the __git_dir value that __git_main records from a
+  # --git-dir argument -- confirm.
+  local g="$(__gitdir)"
+  local merge=""
+  # --merge is only offered while MERGE_HEAD exists in the git directory.
+  if [ -f "$g/MERGE_HEAD" ]; then
+    merge="--merge"
+  fi
+  case "$cur" in
+  --pretty=*|--format=*)
+    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
+      " "" "${cur#*=}"
+    return
+    ;;
+  --date=*)
+    __gitcomp "$__git_log_date_formats" "" "${cur##--date=}"
+    return
+    ;;
+  --decorate=*)
+    __gitcomp "long short" "" "${cur##--decorate=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_shortlog_options
+      $__git_log_gitk_options
+      --root --topo-order --date-order --reverse
+      --follow --full-diff
+      --abbrev-commit --abbrev=
+      --relative-date --date=
+      --pretty= --format= --oneline
+      --cherry-pick
+      --graph
+      --decorate --decorate=
+      --walk-reflogs
+      --parents --children
+      $merge
+      $__git_diff_common_options
+      --pickaxe-all --pickaxe-regex
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+__git_merge_options="
+  --no-commit --no-stat --log --no-log --squash --strategy
+  --commit --stat --no-squash --ff --no-ff --ff-only --edit --no-edit
+"
+
+# Completion for "git merge": strategy handling first, then the merge
+# options for a long-option word, otherwise ref names.
+_git_merge ()
+{
+  __git_complete_strategy && return
+
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "$__git_merge_options"
+  else
+    __gitcomp_nl "$(__git_refs)"
+  fi
+}
+
+# Completion for "git mergetool": tool names after --tool=, or the --tool=
+# option itself.
+_git_mergetool ()
+{
+  if [[ "$cur" == --tool=* ]]; then
+    # Match tool names against the text after "--tool=".
+    __gitcomp "$__git_mergetools_common tortoisemerge" "" "${cur##--tool=}"
+  elif [[ "$cur" == --* ]]; then
+    __gitcomp "--tool="
+  fi
+}
+
+# Completion for "git merge-base": offers the output of __git_refs, one
+# candidate per line.
+_git_merge_base ()
+{
+  __gitcomp_nl "$(__git_refs)"
+}
+
+# Completion for "git mv": --dry-run for a long-option word, otherwise
+# paths selected through __git_complete_index_file.
+_git_mv ()
+{
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--dry-run"
+    return
+  fi
+
+  # Default: only cached files.  Once __git_count_arguments reports at
+  # least one argument, untracked files and (possibly empty) directories
+  # are shown as well, since this may not be the last argument.
+  local index_opts="--cached"
+  if [ $(__git_count_arguments "mv") -gt 0 ]; then
+    index_opts="--cached --others --directory"
+  fi
+  __git_complete_index_file "$index_opts"
+}
+
+# Completion for "git name-rev": always offers its three long options.
+_git_name_rev ()
+{
+  __gitcomp "--tags --all --stdin"
+}
+
+# Completion for "git notes".  Dispatches on "<subcommand>,<current word>",
+# where <subcommand> is empty until one of $subcommands appears on the
+# command line.  The order of the patterns matters: earlier ones win.
+_git_notes ()
+{
+  local subcommands='add append copy edit list prune remove show'
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+
+  case "$subcommand,$cur" in
+  ,--*)
+    # No subcommand yet, completing a long option.
+    __gitcomp '--ref'
+    ;;
+  ,*)
+    # No subcommand yet: refs after --ref, otherwise the subcommand names.
+    case "$prev" in
+    --ref)
+      __gitcomp_nl "$(__git_refs)"
+      ;;
+    *)
+      __gitcomp "$subcommands --ref"
+      ;;
+    esac
+    ;;
+  add,--reuse-message=*|append,--reuse-message=*|\
+  add,--reedit-message=*|append,--reedit-message=*)
+    # Refs are matched against the text after the "=".
+    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
+    ;;
+  add,--*|append,--*)
+    __gitcomp '--file= --message= --reedit-message=
+        --reuse-message='
+    ;;
+  copy,--*)
+    __gitcomp '--stdin'
+    ;;
+  prune,--*)
+    __gitcomp '--dry-run --verbose'
+    ;;
+  prune,*)
+    # Nothing is offered for non-option arguments of prune.
+    ;;
+  *)
+    case "$prev" in
+    -m|-F)
+      # Nothing is offered for the argument of -m or -F.
+      ;;
+    *)
+      __gitcomp_nl "$(__git_refs)"
+      ;;
+    esac
+    ;;
+  esac
+}
+
+# Completion for "git pull": strategy handling first, then the combined
+# rebase/merge/fetch options for a long-option word, otherwise whatever
+# __git_complete_remote_or_refspec offers.
+_git_pull ()
+{
+  __git_complete_strategy && return
+
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      --rebase --no-rebase
+      $__git_merge_options
+      $__git_fetch_options
+    "
+  else
+    __git_complete_remote_or_refspec
+  fi
+}
+
+_git_push ()
+{
+  case "$prev" in
+  --repo)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+  esac
+  case "$cur" in
+  --repo=*)
+    __gitcomp_nl "$(__git_remotes)" "" "${cur##--repo=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --all --mirror --tags --dry-run --force --verbose
+      --receive-pack= --repo= --set-upstream
+    "
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+# Completion for "git rebase".
+_git_rebase ()
+{
+  local dir="$(__gitdir)"
+  # When a rebase-apply or rebase-merge directory exists in the git
+  # directory, only the three continuation options are offered.
+  if [[ -d "$dir/rebase-apply" || -d "$dir/rebase-merge" ]]; then
+    __gitcomp "--continue --skip --abort"
+    return
+  fi
+  __git_complete_strategy && return
+  if [[ "$cur" == --whitespace=* ]]; then
+    # Match the whitespace modes against the text after the "=".
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+  elif [[ "$cur" == --* ]]; then
+    __gitcomp "
+      --onto --merge --strategy --interactive
+      --preserve-merges --stat --no-stat
+      --committer-date-is-author-date --ignore-date
+      --ignore-whitespace --whitespace=
+      --autosquash
+      "
+  else
+    __gitcomp_nl "$(__git_refs)"
+  fi
+}
+
+# Completion for "git reflog": the subcommand names until one is present on
+# the command line, ref names afterwards.
+_git_reflog ()
+{
+  local subcommands="show delete expire"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+
+  if [ -n "$subcommand" ]; then
+    __gitcomp_nl "$(__git_refs)"
+  else
+    __gitcomp "$subcommands"
+  fi
+}
+
+__git_send_email_confirm_options="always never auto cc compose"
+__git_send_email_suppresscc_options="author self cc bodycc sob cccmd body all"
+
+_git_send_email ()
+{
+  case "$cur" in
+  --confirm=*)
+    __gitcomp "
+      $__git_send_email_confirm_options
+      " "" "${cur##--confirm=}"
+    return
+    ;;
+  --suppress-cc=*)
+    __gitcomp "
+      $__git_send_email_suppresscc_options
+      " "" "${cur##--suppress-cc=}"
+
+    return
+    ;;
+  --smtp-encryption=*)
+    __gitcomp "ssl tls" "" "${cur##--smtp-encryption=}"
+    return
+    ;;
+  --thread=*)
+    __gitcomp "
+      deep shallow
+      " "" "${cur##--thread=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--annotate --bcc --cc --cc-cmd --chain-reply-to
+      --compose --confirm= --dry-run --envelope-sender
+      --from --identity
+      --in-reply-to --no-chain-reply-to --no-signed-off-by-cc
+      --no-suppress-from --no-thread --quiet
+      --signed-off-by-cc --smtp-pass --smtp-server
+      --smtp-server-port --smtp-encryption= --smtp-user
+      --subject --suppress-cc= --suppress-from --thread --to
+      --validate --no-validate
+      $__git_format_patch_options"
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+# Completion for "git stage": identical to the "git add" completion.
+_git_stage ()
+{
+  _git_add
+}
+
+# Print, one per line, the names of the variables listed by
+# "git config --list" for the config file selected on the command line.
+__git_config_get_set_variables ()
+{
+  local prevword word config_file= c=$cword
+  # Scan the command line backwards from the current word, stopping at the
+  # first option that selects a config file.
+  while [ $c -gt 1 ]; do
+    word="${words[c]}"
+    case "$word" in
+    --system|--global|--local|--file=*)
+      config_file="$word"
+      break
+      ;;
+    -f|--file)
+      # The file name is the word after the option; because the scan runs
+      # backwards, that word is the one visited just before (prevword).
+      config_file="$word $prevword"
+      break
+      ;;
+    esac
+    prevword=$word
+    c=$((--c))
+  done
+
+  # $config_file is deliberately unquoted so that "-f <file>" splits into
+  # two arguments.
+  git --git-dir="$(__gitdir)" config $config_file --list 2>/dev/null |
+  while read -r line
+  do
+    case "$line" in
+    *.*=*)
+      # Keep only the text before the first "=".
+      echo "${line/=*/}"
+      ;;
+    esac
+  done
+}
+
+_git_config ()
+{
+  case "$prev" in
+  branch.*.remote|branch.*.pushremote)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+    ;;
+  branch.*.merge)
+    __gitcomp_nl "$(__git_refs)"
+    return
+    ;;
+  branch.*.rebase)
+    __gitcomp "false true"
+    return
+    ;;
+  remote.pushdefault)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+    ;;
+  remote.*.fetch)
+    local remote="${prev#remote.}"
+    remote="${remote%.fetch}"
+    if [ -z "$cur" ]; then
+      __gitcomp_nl "refs/heads/" "" "" ""
+      return
+    fi
+    __gitcomp_nl "$(__git_refs_remotes "$remote")"
+    return
+    ;;
+  remote.*.push)
+    local remote="${prev#remote.}"
+    remote="${remote%.push}"
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" \
+      for-each-ref --format='%(refname):%(refname)' \
+      refs/heads)"
+    return
+    ;;
+  pull.twohead|pull.octopus)
+    __git_compute_merge_strategies
+    __gitcomp "$__git_merge_strategies"
+    return
+    ;;
+  color.branch|color.diff|color.interactive|\
+  color.showbranch|color.status|color.ui)
+    __gitcomp "always never auto"
+    return
+    ;;
+  color.pager)
+    __gitcomp "false true"
+    return
+    ;;
+  color.*.*)
+    __gitcomp "
+      normal black red green yellow blue magenta cyan white
+      bold dim ul blink reverse
+      "
+    return
+    ;;
+  diff.submodule)
+    __gitcomp "log short"
+    return
+    ;;
+  help.format)
+    __gitcomp "man info web html"
+    return
+    ;;
+  log.date)
+    __gitcomp "$__git_log_date_formats"
+    return
+    ;;
+  sendemail.aliasesfiletype)
+    __gitcomp "mutt mailrc pine elm gnus"
+    return
+    ;;
+  sendemail.confirm)
+    __gitcomp "$__git_send_email_confirm_options"
+    return
+    ;;
+  sendemail.suppresscc)
+    __gitcomp "$__git_send_email_suppresscc_options"
+    return
+    ;;
+  --get|--get-all|--unset|--unset-all)
+    __gitcomp_nl "$(__git_config_get_set_variables)"
+    return
+    ;;
+  *.*)
+    return
+    ;;
+  esac
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --system --global --local --file=
+      --list --replace-all
+      --get --get-all --get-regexp
+      --add --unset --unset-all
+      --remove-section --rename-section
+      "
+    return
+    ;;
+  branch.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "remote pushremote merge mergeoptions rebase" "$pfx" "$cur_"
+    return
+    ;;
+  branch.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __gitcomp_nl "$(__git_heads)" "$pfx" "$cur_" "."
+    return
+    ;;
+  guitool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "
+      argprompt cmd confirm needsfile noconsole norescan
+      prompt revprompt revunmerged title
+      " "$pfx" "$cur_"
+    return
+    ;;
+  difftool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path" "$pfx" "$cur_"
+    return
+    ;;
+  man.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path" "$pfx" "$cur_"
+    return
+    ;;
+  mergetool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path trustExitCode" "$pfx" "$cur_"
+    return
+    ;;
+  pager.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __git_compute_all_commands
+    __gitcomp_nl "$__git_all_commands" "$pfx" "$cur_"
+    return
+    ;;
+  remote.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "
+      url proxy fetch push mirror skipDefaultUpdate
+      receivepack uploadpack tagopt pushurl
+      " "$pfx" "$cur_"
+    return
+    ;;
+  remote.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __gitcomp_nl "$(__git_remotes)" "$pfx" "$cur_" "."
+    return
+    ;;
+  url.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "insteadOf pushInsteadOf" "$pfx" "$cur_"
+    return
+    ;;
+  esac
+  __gitcomp "
+    add.ignoreErrors
+    advice.commitBeforeMerge
+    advice.detachedHead
+    advice.implicitIdentity
+    advice.pushNonFastForward
+    advice.resolveConflict
+    advice.statusHints
+    alias.
+    am.keepcr
+    apply.ignorewhitespace
+    apply.whitespace
+    branch.autosetupmerge
+    branch.autosetuprebase
+    browser.
+    clean.requireForce
+    color.branch
+    color.branch.current
+    color.branch.local
+    color.branch.plain
+    color.branch.remote
+    color.decorate.HEAD
+    color.decorate.branch
+    color.decorate.remoteBranch
+    color.decorate.stash
+    color.decorate.tag
+    color.diff
+    color.diff.commit
+    color.diff.frag
+    color.diff.func
+    color.diff.meta
+    color.diff.new
+    color.diff.old
+    color.diff.plain
+    color.diff.whitespace
+    color.grep
+    color.grep.context
+    color.grep.filename
+    color.grep.function
+    color.grep.linenumber
+    color.grep.match
+    color.grep.selected
+    color.grep.separator
+    color.interactive
+    color.interactive.error
+    color.interactive.header
+    color.interactive.help
+    color.interactive.prompt
+    color.pager
+    color.showbranch
+    color.status
+    color.status.added
+    color.status.changed
+    color.status.header
+    color.status.nobranch
+    color.status.untracked
+    color.status.updated
+    color.ui
+    commit.status
+    commit.template
+    core.abbrev
+    core.askpass
+    core.attributesfile
+    core.autocrlf
+    core.bare
+    core.bigFileThreshold
+    core.compression
+    core.createObject
+    core.deltaBaseCacheLimit
+    core.editor
+    core.eol
+    core.excludesfile
+    core.fileMode
+    core.fsyncobjectfiles
+    core.gitProxy
+    core.ignoreStat
+    core.ignorecase
+    core.logAllRefUpdates
+    core.loosecompression
+    core.notesRef
+    core.packedGitLimit
+    core.packedGitWindowSize
+    core.pager
+    core.preferSymlinkRefs
+    core.preloadindex
+    core.quotepath
+    core.repositoryFormatVersion
+    core.safecrlf
+    core.sharedRepository
+    core.sparseCheckout
+    core.symlinks
+    core.trustctime
+    core.warnAmbiguousRefs
+    core.whitespace
+    core.worktree
+    diff.autorefreshindex
+    diff.external
+    diff.ignoreSubmodules
+    diff.mnemonicprefix
+    diff.noprefix
+    diff.renameLimit
+    diff.renames
+    diff.statGraphWidth
+    diff.submodule
+    diff.suppressBlankEmpty
+    diff.tool
+    diff.wordRegex
+    diff.algorithm
+    difftool.
+    difftool.prompt
+    fetch.recurseSubmodules
+    fetch.unpackLimit
+    format.attach
+    format.cc
+    format.headers
+    format.numbered
+    format.pretty
+    format.signature
+    format.signoff
+    format.subjectprefix
+    format.suffix
+    format.thread
+    format.to
+    gc.
+    gc.aggressiveWindow
+    gc.auto
+    gc.autopacklimit
+    gc.packrefs
+    gc.pruneexpire
+    gc.reflogexpire
+    gc.reflogexpireunreachable
+    gc.rerereresolved
+    gc.rerereunresolved
+    gitcvs.allbinary
+    gitcvs.commitmsgannotation
+    gitcvs.dbTableNamePrefix
+    gitcvs.dbdriver
+    gitcvs.dbname
+    gitcvs.dbpass
+    gitcvs.dbuser
+    gitcvs.enabled
+    gitcvs.logfile
+    gitcvs.usecrlfattr
+    guitool.
+    gui.blamehistoryctx
+    gui.commitmsgwidth
+    gui.copyblamethreshold
+    gui.diffcontext
+    gui.encoding
+    gui.fastcopyblame
+    gui.matchtrackingbranch
+    gui.newbranchtemplate
+    gui.pruneduringfetch
+    gui.spellingdictionary
+    gui.trustmtime
+    help.autocorrect
+    help.browser
+    help.format
+    http.lowSpeedLimit
+    http.lowSpeedTime
+    http.maxRequests
+    http.minSessions
+    http.noEPSV
+    http.postBuffer
+    http.proxy
+    http.sslCAInfo
+    http.sslCAPath
+    http.sslCert
+    http.sslCertPasswordProtected
+    http.sslKey
+    http.sslVerify
+    http.useragent
+    i18n.commitEncoding
+    i18n.logOutputEncoding
+    imap.authMethod
+    imap.folder
+    imap.host
+    imap.pass
+    imap.port
+    imap.preformattedHTML
+    imap.sslverify
+    imap.tunnel
+    imap.user
+    init.templatedir
+    instaweb.browser
+    instaweb.httpd
+    instaweb.local
+    instaweb.modulepath
+    instaweb.port
+    interactive.singlekey
+    log.date
+    log.decorate
+    log.showroot
+    mailmap.file
+    man.
+    man.viewer
+    merge.
+    merge.conflictstyle
+    merge.log
+    merge.renameLimit
+    merge.renormalize
+    merge.stat
+    merge.tool
+    merge.verbosity
+    mergetool.
+    mergetool.keepBackup
+    mergetool.keepTemporaries
+    mergetool.prompt
+    notes.displayRef
+    notes.rewrite.
+    notes.rewrite.amend
+    notes.rewrite.rebase
+    notes.rewriteMode
+    notes.rewriteRef
+    pack.compression
+    pack.deltaCacheLimit
+    pack.deltaCacheSize
+    pack.depth
+    pack.indexVersion
+    pack.packSizeLimit
+    pack.threads
+    pack.window
+    pack.windowMemory
+    pager.
+    pretty.
+    pull.octopus
+    pull.twohead
+    push.default
+    rebase.autosquash
+    rebase.stat
+    receive.autogc
+    receive.denyCurrentBranch
+    receive.denyDeleteCurrent
+    receive.denyDeletes
+    receive.denyNonFastForwards
+    receive.fsckObjects
+    receive.unpackLimit
+    receive.updateserverinfo
+    remote.pushdefault
+    remotes.
+    repack.usedeltabaseoffset
+    rerere.autoupdate
+    rerere.enabled
+    sendemail.
+    sendemail.aliasesfile
+    sendemail.aliasfiletype
+    sendemail.bcc
+    sendemail.cc
+    sendemail.cccmd
+    sendemail.chainreplyto
+    sendemail.confirm
+    sendemail.envelopesender
+    sendemail.from
+    sendemail.identity
+    sendemail.multiedit
+    sendemail.signedoffbycc
+    sendemail.smtpdomain
+    sendemail.smtpencryption
+    sendemail.smtppass
+    sendemail.smtpserver
+    sendemail.smtpserveroption
+    sendemail.smtpserverport
+    sendemail.smtpuser
+    sendemail.suppresscc
+    sendemail.suppressfrom
+    sendemail.thread
+    sendemail.to
+    sendemail.validate
+    showbranch.default
+    status.relativePaths
+    status.showUntrackedFiles
+    status.submodulesummary
+    submodule.
+    tar.umask
+    transfer.unpackLimit
+    url.
+    user.email
+    user.name
+    user.signingkey
+    web.browser
+    branch. remote.
+  "
+}
+
+# Completion for "git remote": the subcommand names until one is present,
+# then arguments appropriate to that subcommand.
+_git_remote ()
+{
+  local subcommands="add rename remove set-head set-branches set-url show prune update"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"
+    return
+  fi
+
+  case "$subcommand" in
+  rename|remove|set-url|show|prune)
+    __gitcomp_nl "$(__git_remotes)"
+    ;;
+  set-head|set-branches)
+    __git_complete_remote_or_refspec
+    ;;
+  update)
+    # Collect the group names from the "remotes.<group>" config entries.
+    # IFS is a newline so that each config line is one loop item.
+    local i c='' IFS=$'\n'
+    for i in $(git --git-dir="$(__gitdir)" config --get-regexp "remotes\..*" 2>/dev/null); do
+      # Drop the "remotes." prefix, then everything from the first space on.
+      i="${i#remotes.}"
+      c="$c ${i/ */}"
+    done
+    __gitcomp "$c"
+    ;;
+  *)
+    ;;
+  esac
+}
+
+# Completion for "git replace": offers the output of __git_refs, one
+# candidate per line.
+_git_replace ()
+{
+  __gitcomp_nl "$(__git_refs)"
+}
+
+# Completion for "git reset": the mode options for a long-option word,
+# otherwise ref names.
+_git_reset ()
+{
+  __git_has_doubledash && return
+
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--merge --mixed --hard --soft --patch"
+  else
+    __gitcomp_nl "$(__git_refs)"
+  fi
+}
+
+# Completion for "git revert": long options, otherwise ref names.
+_git_revert ()
+{
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--edit --mainline --no-edit --no-commit --signoff"
+  else
+    __gitcomp_nl "$(__git_refs)"
+  fi
+}
+
+# Completion for "git rm": long options, otherwise cached paths.
+_git_rm ()
+{
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--cached --dry-run --ignore-unmatch --quiet"
+  else
+    __git_complete_index_file "--cached"
+  fi
+}
+
+# Completion for "git shortlog": the common and shortlog-specific log
+# options for a long-option word, otherwise whatever
+# __git_complete_revlist offers.
+_git_shortlog ()
+{
+  __git_has_doubledash && return
+
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_shortlog_options
+      --numbered --summary
+      "
+  else
+    __git_complete_revlist
+  fi
+}
+
+# Completion for "git show": values of --pretty=/--format= and
+# --diff-algorithm=, the long options, otherwise whatever
+# __git_complete_revlist_file offers.
+_git_show ()
+{
+  __git_has_doubledash && return
+
+  if [[ "$cur" == --pretty=* || "$cur" == --format=* ]]; then
+    # Match the format names against the text after the first "=".
+    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
+      " "" "${cur#*=}"
+  elif [[ "$cur" == --diff-algorithm=* ]]; then
+    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
+  elif [[ "$cur" == --* ]]; then
+    __gitcomp "--pretty= --format= --abbrev-commit --oneline
+      $__git_diff_common_options
+      "
+  else
+    __git_complete_revlist_file
+  fi
+}
+
+# Completion for "git show-branch": long options, otherwise whatever
+# __git_complete_revlist offers.
+_git_show_branch ()
+{
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      --all --remotes --topo-order --current --more=
+      --list --independent --merge-base --no-name
+      --color --no-color
+      --sha1-name --sparse --topics --reflog
+      "
+  else
+    __git_complete_revlist
+  fi
+}
+
+# Completion for "git stash".  Without a subcommand on the command line it
+# offers the "save" options or the subcommand names; with one it dispatches
+# on "<subcommand>,<current word>".
+_git_stash ()
+{
+  local save_opts='--keep-index --no-keep-index --quiet --patch'
+  local subcommands='save list show apply clear drop pop create branch'
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    case "$cur" in
+    --*)
+      __gitcomp "$save_opts"
+      ;;
+    *)
+      # Subcommand names are only offered while none of the save options
+      # is present on the command line.
+      if [ -z "$(__git_find_on_cmdline "$save_opts")" ]; then
+        __gitcomp "$subcommands"
+      fi
+      ;;
+    esac
+  else
+    case "$subcommand,$cur" in
+    save,--*)
+      __gitcomp "$save_opts"
+      ;;
+    apply,--*|pop,--*)
+      __gitcomp "--index --quiet"
+      ;;
+    show,--*|drop,--*|branch,--*)
+      # No long options are offered for these subcommands.
+      ;;
+    show,*|apply,*|drop,*|pop,*|branch,*)
+      # Offer the stash names: each "git stash list" line truncated at its
+      # first colon.
+      __gitcomp_nl "$(git --git-dir="$(__gitdir)" stash list \
+          | sed -n -e 's/:.*//p')"
+      ;;
+    *)
+      ;;
+    esac
+  fi
+}
+
+# Completion for "git submodule": nothing once a subcommand is present;
+# before that, the two global options or the subcommand names.
+_git_submodule ()
+{
+  __git_has_doubledash && return
+
+  local subcommands="add status init deinit update summary foreach sync"
+  # A subcommand is already on the command line: offer nothing.
+  if [ -n "$(__git_find_on_cmdline "$subcommands")" ]; then
+    return
+  fi
+
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "--quiet --cached"
+  else
+    __gitcomp "$subcommands"
+  fi
+}
+
+# Completion for "git svn": the subcommand names until one is present on
+# the command line, then that subcommand's long options.  Non-option words
+# after a subcommand produce no candidates.
+_git_svn ()
+{
+  local subcommands="
+    init fetch clone rebase dcommit log find-rev
+    set-tree commit-diff info create-ignore propget
+    proplist show-ignore show-externals branch tag blame
+    migrate mkdirs reset gc
+    "
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"
+  else
+    # Option groups reused by several subcommands below.
+    local remote_opts="--username= --config-dir= --no-auth-cache"
+    local fc_opts="
+      --follow-parent --authors-file= --repack=
+      --no-metadata --use-svm-props --use-svnsync-props
+      --log-window-size= --no-checkout --quiet
+      --repack-flags --use-log-author --localtime
+      --ignore-paths= --include-paths= $remote_opts
+      "
+    local init_opts="
+      --template= --shared= --trunk= --tags=
+      --branches= --stdlayout --minimize-url
+      --no-metadata --use-svm-props --use-svnsync-props
+      --rewrite-root= --prefix= --use-log-author
+      --add-author-from $remote_opts
+      "
+    local cmt_opts="
+      --edit --rmdir --find-copies-harder --copy-similarity=
+      "
+
+    # Dispatch on "<subcommand>,<current word>"; only long options match.
+    case "$subcommand,$cur" in
+    fetch,--*)
+      __gitcomp "--revision= --fetch-all $fc_opts"
+      ;;
+    clone,--*)
+      __gitcomp "--revision= $fc_opts $init_opts"
+      ;;
+    init,--*)
+      __gitcomp "$init_opts"
+      ;;
+    dcommit,--*)
+      __gitcomp "
+        --merge --strategy= --verbose --dry-run
+        --fetch-all --no-rebase --commit-url
+        --revision --interactive $cmt_opts $fc_opts
+        "
+      ;;
+    set-tree,--*)
+      __gitcomp "--stdin $cmt_opts $fc_opts"
+      ;;
+    create-ignore,--*|propget,--*|proplist,--*|show-ignore,--*|\
+    show-externals,--*|mkdirs,--*)
+      __gitcomp "--revision="
+      ;;
+    log,--*)
+      __gitcomp "
+        --limit= --revision= --verbose --incremental
+        --oneline --show-commit --non-recursive
+        --authors-file= --color
+        "
+      ;;
+    rebase,--*)
+      __gitcomp "
+        --merge --verbose --strategy= --local
+        --fetch-all --dry-run $fc_opts
+        "
+      ;;
+    commit-diff,--*)
+      __gitcomp "--message= --file= --revision= $cmt_opts"
+      ;;
+    info,--*)
+      __gitcomp "--url"
+      ;;
+    branch,--*)
+      __gitcomp "--dry-run --message --tag"
+      ;;
+    tag,--*)
+      __gitcomp "--dry-run --message"
+      ;;
+    blame,--*)
+      __gitcomp "--git-format"
+      ;;
+    migrate,--*)
+      __gitcomp "
+        --config-dir= --ignore-paths= --minimize
+        --no-auth-cache --username=
+        "
+      ;;
+    reset,--*)
+      __gitcomp "--revision= --parent"
+      ;;
+    *)
+      ;;
+    esac
+  fi
+}
+
+# Completion for "git tag".
+_git_tag ()
+{
+  local i c=1 f=0
+  # Scan the words before the one being completed.
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    -d|-v)
+      # -d or -v anywhere earlier on the line: offer existing tags and stop.
+      __gitcomp_nl "$(__git_tags)"
+      return
+      ;;
+    -f)
+      # Remember that -f was given; it is consulted below.
+      f=1
+      ;;
+    esac
+    ((c++))
+  done
+
+  case "$prev" in
+  -m|-F)
+    # Nothing is offered for the argument of -m or -F.
+    ;;
+  -*|tag)
+    # Directly after an option or the "tag" word itself: existing tags are
+    # offered only when -f was seen.
+    if [ $f = 1 ]; then
+      __gitcomp_nl "$(__git_tags)"
+    fi
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+# Completion for "git whatchanged": identical to the "git log" completion.
+_git_whatchanged ()
+{
+  _git_log
+}
+
+# Entry point for completing a "git" command line.  Finds the git command
+# among the words typed so far and dispatches to its _git_<command>
+# function, falling back to the expansion of an alias.
+__git_main ()
+{
+  local i c=1 command __git_dir
+
+  # Scan the words before the one being completed, recording the git
+  # directory selected by global options and stopping at the first word
+  # that is not an option.
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    --git-dir=*) __git_dir="${i#--git-dir=}" ;;
+    --git-dir)   ((c++)) ; __git_dir="${words[c]}" ;;
+    --bare)      __git_dir="." ;;
+    --help) command="help"; break ;;
+    # These options take a separate argument word: skip over it.
+    -c|--work-tree|--namespace) ((c++)) ;;
+    -*) ;;
+    *) command="$i"; break ;;
+    esac
+    ((c++))
+  done
+
+  # No command yet: complete git's own options or the command name.
+  if [ -z "$command" ]; then
+    case "$cur" in
+    --*)   __gitcomp "
+      --paginate
+      --no-pager
+      --git-dir=
+      --bare
+      --version
+      --exec-path
+      --exec-path=
+      --html-path
+      --man-path
+      --info-path
+      --work-tree=
+      --namespace=
+      --no-replace-objects
+      --help
+      "
+      ;;
+    *)     __git_compute_porcelain_commands
+           __gitcomp "$__git_porcelain_commands $(__git_aliases)" ;;
+    esac
+    return
+  fi
+
+  # Dashes in the command name become underscores in the function name.
+  # Done when that function exists and returns success.
+  local completion_func="_git_${command//-/_}"
+  declare -f $completion_func >/dev/null && $completion_func && return
+
+  # Otherwise treat the word as an alias and try the completion function of
+  # the command it expands to.
+  local expansion=$(__git_aliased_command "$command")
+  if [ -n "$expansion" ]; then
+    completion_func="_git_${expansion//-/_}"
+    declare -f $completion_func >/dev/null && $completion_func
+  fi
+}
+
+# Entry point for completing a "gitk" command line: the log options that
+# suit gitk for a long-option word, otherwise whatever
+# __git_complete_revlist offers.
+__gitk_main ()
+{
+  __git_has_doubledash && return
+
+  local g="$(__gitdir)"
+  local merge=""
+  # --merge is only offered while MERGE_HEAD exists in the git directory.
+  [ -f "$g/MERGE_HEAD" ] && merge="--merge"
+
+  if [[ "$cur" == --* ]]; then
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_gitk_options
+      $merge
+      "
+  else
+    __git_complete_revlist
+  fi
+}
+
+# zsh support.  When sourced by zsh, replace the candidate-producing helpers
+# with versions built on zsh's completion builtins, register _git for git
+# and gitk, and stop reading this file (the bash-only setup below is
+# skipped by the "return").
+if [[ -n ${ZSH_VERSION-} ]]; then
+  echo "WARNING: this script is deprecated, please see git-completion.zsh" 1>&2
+
+  autoload -U +X compinit && compinit
+
+  # __gitcomp <words> [<prefix> [<current word> [<suffix>]]]
+  # Offers each whitespace-separated word of $1 with <suffix> appended.
+  __gitcomp ()
+  {
+    emulate -L zsh
+
+    local cur_="${3-$cur}"
+
+    case "$cur_" in
+    --*=)
+      # A long option ending in "=" yields no candidates.
+      ;;
+    *)
+      local c IFS=$' \t\n'
+      local -a array
+      for c in ${=1}; do
+        c="$c${4-}"
+        # Words of the form --opt=... or ending in "." are offered as is;
+        # every other word gets a trailing space.
+        case $c in
+        --*=*|*.) ;;
+        *) c="$c " ;;
+        esac
+        array[$#array+1]="$c"
+      done
+      compset -P '*[=:]'
+      compadd -Q -S '' -p "${2-}" -a -- array && _ret=0
+      ;;
+    esac
+  }
+
+  # __gitcomp_nl <lines> [<prefix> [<current word> [<suffix>]]]
+  # Offers each line of $1; the suffix defaults to a single space.
+  __gitcomp_nl ()
+  {
+    emulate -L zsh
+
+    local IFS=$'\n'
+    compset -P '*[=:]'
+    compadd -Q -S "${4- }" -p "${2-}" -- ${=1} && _ret=0
+  }
+
+  # __gitcomp_file <lines> [<prefix>]
+  # Like __gitcomp_nl, but passes -f to compadd and adds no suffix.
+  __gitcomp_file ()
+  {
+    emulate -L zsh
+
+    local IFS=$'\n'
+    compset -P '*[=:]'
+    compadd -Q -p "${2-}" -f -- ${=1} && _ret=0
+  }
+
+  # Completion function registered with zsh.  Sets up the cur/prev/cword
+  # variables the shared code reads, runs __<service>_main under ksh
+  # emulation, and falls back to _default when no helper set _ret to 0.
+  _git ()
+  {
+    local _ret=1 cur cword prev
+    cur=${words[CURRENT]}
+    prev=${words[CURRENT-1]}
+    let cword=CURRENT-1
+    emulate ksh -c __${service}_main
+    let _ret && _default && _ret=0
+    return _ret
+  }
+
+  compdef _git git gitk
+  return
+fi
+
+# Run the completion function named by $1 with cur, words, cword and prev
+# declared local and filled in by _get_comp_words_by_ref.
+__git_func_wrap ()
+{
+  local cur words cword prev
+  # "-n =:" is passed through to _get_comp_words_by_ref (defined outside
+  # this chunk); presumably it keeps "=" and ":" from splitting words --
+  # TODO confirm.
+  _get_comp_words_by_ref -n =: cur words cword prev
+  $1
+}
+
+# Set up completion for certain functions defined above by setting common
+# variables and workarounds.
+# This is NOT a public function; use at your own risk.
+#
+# $1 - name of the command to register completion for
+# $2 - name of the completion function to run for it
+__git_complete ()
+{
+  # Define a wrapper named __git_wrap<function> that runs $2 through
+  # __git_func_wrap, then register the wrapper for $1.
+  local wrapper="__git_wrap${2}"
+  eval "$wrapper () { __git_func_wrap $2 ; }"
+  # Retry without "-o bashdefault" when the first registration fails.
+  complete -o bashdefault -o default -o nospace -F $wrapper $1 2>/dev/null \
+    || complete -o default -o nospace -F $wrapper $1
+}
+
+# Wrapper kept for backwards compatibility: calls the wrapper function that
+# "__git_complete git __git_main" defines.
+_git ()
+{
+  __git_wrap__git_main
+}
+
+# Wrapper kept for backwards compatibility: calls the wrapper function that
+# "__git_complete gitk __gitk_main" defines.
+_gitk ()
+{
+  __git_wrap__gitk_main
+}
+
+# Register the completion entry points for git and gitk.
+__git_complete git __git_main
+__git_complete gitk __gitk_main
+
+# The following is necessary only on Cygwin, and is needed only
+# when the user has tab-completed the executable name and consequently
+# included the '.exe' suffix.
+#
+if [ Cygwin = "$(uname -o 2>/dev/null)" ]; then
+__git_complete git.exe __git_main
+fi
diff --git a/paddle/scripts/docker/root/.scripts/git-prompt.sh b/paddle/scripts/docker/root/.scripts/git-prompt.sh
new file mode 100755
index 0000000000000000000000000000000000000000..576f4ec14c94a24ebffa9e2620acf881e6b5ddaa
--- /dev/null
+++ b/paddle/scripts/docker/root/.scripts/git-prompt.sh
@@ -0,0 +1,445 @@
+# bash/zsh git prompt support
+#
+# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
+# Distributed under the GNU General Public License, version 2.0.
+#
+# This script allows you to see repository status in your prompt.
+#
+# To enable:
+#
+#    1) Copy this file to somewhere (e.g. ~/.git-prompt.sh).
+#    2) Add the following line to your .bashrc/.zshrc:
+#        source ~/.git-prompt.sh
+#    3a) Change your PS1 to call __git_ps1 as
+#        command-substitution:
+#        Bash: PS1='[\u@\h \W$(__git_ps1 " (%s)")]\$ '
+#        ZSH:  setopt PROMPT_SUBST ; PS1='[%n@%m %c$(__git_ps1 " (%s)")]\$ '
+#        the optional argument will be used as format string.
+#    3b) Alternatively, for a slightly faster prompt, __git_ps1 can
+#        be used for PROMPT_COMMAND in Bash or for precmd() in Zsh
+#        with two parameters, <pre> and <post>, which are strings
+#        you would put in $PS1 before and after the status string
+#        generated by the git-prompt machinery.  e.g.
+#        Bash: PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
+#          will show username, at-sign, host, colon, cwd, then
+#          various status string, followed by dollar and SP, as
+#          your prompt.
+#        ZSH:  precmd () { __git_ps1 "%n" ":%~$ " "|%s" }
+#          will show username, pipe, then various status string,
+#          followed by colon, cwd, dollar and SP, as your prompt.
+#        Optionally, you can supply a third argument with a printf
+#        format string to finetune the output of the branch status
+#
+# The repository status will be displayed only if you are currently in a
+# git repository. The %s token is the placeholder for the shown status.
+#
+# The prompt status always includes the current branch name.
+#
+# In addition, if you set GIT_PS1_SHOWDIRTYSTATE to a nonempty value,
+# unstaged (*) and staged (+) changes will be shown next to the branch
+# name.  You can configure this per-repository with the
+# bash.showDirtyState variable, which defaults to true once
+# GIT_PS1_SHOWDIRTYSTATE is enabled.
+#
+# You can also see if currently something is stashed, by setting
+# GIT_PS1_SHOWSTASHSTATE to a nonempty value. If something is stashed,
+# then a '$' will be shown next to the branch name.
+#
+# If you would like to see if there're untracked files, then you can set
+# GIT_PS1_SHOWUNTRACKEDFILES to a nonempty value. If there're untracked
+# files, then a '%' will be shown next to the branch name.  You can
+# configure this per-repository with the bash.showUntrackedFiles
+# variable, which defaults to true once GIT_PS1_SHOWUNTRACKEDFILES is
+# enabled.
+#
+# If you would like to see the difference between HEAD and its upstream,
+# set GIT_PS1_SHOWUPSTREAM="auto".  A "<" indicates you are behind, ">"
+# indicates you are ahead, "<>" indicates you have diverged and "="
+# indicates that there is no difference. You can further control
+# behaviour by setting GIT_PS1_SHOWUPSTREAM to a space-separated list
+# of values:
+#
+#     verbose       show number of commits ahead/behind (+/-) upstream
+#     legacy        don't use the '--count' option available in recent
+#                   versions of git-rev-list
+#     git           always compare HEAD to @{upstream}
+#     svn           always compare HEAD to your SVN upstream
+#
+# By default, __git_ps1 will compare HEAD to your SVN upstream if it can
+# find one, or @{upstream} otherwise.  Once you have set
+# GIT_PS1_SHOWUPSTREAM, you can override it on a per-repository basis by
+# setting the bash.showUpstream config variable.
+#
+# If you would like to see more information about the identity of
+# commits checked out as a detached HEAD, set GIT_PS1_DESCRIBE_STYLE
+# to one of these values:
+#
+#     contains      relative to newer annotated tag (v1.6.3.2~35)
+#     branch        relative to newer tag or branch (master~4)
+#     describe      relative to older annotated tag (v1.6.3.1-13-gdd42c2f)
+#     default       exactly matching tag
+#
+# If you would like a colored hint about the current dirty state, set
+# GIT_PS1_SHOWCOLORHINTS to a nonempty value. The colors are based on
+# the colored output of "git status -sb" and are available only when
+# using __git_ps1 for PROMPT_COMMAND or precmd.
+
+# stores the divergence from upstream in $p
+# used by GIT_PS1_SHOWUPSTREAM
+__git_ps1_show_upstream ()
+{
+  local key value
+  local svn_remote svn_url_pattern count n
+  local upstream=git legacy="" verbose=""
+
+  svn_remote=()
+  # get some config options from git-config
+  local output="$(git config -z --get-regexp '^(svn-remote\..*\.url|bash\.showupstream)$' 2>/dev/null | tr '\0\n' '\n ')"
+  while read -r key value; do
+    case "$key" in
+    bash.showupstream)
+      GIT_PS1_SHOWUPSTREAM="$value"
+      if [[ -z "${GIT_PS1_SHOWUPSTREAM}" ]]; then
+        p=""
+        return
+      fi
+      ;;
+    svn-remote.*.url)
+      svn_remote[$((${#svn_remote[@]} + 1))]="$value"
+      svn_url_pattern+="\\|$value"
+      upstream=svn+git # default upstream is SVN if available, else git
+      ;;
+    esac
+  done <<< "$output"
+
+  # parse configuration values
+  for option in ${GIT_PS1_SHOWUPSTREAM}; do
+    case "$option" in
+    git|svn) upstream="$option" ;;
+    verbose) verbose=1 ;;
+    legacy)  legacy=1  ;;
+    esac
+  done
+
+  # Find our upstream
+  case "$upstream" in
+  git)    upstream="@{upstream}" ;;
+  svn*)
+    # get the upstream from the "git-svn-id: ..." in a commit message
+    # (git-svn uses essentially the same procedure internally)
+    local -a svn_upstream
+    svn_upstream=($(git log --first-parent -1 \
+          --grep="^git-svn-id: \(${svn_url_pattern#??}\)" 2>/dev/null))
+    if [[ 0 -ne ${#svn_upstream[@]} ]]; then
+      svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
+      svn_upstream=${svn_upstream%@*}
+      local n_stop="${#svn_remote[@]}"
+      for ((n=1; n <= n_stop; n++)); do
+        svn_upstream=${svn_upstream#${svn_remote[$n]}}
+      done
+
+      if [[ -z "$svn_upstream" ]]; then
+        # default branch name for checkouts with no layout:
+        upstream=${GIT_SVN_ID:-git-svn}
+      else
+        upstream=${svn_upstream#/}
+      fi
+    elif [[ "svn+git" = "$upstream" ]]; then
+      upstream="@{upstream}"
+    fi
+    ;;
+  esac
+
+  # Find how many commits we are ahead/behind our upstream
+  if [[ -z "$legacy" ]]; then
+    count="$(git rev-list --count --left-right \
+        "$upstream"...HEAD 2>/dev/null)"
+  else
+    # produce equivalent output to --count for older versions of git
+    local commits
+    if commits="$(git rev-list --left-right "$upstream"...HEAD 2>/dev/null)"
+    then
+      local commit behind=0 ahead=0
+      for commit in $commits
+      do
+        case "$commit" in
+        "<"*) ((behind++)) ;;
+        *)    ((ahead++))  ;;
+        esac
+      done
+      count="$behind  $ahead"
+    else
+      count=""
+    fi
+  fi
+
+  # calculate the result
+  if [[ -z "$verbose" ]]; then
+    case "$count" in
+    "") # no upstream
+      p="" ;;
+    "0  0") # equal to upstream
+      p="=" ;;
+    "0  "*) # ahead of upstream
+      p=">" ;;
+    *"  0") # behind upstream
+      p="<" ;;
+    *)      # diverged from upstream
+      p="<>" ;;
+    esac
+  else
+    case "$count" in
+    "") # no upstream
+      p="" ;;
+    "0  0") # equal to upstream
+      p=" u=" ;;
+    "0  "*) # ahead of upstream
+      p=" u+${count#0 }" ;;
+    *"  0") # behind upstream
+      p=" u-${count%  0}" ;;
+    *)      # diverged from upstream
+      p=" u+${count#* }-${count%  *}" ;;
+    esac
+  fi
+
+}
+
+# Helper meant to be called from __git_ps1: it reads $detached and prepends
+# color codes to that function's $c, $z, $w, $i, $s, $u and $r variables,
+# which are later concatenated into the gitstring.
+__git_ps1_colorize_gitstring ()
+{
+  if [[ -n ${ZSH_VERSION-} ]]; then
+    local c_red='%F{red}'
+    local c_green='%F{green}'
+    local c_lblue='%F{blue}'
+    local c_clear='%f'
+  else
+    # Using \[ and \] around colors is necessary to prevent
+    # issues with command line editing/browsing/completion!
+    local c_red='\[\e[31m\]'
+    local c_green='\[\e[32m\]'
+    local c_lblue='\[\e[1;34m\]'
+    local c_clear='\[\e[0m\]'
+  fi
+  local bad_color=$c_red
+  local ok_color=$c_green
+  local flags_color="$c_lblue"
+  # the branch prefix gets the "ok" color unless HEAD is detached
+  local branch_color=""
+  if [ $detached = no ]; then
+    branch_color="$ok_color"
+  else
+    branch_color="$bad_color"
+  fi
+  c="$branch_color$c"
+  # the state separator ($z) and the trailing $r both start with a color reset
+  z="$c_clear$z"
+  if [ "$w" = "*" ]; then
+    w="$bad_color$w"
+  fi
+  if [ -n "$i" ]; then
+    i="$ok_color$i"
+  fi
+  if [ -n "$s" ]; then
+    s="$flags_color$s"
+  fi
+  if [ -n "$u" ]; then
+    u="$bad_color$u"
+  fi
+  r="$c_clear$r"
+}
+
+# __git_ps1 accepts 0 or 1 arguments (i.e., format string)
+# when called from PS1 using command substitution
+# in this mode it prints text to add to bash PS1 prompt (includes branch name)
+#
+# __git_ps1 requires 2 or 3 arguments when called from PROMPT_COMMAND (pc)
+# in that case it _sets_ PS1. The arguments are parts of a PS1 string.
+# when two arguments are given, the first is prepended and the second appended
+# to the state string when assigned to PS1.
+# The optional third parameter will be used as printf format string to further
+# customize the output of the git-status string.
+# In this mode you can request colored hints using GIT_PS1_SHOWCOLORHINTS=true
+__git_ps1 ()
+{
+  local pcmode=no
+  local detached=no
+  local ps1pc_start='\u@\h:\w '
+  local ps1pc_end='\$ '
+  local printf_format=' (%s)'
+
+  case "$#" in
+    2|3)  pcmode=yes
+      ps1pc_start="$1"
+      ps1pc_end="$2"
+      printf_format="${3:-$printf_format}"
+    ;;
+    0|1)  printf_format="${1:-$printf_format}"
+    ;;
+    *)  return
+    ;;
+  esac
+  # query git once for everything needed below; the answers are parsed one per line
+  local repo_info rev_parse_exit_code
+  repo_info="$(git rev-parse --git-dir --is-inside-git-dir \
+    --is-bare-repository --is-inside-work-tree \
+    --short HEAD 2>/dev/null)"
+  rev_parse_exit_code="$?"
+
+  if [ -z "$repo_info" ]; then
+    if [ $pcmode = yes ]; then
+      #In PC mode PS1 always needs to be set
+      PS1="$ps1pc_start$ps1pc_end"
+    fi
+    return
+  fi
+  # peel the answers off from the last line; the short sha is taken only when rev-parse succeeded
+  local short_sha
+  if [ "$rev_parse_exit_code" = "0" ]; then
+    short_sha="${repo_info##*$'\n'}"
+    repo_info="${repo_info%$'\n'*}"
+  fi
+  local inside_worktree="${repo_info##*$'\n'}"
+  repo_info="${repo_info%$'\n'*}"
+  local bare_repo="${repo_info##*$'\n'}"
+  repo_info="${repo_info%$'\n'*}"
+  local inside_gitdir="${repo_info##*$'\n'}"
+  local g="${repo_info%$'\n'*}"
+
+  local r=""
+  local b=""
+  local step=""
+  local total=""
+  if [ -d "$g/rebase-merge" ]; then
+    read b 2>/dev/null <"$g/rebase-merge/head-name"
+    read step 2>/dev/null <"$g/rebase-merge/msgnum"
+    read total 2>/dev/null <"$g/rebase-merge/end"
+    if [ -f "$g/rebase-merge/interactive" ]; then
+      r="|REBASE-i"
+    else
+      r="|REBASE-m"
+    fi
+  else
+    if [ -d "$g/rebase-apply" ]; then
+      read step 2>/dev/null <"$g/rebase-apply/next"
+      read total 2>/dev/null <"$g/rebase-apply/last"
+      if [ -f "$g/rebase-apply/rebasing" ]; then
+        read b 2>/dev/null <"$g/rebase-apply/head-name"
+        r="|REBASE"
+      elif [ -f "$g/rebase-apply/applying" ]; then
+        r="|AM"
+      else
+        r="|AM/REBASE"
+      fi
+    elif [ -f "$g/MERGE_HEAD" ]; then
+      r="|MERGING"
+    elif [ -f "$g/CHERRY_PICK_HEAD" ]; then
+      r="|CHERRY-PICKING"
+    elif [ -f "$g/REVERT_HEAD" ]; then
+      r="|REVERTING"
+    elif [ -f "$g/BISECT_LOG" ]; then
+      r="|BISECTING"
+    fi
+
+    if [ -n "$b" ]; then
+      :
+    elif [ -h "$g/HEAD" ]; then
+      # symlink symbolic ref
+      b="$(git symbolic-ref HEAD 2>/dev/null)"
+    else
+      local head=""
+      if ! read head 2>/dev/null <"$g/HEAD"; then
+        if [ $pcmode = yes ]; then
+          PS1="$ps1pc_start$ps1pc_end"
+        fi
+        return
+      fi
+      # is it a symbolic ref?
+      b="${head#ref: }"
+      if [ "$head" = "$b" ]; then
+        detached=yes
+        b="$(
+        case "${GIT_PS1_DESCRIBE_STYLE-}" in
+        (contains)
+          git describe --contains HEAD ;;
+        (branch)
+          git describe --contains --all HEAD ;;
+        (describe)
+          git describe HEAD ;;
+        (* | default)
+          git describe --tags --exact-match HEAD ;;
+        esac 2>/dev/null)" ||
+
+        b="$short_sha..."
+        b="($b)"
+      fi
+    fi
+  fi
+
+  if [ -n "$step" ] && [ -n "$total" ]; then
+    r="$r $step/$total"
+  fi
+
+  local w=""
+  local i=""
+  local s=""
+  local u=""
+  local c=""
+  local p=""
+
+  if [ "true" = "$inside_gitdir" ]; then
+    if [ "true" = "$bare_repo" ]; then
+      c="BARE:"
+    else
+      b="GIT_DIR!"
+    fi
+  elif [ "true" = "$inside_worktree" ]; then
+    if [ -n "${GIT_PS1_SHOWDIRTYSTATE-}" ] &&
+       [ "$(git config --bool bash.showDirtyState)" != "false" ]
+    then
+      git diff --no-ext-diff --quiet --exit-code || w="*"
+      if [ -n "$short_sha" ]; then
+        git diff-index --cached --quiet HEAD -- || i="+"
+      else
+        i="#"
+      fi
+    fi
+    if [ -n "${GIT_PS1_SHOWSTASHSTATE-}" ] &&
+       [ -r "$g/refs/stash" ]; then
+      s="$"
+    fi
+
+    if [ -n "${GIT_PS1_SHOWUNTRACKEDFILES-}" ] &&
+       [ "$(git config --bool bash.showUntrackedFiles)" != "false" ] &&
+       git ls-files --others --exclude-standard --error-unmatch -- '*' >/dev/null 2>/dev/null
+    then
+      u="%${ZSH_VERSION+%}"
+    fi
+
+    if [ -n "${GIT_PS1_SHOWUPSTREAM-}" ]; then
+      __git_ps1_show_upstream
+    fi
+  fi
+
+  local z="${GIT_PS1_STATESEPARATOR-" "}"
+
+  # NO color option unless in PROMPT_COMMAND mode
+  if [ $pcmode = yes ] && [ -n "${GIT_PS1_SHOWCOLORHINTS-}" ]; then
+    __git_ps1_colorize_gitstring
+  fi
+
+  local f="$w$i$s$u"
+  local gitstring="$c${b##refs/heads/}${f:+$z$f}$r$p"
+  # NOTE(review): in pcmode the ref name in $b lands verbatim in PS1 — confirm a crafted ref name cannot be expanded when the prompt is drawn
+  if [ $pcmode = yes ]; then
+    if [[ -n ${ZSH_VERSION-} ]]; then
+      gitstring=$(printf -- "$printf_format" "$gitstring")
+    else
+      printf -v gitstring -- "$printf_format" "$gitstring"
+    fi
+    PS1="$ps1pc_start$gitstring$ps1pc_end"
+  else
+    printf -- "$printf_format" "$gitstring"
+  fi
+}
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index 283fd34a6d8a2268f3800ec69920e128ac75e7dc..f29d32f0d947dc7cde6112160e4f79ce8113505f 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -122,6 +122,9 @@ case "$1" in
     "make_diagram")
         python -m paddle.utils.make_model_diagram ${@:2}
         ;;
+    "usage")
+        $MYDIR/../opt/paddle/bin/paddle_usage ${@:2}
+        ;;
     "version")
         version
         ;;
diff --git a/paddle/scripts/tools/usage_stat/usage.sh b/paddle/scripts/tools/usage_stat/usage.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7dbd1f58842f50ea1df0e2476c4a493569b1dda9
--- /dev/null
+++ b/paddle/scripts/tools/usage_stat/usage.sh
@@ -0,0 +1,168 @@
+#!/bin/bash
+
+ARGPARSE=`getopt -o u:vin:l:e: --long git-user:,help,dry-run,task-name:,log-file:,exit-code:  -- "$@"`
+KEEP_ANONYMOUS="A_USER_DOES_NOT_TELL_US"
+# paddle config home dir, same as paddle
+PADDLE_CONF_HOME="$HOME/.config/paddle"
+# API URL; mirror URL(s) will be appended later
+PD_URLS="http://api.paddlepaddle.org/version"
+
+usage()
+{
+    echo "Usage: `basename $0` [options]"
+    echo "Options:"
+    echo "  -e, --exit-code=EXIT_CODE         The train/predict process's exit code"
+    echo "  -l, --log-file=LOG_FILE_PATH      Read which log file to get the duration of process"
+    echo "  -n, --task-name=TASK_NAME         The name of demo or example"
+    echo "  -u, --git-user=GITHUB_USER        provide contact info, like username or email"
+    echo "  -v, -i                            Verbose output and interact with user when necessary"
+    echo " --help                             display this help message"
+}
+
+eval set -- "${ARGPARSE}"
+while true; do
+    case "$1" in
+        -l|--log-file)
+            log_file=$2
+            shift 2
+            ;;
+        -e|--exit-code)
+            exit_code=$2
+            shift 2
+            ;;
+        -u|--git-user)
+            github_user=$2
+            shift 2
+            ;;
+        -n|--task-name)
+            task=$2
+            shift 2
+            ;;
+        -v|-i)
+            v=1
+            shift
+            ;;
+        --dry-run)
+            dry_run=1
+            shift
+            ;;
+        --)
+            shift
+            break
+            ;;
+        --help)
+            usage
+            exit 0
+            ;;
+        *)
+            echo "Invalid option $1"
+            usage
+            exit 1
+            ;;
+    esac
+done
+
+# Elapsed seconds between the earliest and latest HH:MM:SS stamp found in field 2 of the log; -1 without a non-empty log
+if [ -s "${log_file}" ]; then
+    duration=$(awk 'BEGIN{day=0;last_sec=0;min_sec=0;max_sec=0;}
+    {if(index($2,":")==3){
+        t=substr($2,1,8);
+        sec=day*86400+substr(t,1,2)*3600+substr(t,4,2)*60+substr(t,7,2);
+        if(sec<last_sec-600){day+=1;sec+=86400;}
+        last_sec=sec;
+        if(min_sec==0 || min_sec>sec){min_sec=sec;}
+        if(max_sec==0 || max_sec<sec){max_sec=sec;}
+    }}
+    END{print max_sec-min_sec}' "${log_file}")
+else
+    duration=-1
+fi
+if [ "${v}" = "1" ]; then echo "duration: ${duration}"; fi
+
+# try find the user/email if not given
+if [ -z "${github_user}" ]; then
+    # search for cached username
+    if [ -s "${PADDLE_CONF_HOME}/github_user" ]; then
+        if [ "${v}" = "1" ]; then echo "read github_user from cache..."; fi
+        github_user=`cat ${PADDLE_CONF_HOME}/github_user`
+    else
+        # search the github-user from git config
+        if [ "${v}" = "1" ]; then echo "read github_user from git..."; fi
+        git_username=`git config --get user.name 2>/dev/null`
+        git_url=`git config --get remote.origin.url 2>/dev/null`
+        if [ "`echo ${git_url} | cut -b 1-19`" = "https://github.com/" ]; then
+            # under a git url, like https://github.com/user_xxx/proj_yyy.git
+            if [ "${v}" = "1" ]; then echo " from github url..."; fi
+            github_user=`echo ${git_url} | cut -d "/" -f 4`
+            if [ "${github_user}" = "PaddlePaddle" ]; then
+                github_user=
+            fi
+        fi
+        if [ -n "${git_username}" -a -z "${github_user}" ]; then
+            if [ "${v}" = "1" ]; then echo " from global git username..."; fi
+            github_user=${git_username}
+        fi
+    fi
+fi
+# allow user to set the user name, if it's not found
+if [ -z "${github_user}" -a "${v}" = "1" ]; then
+    read -p "Please input your github username or email, or just return to keep this feedback anonymous:"
+    github_user=${REPLY}
+    if [ -z "${github_user}" ]; then
+        # empty input, consider as one anonymous user
+        github_user="${KEEP_ANONYMOUS}"
+    fi
+fi
+if [ -n "${github_user}" -a -z "${dry_run}" ]; then
+    # valid user and not in dry-run mode, then save to cache
+    mkdir -p ${PADDLE_CONF_HOME}
+    echo "${github_user}" >${PADDLE_CONF_HOME}/github_user
+fi
+if [ "${v}" = "1" ]; then echo "username: ${github_user}"; fi
+if [ "${github_user}" = "${KEEP_ANONYMOUS}" ]; then
+    # anonymous user should keep the var empty.
+    github_user=
+fi
+
+# read local paddle version
+paddle_version=`paddle version | grep PaddlePaddle | head -n1 | cut -d " " -f 2 | cut -d "," -f 1`
+if [ "${v}" = "1" ]; then echo "version:${paddle_version}"; fi
+
+# read local system time
+system_time=`date "+%Y%m%d%H%M%S"`
+if [ "${v}" = "1" ]; then echo "system time:${system_time}"; fi
+
+# make empty job_name as default value.
+if [ -z "${task}" ]; then
+    task="(unknown_task)"
+fi
+if [ "${v}" = "1" ]; then echo "task: ${task}"; fi
+
+# concat the curl command
+params="content={\"data_type\":\"usage\",\
+\"system_time\":${system_time},\"paddle_version\":\"${paddle_version}\",\
+\"github_user\":\"${github_user}\",\"job_name\":\"${task}\",\
+\"duration\":${duration},\"exit_code\":\"${exit_code}\"\
+}&type=1"
+curl_cmd_prefix="curl -m 5 -X POST -d ${params}\
+ -b ${PADDLE_CONF_HOME}/paddle.cookie -c ${PADDLE_CONF_HOME}/paddle.cookie "
+
+if [ "${dry_run}" = "1" ]; then
+    first_url=`echo ${PD_URLS} | cut -d " " -f 1`
+    echo "(dry-run mode)curl command: ${curl_cmd_prefix} ${first_url}"
+    exit 0
+else
+    for u in ${PD_URLS}; do
+        curl_cmd="${curl_cmd_prefix} ${u}"
+        if [ "${v}" = "1" ]; then echo "run: ${curl_cmd}"; fi
+        ${curl_cmd} >/dev/null 2>&1
+        if [ $? -eq 0 ]; then
+            if [ "${v}" = "1" ]; then echo "upload OK!"; fi
+            exit 0
+        else
+            if [ "${v}" = "1" ]; then echo "upload failed...try next"; fi
+        fi
+    done
+    if [ "${v}" = "1" ]; then echo "all urls tried but all failed...exit"; fi
+    exit 1
+fi
diff --git a/paddle/scripts/travis/before_install.linux.sh b/paddle/scripts/travis/before_install.linux.sh
deleted file mode 100755
index 9620bff6bcf77c6e87f149e8e33408170dd8e507..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/before_install.linux.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-set -e
-pip install protobuf
-cd /tmp
-wget https://github.com/google/protobuf/archive/v3.0.2.tar.gz -O protobuf.tar.gz
-tar xf protobuf.tar.gz
-cd protobuf*
-./autogen.sh
-./configure --prefix=/usr/
-make -j 2 install
-cd ..
-rm -rf protobuf*
-
-pushd /usr/src/gtest
-cmake .
-make
-sudo cp *.a /usr/lib
-popd
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
deleted file mode 100755
index bd88ed39132f19ca7cfc4f0dd6acdbc6b83e94ab..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/before_install.osx.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-brew update
-brew tap homebrew/science
-brew install python
-sudo pip install --upgrade protobuf
-brew install cmake python glog gflags openblas wget md5sha1sum protobuf
-
-wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
-tar xf gtest.tar.gz
-cd googletest-release-1.8.0/
-cmake .
-make install
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 9caeb21beb15ee5281f9a6aefcfd59b94b91e48a..7deb3e62e88de7e1306fcbfc5a28aa4372d678e6 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,27 +1,12 @@
 #!/bin/bash
-./build_submodules.sh
 source ./common.sh
-CMAKE_EXTRA=""
-if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
-else
-  CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
-fi
-
-
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA}
 
 NPROC=1
-if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-  NRPOC=`nproc`
-  make -j $NPROC
-  make coveralls
-elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  NPROC=`sysctl -n hw.ncpu`
-  make -j $NPROC
-  env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
-fi
-
-
+export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
+export PYTHONHOME=/opt/python/2.7.12
+export PATH=/opt/python/2.7.12/bin:${PATH}
+cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+NPROC=`nproc`  # replace the serial default with the number of available processors
+make -j $NPROC
+make coveralls
 sudo make install
-sudo paddle version
diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh
deleted file mode 100755
index d458bf92bf455609de601c60402101d09765dfe4..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_submodules.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-WORK_DIR=$PWD
-PROJ_ROOT=$(git rev-parse --show-cdup)
-SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
-
-for module in $SUBMODULES
-do
-  case $module in
-    "warp-ctc")
-      if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
-        rm -rf ${PROJ_ROOT}warp-ctc/build
-      fi
-      mkdir ${PROJ_ROOT}warp-ctc/build
-      cd ${PROJ_ROOT}warp-ctc/build
-      cmake ..; make
-    ;;
-  esac
-done
-cd $WORK_DIR
diff --git a/paddle/scripts/travis/common.sh b/paddle/scripts/travis/common.sh
index 9b6e420ca7931f0d17da461c7579bf4dc69e18e0..f05c7530a3b0632948e4b18c477d6dc6aad04c03 100755
--- a/paddle/scripts/travis/common.sh
+++ b/paddle/scripts/travis/common.sh
@@ -2,3 +2,5 @@
 set -e
 mkdir -p ../../../build
 cd ../../../build
+mkdir -p $HOME/third_party
+EXTRA_CMAKE_OPTS="-DTHIRD_PARTY_PATH=${HOME}/third_party"
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index 8690fe1d40c935e119fefbc02f3a228d76d8c0f9..53e998ef6c1b96d9e7d82b7effd12a27e6dc69f2 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -2,9 +2,13 @@
 
 # Add set -e, cd to directory.
 source ./common.sh
-
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
+mkdir output
+make DESTDIR=./output install -j `nproc`
+pip install ./output/usr/local/opt/paddle/share/wheels/*
+rm -rf *
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
 # check websites for broken links
@@ -25,26 +29,41 @@ TARGET_BRANCH="gh-pages"
 # Only deploy master branch to build latest documentation.
 SOURCE_BRANCH="master"
 
-# If is not a Github pull request, and in master branch.
-if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" != "$SOURCE_BRANCH"  ]; then
-  exit 0
-fi
-
 # Clone the repo to output directory
 git clone $REPO output
 cd output
 
-# checkout github page branch
-git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+function deploy_docs() {
+  SOURCE_BRANCH=$1
+  DIR=$2
+  # If is not a Github pull request
+  if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
+    exit 0
+  fi
+  # If it is not watched branch.
+  if [ "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then
+    return
+  fi
 
-# remove old docs. mv new docs.
-rm -rf doc doc_cn
-mv ../doc/cn/html doc_cn
-mv ../doc/en/html doc
+  # checkout github page branch
+  git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+  
+  mkdir -p ${DIR}
+  # remove old docs. mv new docs.
+  set +e
+  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  set -e
+  mv ../doc/cn/html ${DIR}/doc_cn
+  mv ../doc/en/html ${DIR}/doc
+  git add .
+}
+
+deploy_docs "master" "." 
+deploy_docs "develop" "./develop/"
 
 # Check is there anything changed.
 set +e
-git diff --exit-code >/dev/null
+git diff --cached --exit-code >/dev/null
 if [ $? -eq 0 ]; then
   echo "No changes to the output on this push; exiting."
   exit 0
@@ -57,7 +76,6 @@ if [ -n $SSL_KEY ]; then  # Only push updated docs for github.com/PaddlePaddle/P
   git config user.name "Travis CI"
   git config user.email "paddle-dev@baidu.com"
   git commit -m "Deploy to GitHub Pages: ${SHA}"
-
   # Set ssh private key
   openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
   chmod 600 deploy_key
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index 464ad632868bd1fd4d88547212421302ca0b2116..382d5be6ecfc26b4a524bb6a775bd1a805a34d96 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -14,7 +14,9 @@
 
 # This file is used to build paddle python binding package.
 # It will be invoked by Makefile that generated by COMAKE
+
 from setuptools import setup, Extension
+
 import numpy as np
 import api.paddle_ld_flags
 import platform
@@ -53,6 +55,9 @@ elif is_osx == True:
 
 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
 
+os.environ["CC"] = "@CMAKE_C_COMPILER@"
+os.environ["CXX"] = "@CMAKE_CXX_COMPILER@"
+
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
@@ -67,7 +72,8 @@ setup(name="py_paddle",
   packages=['py_paddle'],
   include_dirs = include_dirs,
   install_requires = [
+    'nltk>=3.2.2',
     'numpy>=1.8.0',      # The numpy is required.
-    'protobuf>=2.4.1' # The paddle protobuf version
+    'protobuf>=3.0.0'    # The paddle protobuf version
   ],
 )
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c47add04b081cbdf78b5a5d3bca3a71025b3d9ac
--- /dev/null
+++ b/paddle/testing/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Static helper libraries for Paddle's C++ test cases; defined only when WITH_TESTING is enabled.
+# Each is ordered after gen_proto_cpp (presumably the target that generates the protobuf C++ sources they include — confirm).
+if(WITH_TESTING)
+  add_library(paddle_test_main STATIC TestMain.cpp)
+  add_dependencies(paddle_test_main gen_proto_cpp)
+  add_library(paddle_test_util STATIC TestUtil.cpp)
+  add_dependencies(paddle_test_util gen_proto_cpp)
+endif()
diff --git a/paddle/function/TestMain.cpp b/paddle/testing/TestMain.cpp
similarity index 100%
rename from paddle/function/TestMain.cpp
rename to paddle/testing/TestMain.cpp
diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/testing/TestUtil.cpp
similarity index 100%
rename from paddle/gserver/tests/TestUtil.cpp
rename to paddle/testing/TestUtil.cpp
diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/testing/TestUtil.h
similarity index 100%
rename from paddle/gserver/tests/TestUtil.h
rename to paddle/testing/TestUtil.h
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index c3207e63ce72b73a57c2e40c72c5259f0ae61bc9..9e9e948b8856d2712f8894b3d14db9c795d5f694 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -184,7 +184,6 @@ protected:
    * @param para
    */
   virtual void updateImpl(Parameter* para) {}
-  virtual void update(Parameter* para) {}
 };
 
 /**
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 7794b209009a3429e810074b61e1d5bffa8b3a4e..5e82c944751629632ea8d16992bd8f4178a2fbd5 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
 public:
   RemoteParameterUpdater(
       const OptimizationConfig& config,
-      int expectedPpassCount,
+      int expectedPassCount,
       std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
   ~RemoteParameterUpdater() {
     if (controllerThread_) {
@@ -146,7 +146,7 @@ protected:
   BatchStatus batchStatus_;
   /// controller thread for sync-sgd
   std::unique_ptr<std::thread> controllerThread_;
-  /// passed alread finished
+  /// passed already finished
   int64_t passCount_;
   /// expected passes to finished
   int64_t expectedPassCount_;
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 13aa28ae5d9699d267858d48e46797c756487ddd..80664fa877b324af73e3e3effa11e46eac6294e2 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -208,7 +208,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
     return 0.0;  // In this case, there is no meaning to calculate cost
   }
 
-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
 }
 
 void Tester::testOnePassBatch(int passId) {
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 09e0a213ab2d71890cfafb905b5969383acfe95a..b68e29cd5ea223272151e7a8b52d998832f47103 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "Trainer.h"
 
-#include <fenv.h>
 #include <stdio.h>
 
 #include <iomanip>
@@ -24,7 +23,7 @@ limitations under the License. */
 
 #include <google/protobuf/text_format.h>
 
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
@@ -91,16 +90,6 @@ DEFINE_string(model_list, "", "File that saves the model list when evaluation");
 
 namespace paddle {
 
-void Trainer::init(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
-
-  init(config);
-}
-
 void Trainer::init(const std::shared_ptr<TrainerConfigHelper>& config,
                    bool testing,
                    const std::shared_ptr<GradientMachine>& gradientMachine,
@@ -321,7 +310,7 @@ real Trainer::checkGradient() {
   std::vector<Argument> outArgs;
 
   trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
   LOG(INFO) << "original cost=" << cost;
   trainerInternal_.getGradientMachine()->backward();
 
@@ -351,7 +340,7 @@ real Trainer::checkGradient() {
     parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
     parameter->setValueUpdated();
     trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost1 = Argument::sumCosts(outArgs);
+    real newCost1 = Argument::sum(outArgs);
 
     for (size_t i = 0; i < dim; ++i) {
       newp[i] = oldp[i] - step * d[i];
@@ -360,7 +349,7 @@ real Trainer::checkGradient() {
     parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
     parameter->setValueUpdated();
     trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost2 = Argument::sumCosts(outArgs);
+    real newCost2 = Argument::sum(outArgs);
 
     real trueDelta = 0.5 * (newCost1 - newCost2);
     real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
@@ -586,7 +575,7 @@ real Trainer::calcGradient(const DataBatch& dataBatch,
 
   trainerInternal_.getGradientMachine()->forwardBackward(
       inArgs, &outArgs, PASS_TRAIN);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
 
   offset = 0;
   for (auto& para : parameters) {
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index 7cbf18ace7a5fed053653c73e62d36c388b15123..c8ee4726c24c335ceda22ea3a20049b01d11c149 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -71,11 +71,6 @@ public:
       const std::shared_ptr<DataProvider>& dataProvider = nullptr,
       const std::shared_ptr<DataProvider>& testDataProvider = nullptr);
 
-  /**
-   * Initialize Trainer from command line flags.
-   */
-  void init(int argc, char** argv);
-
   /**
    * Train until num_passes reached.
    * One pass means neural network train through all training data.
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index f3b465b444167d4624a5e99c30e1257eda53ca2c..4c5d4a0913aaf3a9932b3d67806378ece4245304 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -134,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
   real cost = 0;
   {
     REGISTER_TIMER("sumCost");
-    cost = Argument::sumCosts(*outArgs);
+    cost = Argument::sum(*outArgs);
   }
 
   if (batchId % intconfig_->log_period == 0) {
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index 947f9cadcc983d58ce31ef462e51dc42e41eaf1b..c5c1d484e5f85c774fd4b8f1d4a8d46abfa2f547 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -13,14 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fenv.h>
-#include "paddle/pserver/ParameterServer2.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/pserver/ParameterServerController.h"
 #include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/StringUtil.h"
 
 #include "ParamUtil.h"
 #include "Trainer.h"
-#include "paddle/pserver/RDMANetwork.h"
 
 DEFINE_bool(start_pserver, false, "Whether to start pserver");
 DECLARE_int32(gpu_id);
@@ -39,54 +36,11 @@ int main(int argc, char** argv) {
   initMain(argc, argv);
   initPython(argc, argv);
 
-  std::vector<std::unique_ptr<ParameterServer2>> pservers;
-  std::vector<std::string> devices;
-
+  std::unique_ptr<ParameterServerController> parameterServerPtr(nullptr);
   if (FLAGS_start_pserver) {
-    // round robin to loadbalance RDMA server ENGINE
-    int rdmaCpu = 0;
-    int onlineCpus = rdma::numCpus();
-    int numPorts = FLAGS_ports_num + FLAGS_ports_num_for_sparse;
-    if (FLAGS_nics.empty()) {
-      pservers.resize(numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        if (FLAGS_rdma_tcp == "rdma") {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i, rdmaCpu++));
-          rdmaCpu = rdmaCpu % onlineCpus;
-        } else {
-          pservers[i].reset(
-              new ParameterServer2(std::string(), FLAGS_port + i));
-        }
-
-        CHECK(pservers[i]->init()) << "Fail to initialize parameter server"
-                                   << FLAGS_port + i;
-        LOG(INFO) << "pserver started : " << FLAGS_port + i;
-        pservers[i]->start();
-      }
-    } else {
-      str::split(FLAGS_nics, ',', &devices);
-      pservers.resize(devices.size() * numPorts);
-      for (int i = 0; i < numPorts; ++i) {
-        for (size_t j = 0; j < devices.size(); ++j) {
-          if (FLAGS_rdma_tcp == "rdma") {
-            pservers[i * devices.size() + j].reset(new ParameterServer2(
-                getIpAddr(devices[j]), FLAGS_port + i, rdmaCpu++));
-            rdmaCpu = rdmaCpu % onlineCpus;
-          } else {
-            pservers[i * devices.size() + j].reset(
-                new ParameterServer2(getIpAddr(devices[j]), FLAGS_port + i));
-          }
-
-          CHECK(pservers[i * devices.size() + j]->init())
-              << "Fail to initialize parameter server" << devices[j]
-              << FLAGS_port + i;
-          LOG(INFO) << "pserver started : " << devices[j] << ":"
-                    << FLAGS_port + i;
-          pservers[i * devices.size() + j]->start();
-        }
-      }
-    }
+    parameterServerPtr.reset(
+        paddle::ParameterServerController::createFromGflags());
+    parameterServerPtr->start();
   }
   Trainer trainer;
   auto config = TrainerConfigHelper::createFromFlags();
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 28c3d6f2631f9e28e3f1ff086b1e8edf994e73a4..c5c76a030d9e5f1deed63454b408442954ef5eae 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -1,11 +1,3 @@
-################# test_Prediction ######################
-add_unittest_without_exec(test_Prediction
-    test_Prediction.cpp)
-add_test(NAME test_Prediction
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Prediction --merger=${CMAKE_CURRENT_BINARY_DIR}/../paddle_merge_model
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
-
 ################# test_Compare ############################
 add_unittest_without_exec(test_Compare
     test_Compare.cpp)
@@ -17,9 +9,10 @@ add_test(NAME test_Compare
 ################# test_Trainer ###########################
 add_unittest_without_exec(test_Trainer
     test_Trainer.cpp)
-set(diy_dll_dir ${CMAKE_CURRENT_BINARY_DIR}/../../gserver/tests)
 add_test(NAME test_Trainer
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py &&  # NOTE(review): add_test does not go through a shell, so '&&' is presumably passed as a plain argument; confirm .set_python_path.sh re-parses it, otherwise test_Trainer never runs
+        ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 
@@ -82,5 +75,5 @@ add_test(NAME test_PyDataProviderWrapper
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp
deleted file mode 100644
index 0c79404eee1c0902c5c8e8eefd139da3da584636..0000000000000000000000000000000000000000
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ /dev/null
@@ -1,174 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <paddle/utils/PythonUtil.h>
-
-#include "paddle/trainer/Trainer.h"
-
-#include <gtest/gtest.h>
-
-DECLARE_string(config);
-DECLARE_string(config_args);
-DEFINE_string(merger,
-              "./paddle_merge_model",
-              "path to paddle_merge_model binary");
-
-using namespace paddle;  // NOLINT
-using namespace std;     // NOLINT
-
-static const string& configFile = "trainer/tests/sample_trainer_config.conf";
-static const string& mergedModelFile = "./test_model_file";
-static const string& modelDir = "./test_model_dir";
-
-void checkBuffer(real* vec1, real* vec2, size_t len) {
-  for (size_t i = 0; i < len; i++) {
-    EXPECT_EQ(vec1[i], vec2[i]) << "vec1:" << vec1[i] << " vec2:" << vec2[i];
-  }
-}
-
-void checkParameters(vector<ParameterPtr> A, vector<ParameterPtr> B) {
-  CHECK_EQ(B.size(), A.size()) << "parameter size not equal";
-  for (size_t i = 0; i < A.size(); i++) {
-    auto vec1 = A[i]->getBuf(PARAMETER_VALUE);
-    auto vec2 = B[i]->getBuf(PARAMETER_VALUE);
-    CHECK_EQ(vec1->useGpu_, vec2->useGpu_) << "use gpu not equal";
-    CHECK_EQ(vec1->getSize(), vec2->getSize()) << "size not equal";
-
-    if (vec1->useGpu_ == false) {
-      checkBuffer(vec1->getData(), vec2->getData(), vec1->getSize());
-    } else {
-      VectorPtr cpuVec1 = Vector::create(vec1->getSize(), false);
-      VectorPtr cpuVec2 = Vector::create(vec2->getSize(), false);
-      cpuVec1->copyFrom(*vec1, HPPL_STREAM_DEFAULT);
-      cpuVec2->copyFrom(*vec2, HPPL_STREAM_DEFAULT);
-      hl_stream_synchronize(HPPL_STREAM_DEFAULT);
-      checkBuffer(cpuVec1->getData(), cpuVec2->getData(), cpuVec1->getSize());
-    }
-  }
-}
-
-TEST(GradientMachine, create) {
-#ifdef PADDLE_ONLY_CPU
-  FLAGS_use_gpu = false;
-#endif
-  mkDir(modelDir.c_str());
-  FLAGS_config = configFile;
-  FLAGS_config_args = "with_cost=False";
-  auto config = TrainerConfigHelper::createFromFlagConfig();
-
-  // save model to directory
-  unique_ptr<GradientMachine> gradientMachine1(
-      GradientMachine::create(*config));
-  gradientMachine1->saveParameters(modelDir);
-  Trainer trainer;
-  trainer.init(config);
-  ParameterUtil* paramUtil = trainer.getParameterUtilPtr();
-  if (paramUtil != NULL) {
-    paramUtil->saveConfigWithPath(modelDir);
-  }
-
-  // create a different GradientMachine
-  unique_ptr<GradientMachine> gradientMachine2(
-      GradientMachine::create(*config));
-  gradientMachine2->randParameters();
-
-  // merge config and model to one file
-  string cmd = FLAGS_merger + " --model_dir=" + modelDir +
-               " --config_args=with_cost=False" + " --model_file=" +
-               mergedModelFile;
-  LOG(INFO) << cmd;
-  int ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  // create GradientMachine from the merged model
-  DataConfig dataConfig;
-  unique_ptr<GradientMachine> gradientMachine3(
-      GradientMachine::create(mergedModelFile, &dataConfig));
-  CHECK(gradientMachine3);
-  EXPECT_EQ(dataConfig.type(), "simple");
-  EXPECT_EQ(dataConfig.feat_dim(), 3);
-
-  // compare the parameters of GradientMachine and GradientMachine3
-  std::vector<ParameterPtr> paraMachine1 = gradientMachine1->getParameters();
-  std::vector<ParameterPtr> paraMachine3 = gradientMachine3->getParameters();
-  checkParameters(paraMachine1, paraMachine3);
-
-  // Test that the GradientMachine created from the merged model
-  // is same as the orginnal one.
-  vector<Argument> inArgs(1);
-  vector<Argument> outArgs;
-
-  int inputDim = 3;
-  int numSamples = 2;
-  CpuMatrix cpuInput(numSamples, inputDim);
-  for (int i = 0; i < numSamples; ++i) {
-    for (int j = 0; j < inputDim; ++j) {
-      cpuInput.getData()[i * inputDim + j] =
-          rand() / (real)RAND_MAX;  // NOLINT TODO(yuyang): use rand_r
-    }
-  }
-  MatrixPtr input = Matrix::create(numSamples,
-                                   inputDim,
-                                   /* trans */ false,
-                                   FLAGS_use_gpu);
-  input->copyFrom(cpuInput);
-  inArgs[0].value = input;
-  gradientMachine1->forward(inArgs, &outArgs, PASS_TEST);
-  EXPECT_EQ((size_t)1, outArgs.size());
-
-  vector<Argument> outArgs2;
-  gradientMachine2->forward(inArgs, &outArgs2, PASS_TEST);
-  CpuMatrix out1(outArgs[0].value->getHeight(), outArgs[0].value->getWidth());
-  CpuMatrix out2(outArgs2[0].value->getHeight(), outArgs2[0].value->getWidth());
-  out1.copyFrom(*outArgs[0].value);
-  out2.copyFrom(*outArgs2[0].value);
-  for (size_t i = 0; i < out1.getHeight() * out1.getWidth(); i++) {
-    EXPECT_NE(out1.getData()[i], out2.getData()[i]);
-  }
-
-  gradientMachine3->forward(inArgs, &outArgs2, PASS_TEST);
-  out2.copyFrom(*outArgs2[0].value);
-  checkBuffer(
-      out1.getData(), out2.getData(), out2.getHeight() * out2.getWidth());
-
-  cmd = " rm -rf " + modelDir + "/*";
-  LOG(INFO) << "cmd " << cmd;
-  ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  cmd = " rm -rf " + mergedModelFile;
-  LOG(INFO) << "cmd " << cmd;
-  ret = system(cmd.c_str());
-  EXPECT_EQ(0, ret);
-  if (ret) {
-    return;
-  }
-
-  // clean up
-  rmDir(modelDir.c_str());
-  remove(mergedModelFile.c_str());
-}
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  initPython(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 371282dd6bb9a995bc6ae8b2a5bd708f831d7e33..264bc46ebcd0aa17fd605e537fcb2c316ef31162 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -96,11 +96,6 @@ TEST(checkGradient, multi) {
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py"));
-#else
-  EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py"));
-#endif
   checkGradientTest(configFile3, false, false);
 #ifndef PADDLE_ONLY_CPU
   checkGradientTest(configFile3, true, true);
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
index f2cfd7409412de68f4183daebcb48e7a3ae37672..956b606a18cae1bb11322accfa174ae5ce1580de 100644
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
@@ -1 +1,2 @@
 enable_virtualenv.c
+PythonUtil.cpp
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 45240b5002aa18be4a9b7e3ec3b754eb83ca0e09..10d906ee16656a808122b81d8b2fef55b8e7b7e9 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,5 +1,7 @@
 # The utilities for paddle
 
+# Generate PythonUtil.cpp; @ONLY stops CMake expanding "${...}" in the C++ text.
+configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp @ONLY)
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
 create_resources(enable_virtualenv.py enable_virtualenv.c)
diff --git a/paddle/utils/common.h b/paddle/utils/Common.h
similarity index 97%
rename from paddle/utils/common.h
rename to paddle/utils/Common.h
index 202a9d980d8350c230daaf473dd34d4069479e5f..1f1d0255a5eaef824171ddeaf9480167f232007e 100644
--- a/paddle/utils/common.h
+++ b/paddle/utils/Common.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "Excepts.h"
+
 /**
  * Disable copy macro.
  */
diff --git a/paddle/utils/Compiler.h b/paddle/utils/Compiler.h
new file mode 100644
index 0000000000000000000000000000000000000000..cebca5a2a3766110b83231eb0705e48800a7bda6
--- /dev/null
+++ b/paddle/utils/Compiler.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+/**
+ * Compiler abstraction layer. GCC_VERSION is the GNU-compatible compiler
+ * version as one integer (major*10000 + minor*100 + patch), or 0 otherwise.
+ */
+#ifdef __GNUC__
+#define GCC_VERSION \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+#else
+#define GCC_VERSION 0  // must be a number: it is compared in "#if" below
+#endif
+
+/**
+ * __must_check macro. A function marked with it must have its return value
+ * used, otherwise the compiler emits a warning (and Paddle treats all compile
+ * warnings as errors). It expands to nothing on unsupported compilers.
+ */
+#if GCC_VERSION >= 30400
+#define __must_check __attribute__((warn_unused_result))
+#else
+#define __must_check
+#endif
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 1218e8194c4e837ca880744f92e769a68ba474de..0f3985cc7b2c018ede9bba9644d2d096561dccee 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "common.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
new file mode 100644
index 0000000000000000000000000000000000000000..cda1b5c37dada8d0c6c77fc2fb03bb614d5301b5
--- /dev/null
+++ b/paddle/utils/Error.h
@@ -0,0 +1,136 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <glog/logging.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <memory>
+#include <string>
+#include "Compiler.h"
+
+namespace paddle {
+
+/**
+ * Error is Paddle error code. It only contains a std::string as error message.
+ *
+ *
+ * There are two styles to return error in Paddle.
+ *
+ * 1. Return Error
+ *    When method return a status, the return must use `__must_check` attribute.
+ *    Example as below.
+ * @code{cpp}
+ * Error __must_check foo();
+ *
+ * Error __must_check bar() {
+ *   // do something.
+ *   Error err = foo();  // invoke other method return status.
+ *   if (err) return err;
+ *   // do something else.
+ *   return Error();
+ * }
+ * @endcode{cpp}
+ *
+ * 2. Return by parameter.
+ *    It is another way to return an error, by using a pointer parameter.
+ *    Example as below.
+ *
+ * @code{cpp}
+ * Error bar();
+ *
+ * int foo(Error* error) {
+ *   // Do something.
+ *   Error err = bar();
+ *   if (err) {
+ *     *error = err;
+ *     return 0;
+ *   }
+ *   // Do something else.
+ *   if (someInternalErrorHappened) {
+ *     *error = Error("Some dimension is too large, %d", dimension);
+ *     return 0;
+ *   }
+ *   // End of method.
+ *   return someValue;
+ * }
+ *
+ * Error foobar() {
+ *   Error err;
+ *   // do something.
+ *   foo(&err);
+ *   if (err) return err;
+ * }
+ * @endcode{cpp}
+ *
+ *
+ * Currently there is a helper method 'check' in Error, because Paddle always
+ * used LOG(FATAL) or CHECK to make the program exit before. When we clean all
+ * LOG(FATAL) and CHECK in Paddle, the 'check' method will be removed.
+ */
+class Error {
+public:
+  /**
+   * Construct a no-error value: msg() is nullptr and isOK() is true.
+   */
+  Error() {}
+
+  /**
+   * @brief Create an Error from a printf-style format string and arguments.
+   */
+  explicit Error(const char* fmt, ...) {
+    va_list ap;
+    va_start(ap, fmt);
+    constexpr size_t kBufferSize = 1024;  // longer messages are truncated
+    char buffer[kBufferSize];
+    vsnprintf(buffer, kBufferSize, fmt, ap);
+    this->msg_.reset(new std::string(buffer));
+    va_end(ap);
+  }
+
+  /**
+   * @brief msg returns the error message, or nullptr if there is no error.
+   */
+  const char* msg() const {
+    if (msg_) {
+      return msg_->c_str();
+    } else {
+      return nullptr;
+    }
+  }
+
+  /**
+   * @brief operator bool, returns true if this object holds an error.
+   */
+  operator bool() const { return !this->isOK(); }
+
+  /**
+   * @brief isOK returns true if there is no error.
+   * @return true if no error.
+   */
+  bool isOK() const { return msg_ == nullptr; }
+
+  /**
+   * @brief check this Error with glog CHECK, logging msg() when it fails.
+   * @note It is a temp method used during cleaning Paddle code. It will be
+   *       removed later.
+   */
+  void check() const { CHECK(this->isOK()) << msg(); }
+
+private:
+  std::shared_ptr<std::string> msg_;  // null means "no error"
+};
+
+}  // namespace paddle
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index dc3369b7e8c27cf53a03ce56b18a123f291d2d6d..5c2c504f53a586f2991ccfae891991465fdb39b6 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef EXCEPTS_H_
 #define EXCEPTS_H_
 
+#include <fenv.h>
+
 #if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 59d6cbdc513660b87cb013d8aa92c5c8f9289ecb..e8f31bc811ac30d83e8203b784ee1f93a8d35d90 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -33,12 +33,15 @@ DEFINE_int32(port, 20134, "Listening port for pserver");
 DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
 DEFINE_int32(ports_num,
              1,
-             "The ports number for parameter send,"
-             " increment based on default port number");
+             "Number of ports for sending dense parameter,"
+             " following ports on parameter server will be visited"
+             " for sending dense parameter: [port, port+ports_num-1]");
 DEFINE_int32(ports_num_for_sparse,
              0,
-             "The ports number for parameter send,"
-             " increment based on default (port + ports_num)");
+             "Number of ports for sending sparse parameter,"
+             " following ports on parameter server will be visited"
+             " for sending sparse parameter:"
+             " [port+ports_num, port+ports_num+ports_num_for_sparse-1]");
 DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
 DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
 DEFINE_int32(trainer_id,
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index a21872e89ebc172b87c8b5c3731a89302f34f521..e87abb9139f1c3f250f8b8fe1afdd8883f682647 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <condition_variable>
 #include <mutex>
 
-#include "common.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp.in
similarity index 98%
rename from paddle/utils/PythonUtil.cpp
rename to paddle/utils/PythonUtil.cpp.in
index 7faeff55c28b9065179ad27b3b604a9f411249e5..66b5795e29fb9fa751ed802e87ced0a71aea4c51 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp.in
@@ -195,6 +195,10 @@ extern const char enable_virtualenv_py[];
 }
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
+  static char pyHome[] = "@PYTHON_INSTALL_DIR@";  // NOLINT
+  if (strlen(pyHome)) {
+    Py_SetPythonHome(pyHome);  // pointer is retained; needs static storage
+  }
   Py_SetProgramName(argv[0]);
   Py_Initialize();
   PySys_SetArgv(argc, argv);
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index d27dae33fd039bbefdbc65908e5ce7dc58eceab7..58fe51bd40c36088fdc6ee51e22d120b63486bf4 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -37,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
       p = new unsigned int(defaultSeed_ - 1);
     } else {
       p = new unsigned int(defaultSeed_ + getTID());
-      LOG(INFO) << "thread use undeterministic rand seed:" << *p;
+      VLOG(3) << "thread use undeterministic rand seed:" << *p;
     }
     seed_.set(p);
   }
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 0f778dbebf4e124c7a240d738b8f73cef03fc477..dbab4ec43ca2fa691445131d2cb14f51721a2e4c 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -125,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
 
 void runInitFunctions() {
   std::call_once(g_onceFlag, []() {
-    LOG(INFO) << "Calling runInitFunctions";
+    VLOG(3) << "Calling runInitFunctions";
     if (g_initFuncs) {
       std::sort(g_initFuncs->begin(),
                 g_initFuncs->end(),
@@ -139,25 +139,25 @@ void runInitFunctions() {
       g_initFuncs = nullptr;
     }
     g_initialized = true;
-    LOG(INFO) << "Call runInitFunctions done.";
+    VLOG(3) << "Call runInitFunctions done.";
   });
 }
 
 void initMain(int argc, char** argv) {
-  initializeLogging(argc, argv);
   installLayerStackTracer();
   std::string line;
   for (int i = 0; i < argc; ++i) {
     line += argv[i];
     line += ' ';
   }
-  LOG(INFO) << "commandline: " << line;
 
 #ifndef GFLAGS_GFLAGS_H_
   namespace gflags = google;
 #endif
 
   gflags::ParseCommandLineFlags(&argc, &argv, true);
+  initializeLogging(argc, argv);
+  LOG(INFO) << "commandline: " << line;
   CHECK_EQ(argc, 1) << "Unknown commandline argument: " << argv[1];
 
   installProfilerSwitch();
@@ -231,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
 }  // namespace path
 
 void copyFileToPath(const std::string& file, const std::string& dir) {
-  LOG(INFO) << "copy " << file << " to " << dir;
+  VLOG(3) << "copy " << file << " to " << dir;
   std::string fileName = path::basename(file);
   std::string dst = path::join(dir, fileName);
   std::ifstream source(file, std::ios_base::binary);
@@ -289,6 +289,7 @@ void mkDir(const char* filename) {
 void mkDirRecursively(const char* dir) {
   struct stat sb;
 
+  if (*dir == 0) return;  // empty string
   if (!stat(dir, &sb)) return;
 
   mkDirRecursively(path::dirname(dir).c_str());
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index dc15ada5862d648af27aa1b0e8c8a5cce012ded8..613844669d2495ada7b8f7a841f47b821b7fdeba 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "Common.h"
 #include "Logging.h"
 #include "TrainerConfig.pb.h"
-#include "common.h"
 
 #include "Flags.h"
 #include "hl_gpu.h"
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index aa5df3243893145dbcc7e7ef2592555fc1c88fc9..f53d6420bbbdf66f8f355af95c6b11c30a3bfab9 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>
 #include <iostream>
-#include "common.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
similarity index 97%
rename from paddle/utils/Excepts.cpp
rename to paddle/utils/arch/osx/Excepts.cpp
index 4ddce35ed31a8fed3f25cb3b03348b4eda8fcfdd..c8e904d8f9fe29e51447994af43dc62bf3514306 100644
--- a/paddle/utils/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Excepts.h"
+#include "paddle/utils/Excepts.h"
 
 #if defined(__APPLE__) || defined(__OSX__)
 
-#include <fenv.h>
-
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 26fafbd1ab3f2967b765b8bcb973fb745c0e6422..aa923b355377752f9b297a125f5c43c364ba9b06 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -4,6 +4,7 @@ add_simple_unittest(test_CustomStackTrace)
 add_simple_unittest(test_ThreadBarrier)
 add_simple_unittest(test_SpinLock)
 add_simple_unittest(test_SIMDFlags)
+add_simple_unittest(test_Error)
 
 add_executable(
     test_CustomStackTracePrint
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 18dd0aac4305006745dcd8e0a0717fb0fb939778..378788bcecd579fff1c762702a8c27f54cee94bf 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -96,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
     }
   });
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fdf326b17a1c8baa87e2a17fafae253565d1e699
--- /dev/null
+++ b/paddle/utils/tests/test_Error.cpp
@@ -0,0 +1,34 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/utils/Error.h"
+
+#include <gtest/gtest.h>
+
+TEST(Error, testAll) {
+  paddle::Error error;  // default-constructed: holds no error
+  ASSERT_FALSE(error);
+  error = paddle::Error("I'm the error");
+  ASSERT_TRUE(error);
+  ASSERT_STREQ("I'm the error", error.msg());
+  // Assigning a new Error replaces the previous message.
+  error = paddle::Error("error2");
+  ASSERT_TRUE(error);
+  ASSERT_STREQ("error2", error.msg());
+  // printf-style arguments are formatted into the message.
+  int i = 3;
+  auto error3 = paddle::Error("error%d", i);
+  ASSERT_TRUE(error3);
+  ASSERT_STREQ("error3", error3.msg());
+}
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index 42edede209ad957c13c1cec8e6bb20bd0fe9d28b..8200a24ce7b7df75b48a89fbb7af15f304c5957f 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
   LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
   LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 605bedb6c912b0436f40e3eff93d5cf95d8dc489..cc34eb1f868003d3db9221578c0c20c44be285eb 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -53,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
         });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
index 2f5c5bbce07f39b799b928fd231bb4db1d2b3e05..6e2580c4913f0adc7ba1e63c9cebce308775aac6 100644
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
     ASSERT_EQ(res[i], i);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 1237f1b731b2fb733d6823619df2c574476b89de..554b1c1d4adce7a0196b304281dcf878a0b6426e 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -64,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
                    });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 2c40070eca44d8656d7ce82157a1b840092b9965..62d5b9e38b21ee82d1e78c3bde5aa5df7e4a33ee 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -4,7 +4,8 @@ set(proto_filenames
     ModelConfig.proto
     ParameterConfig.proto
     ParameterService.proto
-    TrainerConfig.proto)
+    TrainerConfig.proto
+    ParameterServerConfig.proto)
 
 set(PROTO_GEN)
 set(PROTO_GEN_PY)
@@ -18,10 +19,10 @@ foreach(filename ${proto_filenames})
         ${PROTO_GEN}
         ${CUR_PROTO_GEN})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} 
                   --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-		  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+          --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
@@ -29,9 +30,9 @@ foreach(filename ${proto_filenames})
         ${CUR_PROTO_GEN_PY}
         ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-	--proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
+    --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 3a9d339976fff91d79e7459ad5984cf78ea8990a..65d5d50277b665e7c355202d6e8043f656ae92f1 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -255,6 +255,13 @@ message PriorBoxConfig {
   repeated float variance = 4;
 }
 
+message PadConfig {  // per-input configuration of the 'pad' layer
+  required ImageConfig image_conf = 1;  // filled by parse_image() in config_parser.py
+  repeated uint32 pad_c = 2;  // channel padding; PadLayer reads entries [0] and [1]
+  repeated uint32 pad_h = 3;  // height padding; PadLayer reads entries [0] and [1]
+  repeated uint32 pad_w = 4;  // width padding; PadLayer reads entries [0] and [1]
+}
+
 message LayerInputConfig {
   required string input_layer_name = 1;
   optional string input_parameter_name = 2;
@@ -271,6 +278,7 @@ message LayerInputConfig {
   optional MaxOutConfig maxout_conf = 11;
   optional SppConfig spp_conf = 12;
   optional PriorBoxConfig priorbox_conf = 13;
+  optional PadConfig pad_conf = 14;
 }
 
 message LayerConfig {
@@ -419,14 +427,14 @@ message LayerConfig {
   // bias size
   optional uint32 bias_size = 48 [default = 0];
 
-  // this parameter can be used as a user-defined parameter when necessary, 
+  // this parameter can be used as a user-defined parameter when necessary,
   // without changing the proto file.
-  // e.g., when a new layer with a user-defined parameter is implemented, 
+  // e.g., when a new layer with a user-defined parameter is implemented,
   // it can be used to pass that parameter, without modifying the proto file.
   // string type is used for flexibility: different types can be converted
-  // to string and reinterpreted in the user's own layer implementation.  
+  // to string and reinterpreted in the user's own layer implementation.
   optional string user_arg = 49;
-  
+
   // to indicate rectangle image data
   optional uint64 height = 50;
   optional uint64 width = 51;
@@ -467,6 +475,10 @@ message EvaluatorConfig {
   // Used by ChunkEvaluator
   // chunk of these types are not counted
   repeated int32 excluded_chunk_types = 12;
+
+  // Used by ClassificationErrorEvaluator
+  // top # classification error
+  optional int32 top_k = 13 [default = 1];
 }
 
 message LinkConfig {
diff --git a/proto/ParameterServerConfig.proto b/proto/ParameterServerConfig.proto
new file mode 100644
index 0000000000000000000000000000000000000000..404f9613792653dda72eeb98f022851adedbfbfd
--- /dev/null
+++ b/proto/ParameterServerConfig.proto
@@ -0,0 +1,53 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+syntax = "proto2";
+
+package paddle;
+
+
+/**
+ * Configuration structure for ParameterClient2.
+ */
+message ParameterClientConfig {
+  required int32 trainer_id = 1;
+}
+
+/**
+ * Configuration structure for ParameterServer2.
+ */
+message ParameterServerConfig {  // fields are optional so the declared defaults apply when unset
+  // Number of ports for sending dense parameter,
+  // following ports on parameter server will be visited
+  // for sending dense parameter: [port, port+ports_num-1]
+  optional int32 ports_num = 1 [default = 1];
+  // Number of ports for sending sparse parameter,
+  // following ports on parameter server will be visited
+  // for sending sparse parameter:
+  // [port+ports_num, port+ports_num+ports_num_for_sparse-1]
+  optional int32 ports_num_for_sparse = 2 [default = 0];
+  // network device name for pservers
+  optional string nics = 3 [default = "xgbe0,xgbe1"];
+  optional string rdma_tcp = 4 [default = "tcp"];
+  // Listening port for pserver
+  optional int32 port = 5 [default = 20134];
+  // number of gradient servers
+  optional int32 num_gradient_servers = 6 [default = 1];
+  // number of threads for sync op exec
+  optional int32 pserver_num_threads = 7 [default = 1];
+  // control config_.async_lagged_grad_discard_ratio() min value
+  optional double async_lagged_ratio_min = 8 [default = 1.0];
+  // if async_lagged_grad_discard_ratio is not set in trainer_config.conf
+  // use it as default value
+  optional double async_lagged_ratio_default = 9 [default = 1.5];
+}
\ No newline at end of file
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index dce0b909524369926eda54763e571706b79daeaf..48e0a1993d07f801e65dfa54a991995c593fe475 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,33 +4,28 @@ set(OUTPUT_DIR
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
+file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/*.py)  # single pattern: every .py below paddle/v2
 
 set(PY_FILES paddle/__init__.py
              ${TRAINER_PY_FILES}
              ${HELPERS_PY_FILES}
-             ${UTILS_PY_FILES})
-
-set(PADDLE_INTERNAL_PACKAGE "")
-if (PADDLE_WITH_INTERNAL)
-    set(PADDLE_INTERNAL_PACKAGE "paddle.internals")
-endif()
+             ${UTILS_PY_FILES}
+             ${V2_PY_FILES})
 
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES})
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
-find_python_module(pip REQUIRED)
-find_python_module(wheel REQUIRED)
-find_python_module(google.protobuf REQUIRED)
-
 add_subdirectory(paddle/trainer_config_helpers/tests)
+add_subdirectory(paddle/v2/reader/tests)
+add_subdirectory(paddle/v2/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index bd24c68b6fe88eab03c814f8cac70db3880316f4..0e752c117c1ecfab72e2da2f830380e9524236e7 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -45,6 +45,23 @@ class CacheType(object):
 
 
 class InputType(object):
+    """
+    InputType is the base class for paddle input types.
+
+    ..  note::
+
+        this is a base class, and should never be used by user.
+
+    :param dim: dimension of input. If the input is an integer, it means the
+                value range. Otherwise, it means the size of layer.
+    :type dim: int
+    :param seq_type: sequence type of input. 0 means it is not a sequence. 1
+                     means it is a variable length sequence. 2 means it is a
+                     nested sequence.
+    :type seq_type: int
+    :param type: data type of input.
+    :type type: int
+    """
     __slots__ = ['dim', 'seq_type', 'type']
 
     def __init__(self, dim, seq_type, tp):
@@ -54,19 +71,63 @@ class InputType(object):
 
 
 def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Dense Vector. It means the input feature is dense float vector. For example,
+    if the input is an image with 28*28 pixels, the input of Paddle neural
+    network should be a dense vector with dimension 784.
+
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.Dense)
 
 
 def sparse_non_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Sparse binary vector. It means the input feature is a sparse vector and
+    every element in this vector is either zero or one.
+
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.SparseNonValue)
 
 
 def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Sparse vector. It means the input feature is a sparse vector. Most of the
+    elements in this vector are zero, others could be any float value.
+
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.SparseValue)
 
 
-def index_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    return InputType(dim, seq_type, DataType.Index)
+def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Integer input type; value_range is stored as the InputType's ``dim``.
+
+    :param value_range: range of this integer.
+    :type value_range: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
+    return InputType(value_range, seq_type, DataType.Index)
 
 
 dense_vector = dense_slot
@@ -76,6 +137,14 @@ integer_value = index_slot
 
 
 def dense_vector_sequence(dim):
+    """
+    Data type of a sequence of dense vector.
+
+    :param dim: dimension of dense vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return dense_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -84,6 +153,15 @@ def dense_vector_sub_sequence(dim):
 
 
 def sparse_binary_vector_sequence(dim):
+    """
+    Data type of a sequence of sparse vectors, in which every element is
+    either zero or one.
+
+    :param dim: dimension of sparse vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -92,6 +170,15 @@ def sparse_binary_vector_sub_sequence(dim):
 
 
 def sparse_vector_sequence(dim):
+    """
+    Data type of a sequence of sparse vectors, in which most elements are
+    zero and the others can be any float value.
+
+    :param dim: dimension of sparse vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -99,8 +186,14 @@ def sparse_vector_sub_sequence(dim):
     return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def integer_value_sequence(dim):
-    return integer_value(dim, seq_type=SequenceType.SEQUENCE)
+def integer_value_sequence(value_range):
+    """
+    Data type of a sequence of integer.
+
+    :param value_range: range of each element.
+    :type value_range: int
+    """
+    return integer_value(value_range, seq_type=SequenceType.SEQUENCE)
 
 
 def integer_value_sub_sequence(dim):
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index b88853ea004ddefd453eab9d82da58b6259e5543..04ea135c1c2e5dbb023cf7bf15e21ace2ed55504 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -493,6 +493,7 @@ class Input(Cfg):
             block_expand=None,
             maxout=None,
             spp=None,
+            pad=None,
             format=None,
             nnz=None,
             is_static=None,
@@ -829,7 +830,6 @@ class Pool(Cfg):
             channels,
             size_x,
             size_y=None,
-            img_width=None,
             start=None,
             stride=None,  # 1 by defalut in protobuf
             stride_y=None,
@@ -844,6 +844,12 @@ class SpatialPyramidPool(Cfg):
         self.add_keys(locals())
 
 
+@config_class
+class Pad(Cfg):
+    def __init__(self, channels, pad_c, pad_h, pad_w):
+        self.add_keys(locals())
+
+
 @config_class
 class Norm(Cfg):
     def __init__(self,
@@ -887,11 +893,11 @@ class MaxOut(Cfg):
         self.add_keys(locals())
 
 
-def DataBase(async_load_data=False,
-             constant_slots=None,
-             data_ratio=1,
-             is_main_data=True,
-             usage_ratio=None):
+def create_data_config_proto(async_load_data=False,
+                             constant_slots=None,
+                             data_ratio=1,
+                             is_main_data=True,
+                             usage_ratio=None):
     # default: all sub dataproviders are treat as "main data".
     # see proto/DataConfig.proto for is_main_data
     data_config = DataConfig()
@@ -917,7 +923,7 @@ def SimpleData(files=None,
                context_len=None,
                buffer_capacity=None,
                **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = 'simple'
     data_config.files = files
     data_config.feat_dim = feat_dim
@@ -939,7 +945,7 @@ def PyData(files=None,
            constant_slots=None,
            load_thread_num=None,
            **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = 'py'
     if load_data_module in g_py_module_name_list:
 
@@ -990,7 +996,7 @@ def ProtoData(files=None,
               constant_slots=None,
               load_thread_num=None,
               **xargs):
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     if type is None:
         data_config.type = 'proto'
     else:
@@ -1029,7 +1035,7 @@ def Data(type,
          buffer_capacity=None,
          **xargs):
 
-    data_config = DataBase(**xargs)
+    data_config = create_data_config_proto(**xargs)
     data_config.type = type
     data_config.files = files
     data_config.feat_dim = feat_dim
@@ -1102,7 +1108,7 @@ def parse_bilinear(bilinear, input_layer_name, bilinear_conf):
     bilinear_conf.out_size_y = bilinear.out_size_y
 
 
-def parse_pool(pool, input_layer_name, pool_conf):
+def parse_pool(pool, input_layer_name, pool_conf, ceil_mode):
     pool_conf.pool_type = pool.pool_type
     config_assert(pool.pool_type in [
         'max-projection', 'avg-projection', 'cudnn-max-pool', 'cudnn-avg-pool'
@@ -1127,10 +1133,10 @@ def parse_pool(pool, input_layer_name, pool_conf):
     pool_conf.padding_y = default(pool.padding_y, pool_conf.padding)
     pool_conf.output_x = cnn_output_size(pool_conf.img_size, pool_conf.size_x,
                                          pool_conf.padding, pool_conf.stride,
-                                         False)
+                                         not ceil_mode)
     pool_conf.output_y = cnn_output_size(pool_conf.img_size_y, pool_conf.size_y,
                                          pool_conf.padding_y,
-                                         pool_conf.stride_y, False)
+                                         pool_conf.stride_y, not ceil_mode)
 
 
 def parse_spp(spp, input_layer_name, spp_conf):
@@ -1247,6 +1253,7 @@ def Evaluator(
         dict_file=None,
         result_file=None,
         num_results=None,
+        top_k=None,
         delimited=None,
         excluded_chunk_types=None, ):
     evaluator = g_config.model_config.evaluators.add()
@@ -1274,6 +1281,8 @@ def Evaluator(
         evaluator.result_file = result_file
     if num_results is not None:
         evaluator.num_results = num_results
+    if top_k is not None:
+        evaluator.top_k = top_k
     if delimited is not None:
         evaluator.delimited = delimited
 
@@ -1816,22 +1825,22 @@ class NormLayer(LayerBase):
 
 @config_layer('pool')
 class PoolLayer(LayerBase):
-    def __init__(self, name, inputs, **xargs):
+    def __init__(self, name, inputs, ceil_mode=True, **xargs):
         super(PoolLayer, self).__init__(name, 'pool', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             pool_conf = self.config.inputs[input_index].pool_conf
             parse_pool(self.inputs[input_index].pool, input_layer.name,
-                       pool_conf)
+                       pool_conf, ceil_mode)
             self.set_cnn_layer(name, pool_conf.output_y, pool_conf.output_x,
                                pool_conf.channels)
 
 
 @config_layer('spp')
 class SpatialPyramidPoolLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(SpatialPyramidPoolLayer, self).__init__(
-            name, 'spp', 0, inputs=inputs, device=device)
+            name, 'spp', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             spp_conf = self.config.inputs[input_index].spp_conf
@@ -1840,6 +1849,25 @@ class SpatialPyramidPoolLayer(LayerBase):
             self.set_cnn_layer(name, 1, output_x, spp_conf.image_conf.channels)
 
 
+@config_layer('pad')
+class PadLayer(LayerBase):
+    def __init__(self, name, inputs, **xargs):
+        super(PadLayer, self).__init__(name, 'pad', 0, inputs=inputs, **xargs)
+        pad = self.inputs[0].pad  # only the first input's Pad config is read
+        self.config.inputs[0].pad_conf.pad_c.extend(pad.pad_c)
+        self.config.inputs[0].pad_conf.pad_h.extend(pad.pad_h)
+        self.config.inputs[0].pad_conf.pad_w.extend(pad.pad_w)
+        # Output shape: every axis grows by the sum of its two pad entries.
+        input_layer = self.get_input_layer(0)
+        image_conf = self.config.inputs[0].pad_conf.image_conf
+        parse_image(pad, input_layer.name, image_conf)
+        out_ch = pad.channels + pad.pad_c[0] + pad.pad_c[1]
+        out_h = image_conf.img_size_y + pad.pad_h[0] + pad.pad_h[1]
+        out_w = image_conf.img_size + pad.pad_w[0] + pad.pad_w[1]
+        self.set_cnn_layer(name, out_h, out_w, out_ch)
+        self.config.size = out_ch * out_h * out_w
+
+
 @config_layer('batch_norm')
 class BatchNormLayer(LayerBase):
     layer_type = 'batch_norm'
@@ -1901,8 +1929,8 @@ class BatchNormLayer(LayerBase):
         image_conf = self.config.inputs[0].image_conf
         parse_image(self.inputs[0].image, input_layer.name, image_conf)
 
-        # Only pass the width and height of input to batch_norm layer 
-        # when either of it is non-zero. 
+        # Only pass the width and height of input to batch_norm layer
+        # when either of it is non-zero.
         if input_layer.width != 0 or input_layer.height != 0:
             self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
                                image_conf.channels, False)
@@ -1942,11 +1970,23 @@ class ResizeLayer(LayerBase):
             'ResizeLayer must have one and only one input')
 
 
+@config_layer('rotate')
+class RotateLayer(LayerBase):
+    def __init__(self, name, inputs, height, width, device=None):
+        super(RotateLayer, self).__init__(
+            name, 'rotate', 0, inputs=inputs, device=device)
+        config_assert(
+            len(self.inputs) == 1,
+            'RotateLayer must have one and only one input')
+        self.set_layer_height_width(height, width)
+        self.set_layer_size(self.get_input_layer(0).size)
+
+
 @config_layer('blockexpand')
 class BlockExpandLayer(LayerBase):
-    def __init__(self, name, inputs, device=None):
+    def __init__(self, name, inputs, **xargs):
         super(BlockExpandLayer, self).__init__(
-            name, 'blockexpand', 0, inputs=inputs, device=device)
+            name, 'blockexpand', 0, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             parse_block_expand(
@@ -2598,7 +2638,7 @@ class AverageLayer(LayerBase):
 
 @config_layer('cos')
 class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=5, device=None):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
         super(CosSimLayer, self).__init__(
             name, 'cos', 1, inputs=inputs, device=device)
         config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
diff --git a/python/paddle/trainer/recurrent_units.py b/python/paddle/trainer/recurrent_units.py
old mode 100644
new mode 100755
index edca279dcadef42243cb3fc00366cec90cbc69bf..ef92107a1093d2ec2b2a41677e964fdaa60ac829
--- a/python/paddle/trainer/recurrent_units.py
+++ b/python/paddle/trainer/recurrent_units.py
@@ -15,10 +15,13 @@
 # recurrent_units.py
 # Version 2.0
 #
-# Some recurrent units can be used in recurrent layer group, 
+# Some recurrent units can be used in recurrent layer group,
 #   to use these units, import this module in your config_file:
-#     import trainer.recurrent_units 
-# 
+#     import trainer.recurrent_units
+#
+# The modules in this file are DEPRECATED.
+# If you would like to use lstm/gru
+# please use the functions defined in paddle.trainer_config_helpers.
 
 from paddle.trainer.config_parser import *
 
@@ -26,7 +29,7 @@ from paddle.trainer.config_parser import *
 # long short term memory, can be used in recurrent machine
 # *inputs* must be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two LstmRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 def LstmRecurrentUnit(name,
@@ -194,7 +197,7 @@ def LstmRecurrentLayerGroup(name,
 # gated recurrent unit, can be used in recurrent machine
 # *inputs* should be a list of Projections, for example:
 #   inputs = [FullMatrixProjection("input_layer_name")],
-# *para_prefix* defines parameter names, if the *para_prefix* of 
+# *para_prefix* defines parameter names, if the *para_prefix* of
 #   two GatedRecurrentUnit is same, they share same parameters
 # *out_memory* can be defined outside if it's used outside
 
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index 622b4fc25ccff397cd3115db316870f328466fba..ab9a2562dcccb394c0b24741ceeb10061e40cb9a 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -58,8 +58,8 @@ def define_py_data_source(file_list,
     :param obj: python object name. May be a function name if using
                 PyDataProviderWrapper.
     :type obj: basestring
-    :param args: The best practice is using dict to pass arguments into 
-                 DataProvider, and use :code:`@init_hook_wrapper` to 
+    :param args: The best practice is using dict to pass arguments into
+                 DataProvider, and use :code:`@init_hook_wrapper` to
                  receive arguments.
     :type args: string or picklable object
     :param async: Load Data asynchronously or not.
@@ -98,7 +98,7 @@ def define_py_data_sources(train_list,
     The annotation is almost the same as define_py_data_sources2, except that
     it can specific train_async and data_cls.
 
-    :param data_cls: 
+    :param data_cls:
     :param train_list: Train list name.
     :type train_list: basestring
     :param test_list: Test list name.
@@ -111,8 +111,8 @@ def define_py_data_sources(train_list,
                 a tuple or list to this argument.
     :type obj: basestring or tuple or list
     :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive 
-                 arguments. If train and test is different, then pass a tuple 
+                 DataProvider, and use :code:`@init_hook_wrapper` to receive
+                 arguments. If train and test is different, then pass a tuple
                  or list to this argument.
     :type args: string or picklable object or list or tuple.
     :param train_async: Is training data load asynchronously or not.
@@ -163,12 +163,12 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
 
     ..  code-block:: python
 
-        define_py_data_sources2(train_list="train.list", 
-                                test_list="test.list", 
+        define_py_data_sources2(train_list="train.list",
+                                test_list="test.list",
                                 module="data_provider"
                                 # if train/test use different configurations,
                                 # obj=["process_train", "process_test"]
-                                obj="process", 
+                                obj="process",
                                 args={"dictionary": dict_name})
 
     The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
@@ -185,8 +185,8 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
                 a tuple or list to this argument.
     :type obj: basestring or tuple or list
     :param args: The best practice is using dict() to pass arguments into
-                 DataProvider, and use :code:`@init_hook_wrapper` to receive 
-                 arguments. If train and test is different, then pass a tuple 
+                 DataProvider, and use :code:`@init_hook_wrapper` to receive
+                 arguments. If train and test is different, then pass a tuple
                  or list to this argument.
     :type args: string or picklable object or list or tuple.
     :return: None
@@ -195,13 +195,13 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
 
     def py_data2(files, load_data_module, load_data_object, load_data_args,
                  **kwargs):
-        data = DataBase()
+        data = create_data_config_proto()
         data.type = 'py2'
         data.files = files
         data.load_data_module = load_data_module
         data.load_data_object = load_data_object
         data.load_data_args = load_data_args
-        data.async_load_data = True
+        data.async_load_data = False
         return data
 
     define_py_data_sources(
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
index ad3efcbf369411b9c42b2a32ed05b04f86bf7de6..2f25579fcdd9793e4c165439c9934a2bccb63617 100644
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -52,6 +52,10 @@ def wrap_param_default(param_names=None,
                     kwargs[name] = default_factory(func)
             return func(*args, **kwargs)
 
+        if hasattr(func, 'argspec'):
+            __wrapper__.argspec = func.argspec
+        else:
+            __wrapper__.argspec = inspect.getargspec(func)
         return __wrapper__
 
     return __impl__
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index bd247ea9af9d8dfb2d476cdc62638bd65c11add5..567521ee9dbadb7a2502cfb9972ef0940e1e410a 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -71,6 +71,7 @@ def evaluator_base(
         result_file=None,
         num_results=None,
         delimited=None,
+        top_k=None,
         excluded_chunk_types=None, ):
     """
     Evaluator will evaluate the network status while training/testing.
@@ -104,12 +105,15 @@ def evaluator_base(
     :param weight: An input layer which is a weight for each sample.
                    Each evaluator may calculate differently to use this weight.
     :type weight: LayerOutput.
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     """
     # inputs type assertions.
     assert classification_threshold is None or isinstance(
         classification_threshold, float)
     assert positive_label is None or isinstance(positive_label, int)
     assert num_results is None or isinstance(num_results, int)
+    assert top_k is None or isinstance(top_k, int)
 
     if not isinstance(input, list):
         input = [input]
@@ -130,6 +134,8 @@ def evaluator_base(
         dict_file=dict_file,
         result_file=result_file,
         delimited=delimited,
+        num_results=num_results,
+        top_k=top_k,
         excluded_chunk_types=excluded_chunk_types, )
 
 
@@ -139,6 +145,7 @@ def classification_error_evaluator(input,
                                    label,
                                    name=None,
                                    weight=None,
+                                   top_k=None,
                                    threshold=None):
     """
     Classification Error Evaluator. It will print error rate for classification.
@@ -167,6 +174,8 @@ def classification_error_evaluator(input,
                   then means not set weight. The larger weight it is, the more
                   important this sample is.
     :type weight: LayerOutput
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     :param threshold: The classification threshold.
     :type threshold: float
     :return: None.
@@ -178,6 +187,7 @@ def classification_error_evaluator(input,
         input=input,
         label=label,
         weight=weight,
+        top_k=top_k,
         classification_threshold=threshold, )
 
 
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
index 2d9e36f2b0d379d907634208a45c69efa9dbba3d..544b443825393c9a31c0375724d4ca63dac5c5eb 100644
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ b/python/paddle/trainer_config_helpers/layer_math.py
@@ -39,6 +39,7 @@ register_unary_math_op('abs', act.AbsActivation())
 register_unary_math_op('sigmoid', act.SigmoidActivation())
 register_unary_math_op('tanh', act.TanhActivation())
 register_unary_math_op('square', act.SquareActivation())
+register_unary_math_op('relu', act.ReluActivation())
 
 
 def add(layeroutput, other):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
old mode 100644
new mode 100755
index 9b6e5774bc82dc05e14a2565fa9cce98764adf04..b94f8f9a783552519ca73e7cfc0937b302d3445b
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -14,6 +14,7 @@
 
 import functools
 import collections
+import inspect
 
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
@@ -37,6 +38,7 @@ __all__ = [
     "dotmul_projection",
     "dotmul_operator",
     "repeat_layer",
+    "seq_reshape_layer",
     "table_projection",
     "mixed_layer",
     "data_layer",
@@ -59,6 +61,7 @@ __all__ = [
     'img_cmrnorm_layer',
     'addto_layer',
     'concat_layer',
+    'seq_concat_layer',
     'lstm_step_layer',
     'recurrent_group',
     'memory',
@@ -70,6 +73,7 @@ __all__ = [
     'interpolation_layer',
     'bilinear_interp_layer',
     'trans_layer',
+    'rotate_layer',
     'sum_to_one_norm_layer',
     'get_output_layer',
     'LayerType',
@@ -108,6 +112,9 @@ __all__ = [
     'print_layer',
     'priorbox_layer',
     'spp_layer',
+    'pad_layer',
+    'eos_layer',
+    'layer_support',
 ]
 
 
@@ -122,6 +129,7 @@ class LayerType(object):
     GRUMEMORY = "gated_recurrent"
     SEQUENCE_LAST_INSTANCE = "seqlastins"
     SEQUENCE_FIRST_INSTANCE = "seqfirstins"
+    SEQUENCE_RESHAPE = "seqreshape"
     POOLING_MAX = "max"
     POOLING_AVG = 'average'
     FC_LAYER = "fc"
@@ -142,6 +150,7 @@ class LayerType(object):
 
     CONCAT_LAYER = 'concat'
     CONCAT_PROJ_LAYER = 'concat2'
+    SEQUENCE_CONCAT_LAYER = 'seqconcat'
 
     LSTM_STEP_LAYER = 'lstm_step'
     GRU_STEP_LAYER = 'gru_step'
@@ -153,6 +162,7 @@ class LayerType(object):
     POWER_LAYER = 'power'
     SCALING_LAYER = 'scaling'
     TRANS_LAYER = 'trans'
+    ROTATE_LAYER = 'rotate'
     OUT_PROD_LAYER = 'out_prod'
     FEATURE_MAP_EXPAND_LAYER = 'featmap_expand'
 
@@ -170,6 +180,7 @@ class LayerType(object):
     BLOCK_EXPAND = "blockexpand"
     MAXOUT = "maxout"
     SPP_LAYER = "spp"
+    PAD_LAYER = "pad"
 
     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -306,6 +317,11 @@ def layer_support(*attrs):
                     val.check(method.__name__)
             return method(*args, **kwargs)
 
+        if hasattr(method, 'argspec'):
+            wrapper.argspec = method.argspec
+        else:
+            wrapper.argspec = inspect.getargspec(method)
+
         return wrapper
 
     return decorator
@@ -700,6 +716,7 @@ class MixedLayerType(LayerOutput):
         # update the size which might be computed inside MixedLayer
         # according to the operator's output size
         self.size = ml.config.size
+        self.finalized = True
 
 
 @wrap_name_default("mixed")
@@ -778,17 +795,16 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
 
     ..  code-block:: python
 
-        data = data_layer(name="input",
-                          size=1000)
+        data = data_layer(name="input", size=1000)
 
     :param name: Name of this data layer.
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
     :param height: Height of this data layer, used for image
-    :type size: int|None
+    :type height: int|None
     :param width: Width of this data layer, used for image
-    :type size: int|None
+    :type width: int|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -1279,6 +1295,12 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = last_seq(input=layer)
+
     :param agg_level: Aggregated level
     :param name: Layer name.
     :type name: basestring
@@ -1317,6 +1339,12 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = first_seq(input=layer)
+
     :param agg_level: aggregation level
     :param name: Layer name.
     :type name: basestring
@@ -1417,7 +1445,7 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       expand = repeat_layer(layer, 4)
+       expand = repeat_layer(input=layer, num_repeats=4)
 
     :param input: Input layer
     :type input: LayerOutput
@@ -1444,6 +1472,61 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
         parents=[input])
 
 
+@wrap_name_default("seqreshape")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_reshape_layer(input,
+                      reshape_size,
+                      act=None,
+                      name=None,
+                      layer_attr=None,
+                      bias_attr=None):
+    """
+    A layer for reshaping the sequence. Assume the input sequence has T instances,
+    the dimension of each instance is M, and the input reshape_size is N, then the 
+    output sequence has T*M/N instances, the dimension of each instance is N.
+
+    Note that T*M/N must be an integer.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       reshape = seq_reshape_layer(input=layer, reshape_size=4)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param reshape_size: the size of reshaped sequence.
+    :type reshape_size: int
+    :param name: Layer name.
+    :type name: basestring
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    Layer(
+        inputs=[input.name],
+        name=name,
+        size=reshape_size,
+        type=LayerType.SEQUENCE_RESHAPE,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        size=reshape_size,
+        layer_type=LayerType.SEQUENCE_RESHAPE,
+        parents=[input])
+
+
 @wrap_name_default()
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
@@ -1640,7 +1723,7 @@ def scaling_layer(input, weight, name=None, layer_attr=None):
 @layer_support()
 def trans_layer(input, name=None, layer_attr=None):
     """
-    A layer for transposition.
+    A layer for transposing a minibatch matrix.
 
     .. math::
        y = x^\mathrm{T}
@@ -1673,7 +1756,53 @@ def trans_layer(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
+def rotate_layer(input, height, width, name=None, layer_attr=None):
+    """
+    A layer for rotating 90 degrees (clock-wise) for each feature channel,
+    usually used when the input sample is some image or feature map.
+
+    .. math::
+       y(j,i,:) = x(M-i-1,j,:)
+
+    where :math:`x` is (M x N x C) input, and :math:`y` is (N x M x C) output.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       rot = rotate_layer(input=layer,
+                          height=100,
+                          width=100)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param height: The height of the sample matrix
+    :type height: int
+    :param name: Layer name.
+    :type name: basestring
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    l = Layer(
+        name=name,
+        height=height,
+        width=width,
+        type=LayerType.ROTATE_LAYER,
+        inputs=[input.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.ROTATE_LAYER,
+        parents=[input],
+        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support()
+def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     """
     Cosine Similarity Layer. The cosine similarity equation is here.
 
@@ -1688,6 +1817,12 @@ def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
     Note that the above computation is for one sample. Multiple samples are
     processed in one batch.
 
+    The example usage is:
+
+    .. code-block:: python
+
+       cos = cos_sim(a=layer1, b=layer2, size=3)
+
     :param name: layer name
     :type name: basestring
     :param a: input layer a
@@ -1824,14 +1959,14 @@ def img_conv_layer(input,
                    trans=False,
                    layer_type=None):
     """
-    Convolution layer for image. Paddle can support both square and non-square 
+    Convolution layer for image. Paddle can support both square and non-square
     input currently.
 
     The details of convolution layer, please refer UFLDL's `convolution
     <http://ufldl.stanford.edu/tutorial/supervised/
     FeatureExtractionUsingConvolution/>`_ .
 
-    Convolution Transpose (deconv) layer for image. Paddle can support both square 
+    Convolution Transpose (deconv) layer for image. Paddle can support both square
     and non-square input currently.
 
     The details of convolution transpose layer,
@@ -1849,6 +1984,16 @@ def img_conv_layer(input,
     pieces. First 256/4 = 64 channels will process by first 32 filters. The
     rest channels will be processed by rest group of filters.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                              num_channels=8,
+                              num_filters=16, stride=1,
+                              bias_attr=False,
+                              act=ReluActivation())
+
     :param name: Layer name.
     :type name: basestring
     :param input: Layer Input.
@@ -1890,7 +2035,7 @@ def img_conv_layer(input,
     :param trans: true if it is a convTransLayer, false if it is a convLayer
     :type trans: bool
     :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt", otherwise layer_type 
+                       layer_type has to be "exconvt", otherwise layer_type
                        has to be either "exconv" or "cudnn_conv"
     :type layer_type: String
     :return: LayerOutput object.
@@ -1979,7 +2124,8 @@ def img_pool_layer(input,
                    layer_attr=None,
                    pool_size_y=None,
                    stride_y=None,
-                   padding_y=None):
+                   padding_y=None,
+                   ceil_mode=True):
     """
     Image pooling Layer.
 
@@ -1987,6 +2133,34 @@ def img_pool_layer(input,
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
+    - ceil_mode=True:
+
+    ..  math::
+
+        w = 1 + int(ceil((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(ceil((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+
+    - ceil_mode=False:
+
+    ..  math::
+
+        w = 1 + int(floor((input\_width + 2 * padding - pool\_size) / float(stride)))
+        h = 1 + int(floor((input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y)))
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        maxpool = img_pool_layer(input=conv,
+                                 pool_size=3,
+                                 pool_size_y=5,
+                                 num_channels=8,
+                                 stride=1,
+                                 stride_y=2,
+                                 padding=1,
+                                 padding_y=2,
+                                 pool_type=MaxPooling())
+
     :param padding: pooling padding width.
     :type padding: int
     :param padding_y: pooling padding height. It's equal to padding by default.
@@ -2010,6 +2184,10 @@ def img_pool_layer(input,
     :type stride_y: int|None
     :param layer_attr: Extra Layer attribute.
     :type layer_attr: ExtraLayerAttribute
+    :param ceil_mode: Whether to use ceil mode to calculate output height and width.
+                      Default is True. If set to False, floor is used instead.
+
+    :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
     """
@@ -2047,6 +2225,7 @@ def img_pool_layer(input,
                     stride_y=stride_y,
                     padding_y=padding_y))
         ],
+        ceil_mode=ceil_mode,
         **ExtraLayerAttribute.to_kwargs(layer_attr))
     return LayerOutput(
         name,
@@ -2069,6 +2248,15 @@ def spp_layer(input,
     The details please refer to
     `Kaiming He's paper <https://arxiv.org/abs/1406.4729>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        spp = spp_layer(input=data, 
+                        pyramid_height=2, 
+                        num_channels=16, 
+                        pool_type=MaxPooling())
+
     :param name: layer name.
     :type name: basestring
     :param input: layer's input.
@@ -2157,6 +2345,12 @@ def img_cmrnorm_layer(input,
     The details please refer to
     `Alex's paper <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+    
+        norm = img_cmrnorm_layer(input=net, size=5)
+
     :param name: layer name.
     :type name: None|basestring
     :param input: layer's input.
@@ -2212,6 +2406,12 @@ def batch_norm_layer(input,
     The details of batch normalization please refer to this
     `paper <http://arxiv.org/abs/1502.03167>`_.
 
+    The example usage is:
+
+    ..  code-block:: python
+    
+        norm = batch_norm_layer(input=net, act=ReluActivation())
+
     :param name: layer name.
     :type name: basestring
     :param input: batch normalization input. Better be linear activation.
@@ -2501,6 +2701,63 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
         size=sz)
 
 
+@wrap_name_default("seqconcat")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
+                     bias_attr=None):
+    """
+    Concat sequence a with sequence b.
+
+    Inputs: 
+      - a = [a1, a2, ..., an]
+      - b = [b1, b2, ..., bn]
+      - Note that the length of a and b should be the same.
+        
+    Output: [a1, b1, a2, b2, ..., an, bn]
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        concat = seq_concat_layer(a=layer1, b=layer2)
+
+    :param name: Layer name.
+    :type name: basestring
+    :param a: input sequence layer
+    :type a: LayerOutput
+    :param b: input sequence layer
+    :type b: LayerOutput
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
+    assert a.size == b.size
+    Layer(
+        name=name,
+        type=LayerType.SEQUENCE_CONCAT_LAYER,
+        inputs=[a.name, b.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+
+    return LayerOutput(
+        name,
+        layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
+        parents=[a, b],
+        activation=act,
+        size=a.size)
+
+
 def memory(name,
            size,
            is_seq=False,
@@ -2661,6 +2918,7 @@ def lstm_step_layer(input,
 
 
 @wrap_bias_attr_default()
+@wrap_param_attr_default()
 @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('gru_step')
@@ -2672,6 +2930,7 @@ def gru_step_layer(input,
                    name=None,
                    gate_act=None,
                    bias_attr=None,
+                   param_attr=None,
                    layer_attr=None):
     """
 
@@ -2683,6 +2942,8 @@ def gru_step_layer(input,
     :param name:
     :param gate_act:
     :param bias_attr:
+    :param param_attr: the parameter_attribute for transforming the output_mem
+                       from previous step.
     :param layer_attr:
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2693,7 +2954,12 @@ def gru_step_layer(input,
     Layer(
         name=name,
         type=LayerType.GRU_STEP_LAYER,
-        inputs=[input.name, output_mem.name],
+        # The parameter here is for transforming the output_mem. The input has
+        # already been transformed outside this module, so it does not need a
+        # parameter associated with it.
+        # The parameter is nevertheless grouped with the input for
+        # backward model compatibility.
+        inputs=[Input(input.name, **param_attr.attr), output_mem.name],
         bias=ParamAttr.to_bias(bias_attr),
         size=size,
         active_type=act.name,
@@ -3342,6 +3608,7 @@ def classification_cost(input,
                         label,
                         weight=None,
                         name=None,
+                        top_k=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -3356,6 +3623,8 @@ def classification_cost(input,
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param top_k: number k in top-k error rate
+    :type top_k: int
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3383,7 +3652,7 @@ def classification_cost(input,
         assert isinstance(e.for_classification, bool)
         assert e.for_classification
 
-        e(name=e.__name__, input=input, label=label, weight=weight)
+        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
 
     if not isinstance(evaluator, collections.Sequence):
         evaluator = [evaluator]
@@ -3488,9 +3757,6 @@ def conv_projection(input,
                     groups=1,
                     param_attr=None):
     """
-    ConvProjection with a layer as input.
-    It performs element-wise multiplication with weight.
-
     Different from img_conv_layer and conv_op, conv_projection is an Projection,
     which can be used in mixed_layer and conat_layer. It use cudnn to implement
     conv and only support GPU mode.
@@ -3499,7 +3765,7 @@ def conv_projection(input,
 
     .. code-block:: python
 
-       proj = conv_projection(img=input1,
+       proj = conv_projection(input=input1,
                               filter_size=3,
                               num_filters=64,
                               num_channels=64)
@@ -3582,6 +3848,110 @@ def conv_projection(input,
     return proj
 
 
+@wrap_name_default("pad")
+@layer_support()
+def pad_layer(input,
+              pad_c=None,
+              pad_h=None,
+              pad_w=None,
+              name=None,
+              layer_attr=None):
+    """
+    This operation pads zeros to the input data according to pad_c, pad_h
+    and pad_w, which specify the padding sizes in the channel, height and
+    width dimensions respectively. The input data shape is NCHW.
+
+    For example, pad_c=[2,3] means padding 2 zeros before the
+    input data and 3 zeros after the input data in channel dimension.
+    pad_h means padding zeros in height dimension. pad_w means padding zeros
+    in width dimension.
+
+    For example,
+
+    .. code-block:: python
+
+       input(2,2,2,3)  = [
+                           [ [[1,2,3], [3,4,5]],
+                             [[2,3,5], [1,6,7]] ],
+                           [ [[4,3,1], [1,8,7]],
+                             [[3,8,9], [2,3,5]] ]
+                         ]
+
+       pad_c=[1,1], pad_h=[0,0], pad_w=[0,0]
+
+       output(2,4,2,3) = [
+                           [ [[0,0,0], [0,0,0]],
+                             [[1,2,3], [3,4,5]],
+                             [[2,3,5], [1,6,7]],
+                             [[0,0,0], [0,0,0]] ],
+                           [ [[0,0,0], [0,0,0]],
+                             [[4,3,1], [1,8,7]],
+                             [[3,8,9], [2,3,5]],
+                             [[0,0,0], [0,0,0]] ]
+                         ]
+
+    The simple usage is:
+
+    .. code-block:: python
+
+       pad = pad_layer(input=ipt,
+                       pad_c=[4,4],
+                       pad_h=[0,0],
+                       pad_w=[2,2])
+
+    :param input: layer's input.
+    :type input: LayerOutput
+    :param pad_c: padding size in channel dimension.
+    :type pad_c: list|None
+    :param pad_h: padding size in height dimension.
+    :type pad_h: list|None
+    :param pad_w: padding size in width dimension.
+    :type pad_w: list|None
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param name: layer name.
+    :type name: basestring
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if pad_c is not None:
+        assert isinstance(pad_c, collections.Sequence) and len(pad_c) == 2
+    else:
+        pad_c = [0, 0]
+
+    if pad_h is not None:
+        assert isinstance(pad_h, collections.Sequence) and len(pad_h) == 2
+    else:
+        pad_h = [0, 0]
+
+    if pad_w is not None:
+        assert isinstance(pad_w, collections.Sequence) and len(pad_w) == 2
+    else:
+        pad_w = [0, 0]
+
+    assert input.num_filters is not None
+    in_ch = input.num_filters
+    out_ch = in_ch + pad_c[0] + pad_c[1]
+
+    l = Layer(
+        name=name,
+        type=LayerType.PAD_LAYER,
+        inputs=Input(
+            input.name,
+            pad=Pad(
+                channels=in_ch,
+                pad_c=pad_c,
+                pad_h=pad_h,
+                pad_w=pad_w, )),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name,
+        layer_type=LayerType.PAD_LAYER,
+        parents=[input],
+        num_filters=out_ch,
+        size=l.config.size)
+
+
 @wrap_name_default()
 @layer_support()
 def conv_shift_layer(a, b, name=None, layer_attr=None):
@@ -3605,13 +3975,13 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       conv_shift = conv_shift_layer(input=[layer1, layer2])
+       conv_shift = conv_shift_layer(a=layer1, b=layer2)
 
     :param name: layer name
     :type name: basestring
     :param a: Input layer a.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: input layer b.
     :type b: LayerOutput
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3703,8 +4073,8 @@ def tensor_layer(a,
 @wrap_act_default()
 @layer_support()
 def selective_fc_layer(input,
-                       select,
                        size,
+                       select=None,
                        act=None,
                        name=None,
                        pass_generation=False,
@@ -3731,6 +4101,7 @@ def selective_fc_layer(input,
     :type input: LayerOutput|list|tuple
     :param select: The select layer. The output of select layer should be a
                    sparse binary matrix, and treat as the mask of selective fc.
+                   If it is None, this layer acts exactly like fc_layer.
     :type select: LayerOutput
     :param size: The layer dimension.
     :type size: int
@@ -3959,7 +4330,7 @@ def block_expand_layer(input,
 
     .. code-block:: python
 
-       block_expand = block_expand_layer(input,
+       block_expand = block_expand_layer(input=layer,
                                          num_channels=128,
                                          stride_x=1,
                                          stride_y=1,
@@ -4013,13 +4384,7 @@ def block_expand_layer(input,
 
 @wrap_name_default()
 @layer_support()
-def maxout_layer(input,
-                 groups,
-                 num_channels=None,
-                 size_x=None,
-                 size_y=None,
-                 name=None,
-                 layer_attr=None):
+def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     """
     A layer to do max out on conv layer output.
       - Input: output of a conv layer.
@@ -4049,12 +4414,6 @@ def maxout_layer(input,
     :type num_channels: int|None
     :param groups: The group number of input layer.
     :type groups: int
-    :param size_x: conv output width. If None will be set
-                   automatically from previous output.
-    :type size_x: int|None
-    :param size_y: conv output height. If None will be set
-                   automatically from previous output.
-    :type size_y: int|None
     :param name: The name of this layer, which can not specify.
     :type name: None|basestring.
     :param layer_attr: Extra Layer attribute.
@@ -4175,7 +4534,7 @@ def warp_ctc_layer(input,
         - You can set 'blank' to any value ranged in [0, num_classes], which
           should be consistent as that used in your labels.
         - As a native 'softmax' activation is interated to the warp-ctc library,
-         'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected instead in the 'input' layer.
 
     The simple usage:
 
@@ -4308,6 +4667,13 @@ def crf_decoding_layer(input,
     this layer will also calculate error. output.value[i] is 1 for incorrect
     decoding or 0 for correct decoding.
 
+    The simple usage:
+
+    .. code-block:: python
+
+      crf_decoding = crf_decoding_layer(input=input,
+                                        size=label_dim)
+
     :param input: The first input layer.
     :type input: LayerOutput
     :param size: size of this layer.
@@ -4616,6 +4982,7 @@ def cross_entropy_with_selfnorm(input,
                                 layer_attr=None):
     """
     A loss layer for multi class entropy with selfnorm.
+    Input should be a vector of positive numbers, without normalization.
 
     .. code-block:: python
 
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
old mode 100644
new mode 100755
index 375bea34e8aa0ac2ea222531f313a627414495b0..cadde11ff81658cb309cd1bf7a44bac6374c1e44
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -737,12 +737,12 @@ def lstmemory_group(input,
                     lstm_layer_attr=None,
                     get_output_layer_attr=None):
     """
-    lstm_group is a recurrent layer group version Long Short Term Memory. It
+    lstm_group is a recurrent layer group version of Long Short Term Memory. It
     does exactly the same calculation as the lstmemory layer (see lstmemory in
     layers.py for the maths) does. A promising benefit is that LSTM memory
-    cell states, or hidden states in every time step are accessible to for the
+    cell states, or hidden states in every time step are accessible to the
     user. This is especially useful in attention model. If you do not need to
-    access to the internal states of the lstm, but merely use its outputs,
+    access the internal states of the lstm, but merely use its outputs,
     it is recommended to use the lstmemory, which is relatively faster than
     lstmemory_group.
 
@@ -822,6 +822,7 @@ def gru_unit(input,
              size=None,
              name=None,
              gru_bias_attr=None,
+             gru_param_attr=None,
              act=None,
              gate_act=None,
              gru_layer_attr=None):
@@ -862,6 +863,7 @@ def gru_unit(input,
         output_mem=out_mem,
         size=size,
         bias_attr=gru_bias_attr,
+        param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         layer_attr=gru_layer_attr)
@@ -874,15 +876,16 @@ def gru_group(input,
               name=None,
               reverse=False,
               gru_bias_attr=None,
+              gru_param_attr=None,
               act=None,
               gate_act=None,
               gru_layer_attr=None):
     """
-    gru_group is a recurrent layer group version Gated Recurrent Unit. It
+    gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
-    benefit is that gru hidden sates are accessible to for the user. This is
-    especially useful in attention model. If you do not need to access to
-    any internal state, but merely use the outputs of a GRU, it is recommanded
+    benefit is that gru hidden states are accessible to the user. This is
+    especially useful in attention model. If you do not need to access
+    any internal state, but merely use the outputs of a GRU, it is recommended
     to use the grumemory, which is relatively faster.
 
     Please see grumemory in layers.py for more detail about the maths.
@@ -922,6 +925,7 @@ def gru_group(input,
             name=name,
             size=size,
             gru_bias_attr=gru_bias_attr,
+            gru_param_attr=gru_param_attr,
             act=act,
             gate_act=gate_act,
             gru_layer_attr=gru_layer_attr)
@@ -942,6 +946,7 @@ def simple_gru(input,
                mixed_bias_param_attr=None,
                mixed_layer_attr=None,
                gru_bias_attr=None,
+               gru_param_attr=None,
                act=None,
                gate_act=None,
                gru_layer_attr=None):
@@ -952,22 +957,22 @@ def simple_gru(input,
     use one complete layer to implement rnn (including simple rnn, gru and lstm)
     with multiple time steps, such as recurrent_layer, lstmemory, grumemory. But,
     the multiplication operation :math:`W x_t` is not computed in these layers.
-    See details in their interfaces in layers.py. 
+    See details in their interfaces in layers.py.
     The other implementation is to use an recurrent group which can ensemble a
     series of layers to compute rnn step by step. This way is flexible for
     attenion mechanism or other complex connections.
 
     - gru_step_layer: only compute rnn by one step. It needs an memory as input
       and can be used in recurrent group.
-    - gru_unit: a wrapper of gru_step_layer with memory. 
+    - gru_unit: a wrapper of gru_step_layer with memory.
     - gru_group: a GRU cell implemented by a combination of multiple layers in
       recurrent group.
-      But :math:`W x_t` is not done in group.  
+      But :math:`W x_t` is not done in group.
     - gru_memory: a GRU cell implemented by one layer, which does same calculation
-      with gru_group and is faster than gru_group. 
-    - simple_gru: a complete GRU implementation inlcuding :math:`W x_t` and 
+      with gru_group and is faster than gru_group.
+    - simple_gru: a complete GRU implementation including :math:`W x_t` and
       gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`, see
-      formula in grumemory. 
+      formula in grumemory.
 
     The computational speed is that, grumemory is relatively better than
     gru_group, and gru_group is relatively better than simple_gru.
@@ -1010,6 +1015,7 @@ def simple_gru(input,
         input=m,
         reverse=reverse,
         gru_bias_attr=gru_bias_attr,
+        gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
         gru_layer_attr=gru_layer_attr)
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index d1a9843d326669711bf3d0769df1b804cfcfa673..93dd7796c246ae81a146759df7e0c19e334375f1 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,25 +1,18 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 add_test(NAME test_reset_hook
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-if (PROTOBUF_3)
-  add_paddle_exe(protobuf_equal
-    ProtobufEqualMain.cpp)
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
-    ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-  )
-else()
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
-  )
-endif()
+# run_tests.sh reads $1 as the Python interpreter and $2 as the protobuf comparator.
+# NOTE(review): $<TARGET_FILE:...> assumes add_paddle_exe defines a target named protobuf_equal.
+add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
+add_test(NAME test_layerHelpers
+  COMMAND
+  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  $<TARGET_FILE:protobuf_equal>)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 3f1d99701afe5425553feb129c7619b5e3e689fa..c9178e3c6a46a2d663ec368569e529e780b76a6f 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -3,7 +3,8 @@ export configs=(test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
+test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
+test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
+test_seq_concat_reshape)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index a54af94ce3db4ed300dee697b30516c3b6448d7c..ee5961af75ebb33af52f9add645f793015288f4e 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -10,13 +10,13 @@ protostr=$PWD/protostr
 for conf in ${configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
+    "${1:-python}" -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest  # $1: interpreter, defaults to python
+    cat ${conf}.py |"${1:-python}" test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
 for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
+    "${1:-python}" -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest  # same fallback as above
+    cat ${conf}.py |"${1:-python}" test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index 3331c10d6497f58eb135208bd7abe48aacfb10ae..24c901c8ee3ab1c90fc14fbff761db06345a6313 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -7,8 +7,9 @@ x = layer_math.exp(x)
 x = layer_math.log(x)
 x = layer_math.abs(x)
 x = layer_math.sigmoid(x)
+x = layer_math.tanh(x)
 x = layer_math.square(x)
-x = layer_math.square(x)
+x = layer_math.relu(x)
 y = 1 + x
 y = y + 1
 y = x + y
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
index da8da1b541f37a09654202f68232b99e4dac9f61..9b8a2ad9687d313e6c5017c2d7331eddf539af92 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
@@ -65,13 +65,28 @@ layers {
     }
   }
 }
+layers {
+  name: "__tanh_0__"
+  type: "mixed"
+  size: 100
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__sigmoid_0__"
+    proj_conf {
+      type: "identity"
+      name: "___tanh_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
 layers {
   name: "__square_0__"
   type: "mixed"
   size: 100
   active_type: "square"
   inputs {
-    input_layer_name: "__sigmoid_0__"
+    input_layer_name: "__tanh_0__"
     proj_conf {
       type: "identity"
       name: "___square_0__.w0"
@@ -81,15 +96,15 @@ layers {
   }
 }
 layers {
-  name: "__square_1__"
+  name: "__relu_0__"
   type: "mixed"
   size: 100
-  active_type: "square"
+  active_type: "relu"
   inputs {
     input_layer_name: "__square_0__"
     proj_conf {
       type: "identity"
-      name: "___square_1__.w0"
+      name: "___relu_0__.w0"
       input_size: 100
       output_size: 100
     }
@@ -101,7 +116,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: 1.0
   intercept: 1
@@ -123,7 +138,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
     proj_conf {
       type: "identity"
       name: "___mixed_0__.w0"
@@ -147,7 +162,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: -1.0
   intercept: 0.0
@@ -339,8 +354,9 @@ sub_models {
   layer_names: "__log_0__"
   layer_names: "__abs_0__"
   layer_names: "__sigmoid_0__"
+  layer_names: "__tanh_0__"
   layer_names: "__square_0__"
-  layer_names: "__square_1__"
+  layer_names: "__relu_0__"
   layer_names: "__slope_intercept_layer_0__"
   layer_names: "__slope_intercept_layer_1__"
   layer_names: "__mixed_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..b6905824f0cb090375a38ff67e39fc626df0b2f6
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_gru.protostr
@@ -0,0 +1,295 @@
+type: "recurrent_nn"
+layers {
+  name: "data_a"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "data_b"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0___transform"
+  type: "mixed"
+  size: 600
+  active_type: ""
+  inputs {
+    input_layer_name: "data_a"
+    input_parameter_name: "mixed_param"
+    proj_conf {
+      type: "fc"
+      name: "___simple_gru_0___transform.w0"
+      input_size: 100
+      output_size: 600
+    }
+  }
+}
+layers {
+  name: "__simple_gru_0___recurrent_group"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+  type: "scatter_agent"
+  size: 600
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+  type: "gru_step"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+    input_parameter_name: "gru_param"
+  }
+  inputs {
+    input_layer_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  }
+  bias_parameter_name: "gru_bias"
+  active_gate_type: "sigmoid"
+}
+layers {
+  name: "__simple_gru_0__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1___transform"
+  type: "mixed"
+  size: 600
+  active_type: ""
+  inputs {
+    input_layer_name: "data_b"
+    input_parameter_name: "mixed_param"
+    proj_conf {
+      type: "fc"
+      name: "___simple_gru_1___transform.w0"
+      input_size: 100
+      output_size: 600
+    }
+  }
+}
+layers {
+  name: "__simple_gru_1___recurrent_group"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+  type: "scatter_agent"
+  size: 600
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+  type: "gru_step"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+    input_parameter_name: "gru_param"
+  }
+  inputs {
+    input_layer_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  }
+  bias_parameter_name: "gru_bias"
+  active_gate_type: "sigmoid"
+}
+layers {
+  name: "__simple_gru_1__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__last_seq_0__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__simple_gru_0__"
+  }
+  trans_type: "non-seq"
+}
+layers {
+  name: "__last_seq_1__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__simple_gru_1__"
+  }
+  trans_type: "non-seq"
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__last_seq_0__"
+    input_parameter_name: "softmax_param"
+  }
+  inputs {
+    input_layer_name: "__last_seq_1__"
+    input_parameter_name: "softmax_param"
+  }
+}
+layers {
+  name: "label"
+  type: "data"
+  size: 10
+  active_type: ""
+}
+layers {
+  name: "__cost_0__"
+  type: "multi-class-cross-entropy"
+  size: 1
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  inputs {
+    input_layer_name: "label"
+  }
+  coeff: 1.0
+}
+parameters {
+  name: "mixed_param"
+  size: 60000
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 600
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "gru_param"
+  size: 120000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 600
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "gru_bias"
+  size: 600
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 600
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "softmax_param"
+  size: 2000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data_a"
+input_layer_names: "data_b"
+input_layer_names: "label"
+output_layer_names: "__cost_0__"
+evaluators {
+  name: "classification_error_evaluator"
+  type: "classification_error"
+  input_layers: "__fc_layer_0__"
+  input_layers: "label"
+}
+sub_models {
+  name: "root"
+  layer_names: "data_a"
+  layer_names: "data_b"
+  layer_names: "__simple_gru_0___transform"
+  layer_names: "__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__"
+  layer_names: "__simple_gru_1___transform"
+  layer_names: "__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__"
+  layer_names: "__last_seq_0__"
+  layer_names: "__last_seq_1__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "label"
+  layer_names: "__cost_0__"
+  input_layer_names: "data_a"
+  input_layer_names: "data_b"
+  input_layer_names: "label"
+  output_layer_names: "__cost_0__"
+  evaluator_names: "classification_error_evaluator"
+  is_recurrent_layer_group: false
+}
+sub_models {
+  name: "__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+  layer_names: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+    link_name: "__simple_gru_0__+delay1@__simple_gru_0___recurrent_group"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "__simple_gru_0___transform"
+    link_name: "__simple_gru_0___transform@__simple_gru_0___recurrent_group"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__simple_gru_0__@__simple_gru_0___recurrent_group"
+    link_name: "__simple_gru_0__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
+sub_models {
+  name: "__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+  layer_names: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+    link_name: "__simple_gru_1__+delay1@__simple_gru_1___recurrent_group"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "__simple_gru_1___transform"
+    link_name: "__simple_gru_1___transform@__simple_gru_1___recurrent_group"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__simple_gru_1__@__simple_gru_1___recurrent_group"
+    link_name: "__simple_gru_1__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
index b30bbb2a4e24d74ebe1d6c8eda8be5aa09217f6d..c1bfdf1b19c61d096c25af061c6fbb3bbfc50265 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
@@ -79,7 +79,7 @@ layers {
   inputs {
     input_layer_name: "b"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__cos_sim_1__"
@@ -92,7 +92,7 @@ layers {
   inputs {
     input_layer_name: "c"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__sum_to_one_norm_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
index 41d2e2f2671f5c05425f9bd2e91d8adc33129761..3e9d28416ed5066461e960f0a9f085e057c28346 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -465,11 +465,11 @@ parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.w0"
   size: 30000
   initial_mean: 0.0
-  initial_std: 0.01
+  initial_std: 0.1
   dims: 100
   dims: 300
   initial_strategy: 0
-  initial_smart: false
+  initial_smart: true
 }
 parameters {
   name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..91284b4fb32fcfdbf6b9e7384ffe080574b78821
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data1"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "data2"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__seqconcat_0__"
+  type: "seqconcat"
+  size: 30
+  active_type: ""
+  inputs {
+    input_layer_name: "data1"
+  }
+  inputs {
+    input_layer_name: "data2"
+  }
+}
+layers {
+  name: "__seqreshape_0__"
+  type: "seqreshape"
+  size: 5
+  active_type: "linear"
+  inputs {
+    input_layer_name: "data1"
+  }
+}
+input_layer_names: "data1"
+input_layer_names: "data2"
+output_layer_names: "__seqconcat_0__"
+output_layer_names: "__seqreshape_0__"
+sub_models {
+  name: "root"
+  layer_names: "data1"
+  layer_names: "data2"
+  layer_names: "__seqconcat_0__"
+  layer_names: "__seqreshape_0__"
+  input_layer_names: "data1"
+  input_layer_names: "data2"
+  output_layer_names: "__seqconcat_0__"
+  output_layer_names: "__seqreshape_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
index 1cfb92255aa92fa3fbc16a816851a5c2f81c2b56..569b0b945a762e8b596e197adc06df64e33311af 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_split_datasource.protostr
@@ -19,7 +19,7 @@ model_config {
 data_config {
   type: "py2"
   files: "train.list"
-  async_load_data: true
+  async_load_data: false
   for_test: false
   load_data_module: "a"
   load_data_object: "c"
@@ -58,7 +58,7 @@ opt_config {
 test_data_config {
   type: "py2"
   files: "test.list"
-  async_load_data: true
+  async_load_data: false
   for_test: true
   load_data_module: "b"
   load_data_object: "d"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index e984ee70625456241b3cfe6202fdadaa3807d33c..c8a3b190b19148ddb701020f5be55c4c29a17079 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -2,16 +2,18 @@
 cd `dirname $0`
 
 set -e
+PYTHON_EXEC=${1:-python}  # interpreter to run the configs with; falls back to python when no argument is given
+COMPARE_PROTO_UTIL=$2  # optional comparison tool; when empty the first branch of the if below runs
 
 protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh
+./generate_protostr.sh "${PYTHON_EXEC}"
 
 . ./file_list.sh
 
-if [ -z $1 ]; then
+if [ -z "${COMPARE_PROTO_UTIL}" ]; then
   for file in $files
   do
       base_protostr=$protostr/$file
@@ -22,20 +24,20 @@ if [ -z $1 ]; then
 else
   for file in ${configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
 
   for file in ${whole_configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
new file mode 100644
index 0000000000000000000000000000000000000000..c19bb9685aa24c4d66e4f0bbbcb004507413dbe8
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_gru.py
@@ -0,0 +1,40 @@
+from paddle.trainer_config_helpers import *
+
+settings(learning_rate=1e-4, batch_size=1000)
+
+data_1 = data_layer(name='data_a', size=100)
+data_2 = data_layer(name='data_b', size=100)
+
+mixed_param = ParamAttr(name='mixed_param')  # passed to both simple_gru calls below
+
+gru_param = ParamAttr(name='gru_param')  # passed to both simple_gru calls below
+gru_bias = ParamAttr(name='gru_bias', initial_mean=0., initial_std=0.)  # passed to both simple_gru calls below
+
+gru1 = simple_gru(
+    input=data_1,
+    size=200,
+    mixed_param_attr=mixed_param,
+    mixed_bias_param_attr=False,
+    gru_bias_attr=gru_bias,
+    gru_param_attr=gru_param)
+
+gru2 = simple_gru(
+    input=data_2,
+    size=200,
+    mixed_param_attr=mixed_param,
+    mixed_bias_param_attr=False,
+    gru_bias_attr=gru_bias,
+    gru_param_attr=gru_param)
+
+softmax_param = ParamAttr(name='softmax_param')  # given for both inputs of the fc_layer below
+
+predict = fc_layer(
+    input=[last_seq(input=gru1), last_seq(input=gru2)],
+    size=10,
+    param_attr=[softmax_param, softmax_param],
+    bias_attr=False,
+    act=SoftmaxActivation())
+outputs(
+    classification_cost(
+        input=predict, label=data_layer(
+            name='label', size=10)))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_pad.py b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb5f13410dbbbaeea9e28c271d33a15fb3000dcf
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_pad.py
@@ -0,0 +1,21 @@
+from paddle.trainer_config_helpers import *  # NOTE(review): test_pad is not in the configs list visible in file_list.sh -- confirm it is run
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2304, height=48, width=42)
+
+conv = img_conv_layer(
+    input=data,
+    filter_size=3,
+    num_channels=1,
+    num_filters=16,
+    padding=1,
+    act=LinearActivation(),
+    bias_attr=True)
+
+pool = img_pool_layer(  # NOTE(review): num_channels=8 below vs num_filters=16 on conv -- confirm intended
+    input=conv, num_channels=8, pool_size=2, stride=2, pool_type=MaxPooling())
+
+pad = pad_layer(input=pool, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+
+outputs(pad)  # the pad layer is the only declared output
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c161ba805fb301e8feb8702ad61a8341df40e3f
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
@@ -0,0 +1,12 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din1 = data_layer(name='data1', size=30)
+din2 = data_layer(name='data2', size=30)
+
+opts = []  # every layer appended here is handed to outputs() at the end
+opts.append(seq_concat_layer(a=din1, b=din2))
+opts.append(seq_reshape_layer(input=din1, reshape_size=5))  # only din1 feeds the reshape
+
+outputs(opts)
diff --git a/python/paddle/trainer_config_helpers/tests/layers_test_config.py b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
index ae275735aa2b852b3b226a4a0e5b2d4d000ba199..e6cd35ee761d1acd0b5c1943554c7ea1de6a13f5 100644
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -39,6 +39,7 @@ z1 = mixed_layer(
 assert z1.size > 0
 
 y2 = fc_layer(input=y, size=15)
+z2 = rotate_layer(input=y2, height=5, width=3)
 
 cos1 = cos_sim(a=x1, b=y1)
 cos3 = cos_sim(a=x1, b=y2, size=3)
@@ -46,7 +47,7 @@ cos3 = cos_sim(a=x1, b=y2, size=3)
 linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)
 
 out = fc_layer(
-    input=[cos1, cos3, linear_comb, z, z1],
+    input=[cos1, cos3, linear_comb, z, z1, z2],
     size=num_classes,
     act=SoftmaxActivation())
 
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..25526bf409cf82f26979a84700ce948ac969df0c
--- /dev/null
+++ b/python/paddle/v2/__init__.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import optimizer
+import layer
+import activation
+import parameters
+import trainer
+import event
+import data_type
+import topology
+import data_feeder
+import networks
+from . import dataset
+from . import reader
+import attr
+import pooling
+import inference
+import networks
+import py_paddle.swig_paddle as api
+import minibatch
+
+__all__ = [
+    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
+    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
+    'topology', 'networks', 'infer'
+]
+
+
+def init(**kwargs):
+    """Call api.initPaddle with one '--key=value' string per keyword argument."""
+    # Values are rendered with str(), e.g. use_gpu=False becomes '--use_gpu=False'.
+    flags = ['--%s=%s' % (name, str(value)) for name, value in kwargs.items()]
+
+    api.initPaddle(*flags)
+
+
+infer = inference.infer
+batch = minibatch.batch
diff --git a/python/paddle/v2/activation.py b/python/paddle/v2/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..21261a178203b633ca6cf59a5fc89edc24a868b9
--- /dev/null
+++ b/python/paddle/v2/activation.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.activations
+import copy
+
+__all__ = []
+
+suffix = 'Activation'  # this many trailing characters are cut from each exported name
+for act in paddle.trainer_config_helpers.activations.__all__:
+    new_name = act[:-len(suffix)]  # plain slice; does not check that act ends with suffix
+    globals()[new_name] = copy.copy(
+        getattr(paddle.trainer_config_helpers.activations, act))
+    globals()[new_name].__name__ = new_name  # NOTE(review): copy.copy returns classes as-is, so this may rename the original too -- confirm
+    __all__.append(new_name)
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
new file mode 100644
index 0000000000000000000000000000000000000000..32f78614e7f8abe7cffdc7a50a9fa77f1fc1a780
--- /dev/null
+++ b/python/paddle/v2/attr.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.attrs
+
+__all__ = [
+    "Param",
+    "Extra",
+]
+
+Param = paddle.trainer_config_helpers.attrs.ParameterAttribute  # short alias
+Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute  # short alias
+
+for each in paddle.trainer_config_helpers.attrs.__all__:  # also re-export every name listed in attrs.__all__
+    globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
+    __all__.append(each)
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ec1d7bbdf912b940ca4b8e7b20eb11310f0e74f
--- /dev/null
+++ b/python/paddle/v2/config_base.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import re
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+import paddle.trainer_config_helpers as conf_helps
+
+
+class LayerType(type):
+    def __new__(cls, name, bases, attrs):
+        method_name = attrs.get('METHOD_NAME', None)
+        if method_name is not None:
+            method = getattr(conf_helps, method_name)
+            if method.__doc__ is not None:
+                mapper = attrs.get("__map_docstr__", None)
+                if mapper is not None:
+                    attrs['__doc__'] = LayerType.__map_docstr__(
+                        mapper(method.__doc__),
+                        method_name=method_name,
+                        name=name)
+                else:
+                    attrs['__doc__'] = LayerType.__map_docstr__(
+                        method.__doc__, method_name=method_name, name=name)
+        return super(LayerType, cls).__new__(cls, name, bases, attrs)
+
+    @staticmethod
+    def __map_docstr__(doc, name, method_name):
+        assert isinstance(doc, basestring)
+
+        # replace LayerOutput to paddle.v2.config_base.Layer
+        doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
+
+        doc = doc.replace('ParameterAttribute',
+                          'paddle.v2.attr.ParameterAttribute')
+
+        doc = re.sub(r'ExtraLayerAttribute[^\s]?',
+                     'paddle.v2.attr.ExtraAttribute', doc)
+
+        # xxx_layer to xxx
+        doc = re.sub(r"(?P<name>[a-z]+)_layer", r"\g<name>", doc)
+
+        # XxxxActivation to paddle.v2.Activation.Xxxx
+        doc = re.sub(r"(?P<name>[A-Z][a-zA-Z]+)Activation",
+                     r"paddle.v2.Activation.\g<name>", doc)
+
+        # TODO(yuyang18): Add more rules if needed.
+        return doc
+
+
+class Layer(object):
+    __metaclass__ = LayerType
+
+    def __init__(self, name=None, parent_layers=None):
+        assert isinstance(parent_layers, dict)
+        self.name = name
+        self.__contex__ = {}
+        self.__parent_layers__ = parent_layers
+
+    def to_proto(self, context):
+        """
+        function to set proto attribute
+        """
+        kwargs = dict()
+        for layer_name in self.__parent_layers__:
+            if not isinstance(self.__parent_layers__[layer_name],
+                              collections.Sequence):
+                v1_layer = self.__parent_layers__[layer_name].to_proto(
+                    context=context)
+            else:
+                v1_layer = map(lambda x: x.to_proto(context=context),
+                               self.__parent_layers__[layer_name])
+            kwargs[layer_name] = v1_layer
+
+        if self.context_name() is None:
+            return self.to_proto_impl(**kwargs)
+        elif self.context_name() not in context:
+            context[self.context_name()] = self.to_proto_impl(**kwargs)
+        self.__contex__ = context
+        if self.use_context_name():
+            return context[self.context_name()]
+        else:
+            return context[self.name]
+
+    def to_proto_impl(self, **kwargs):
+        raise NotImplementedError()
+
+    def context_name(self):
+        """
+        Context name means the context which stores `to_proto_impl` result.
+        If multiple layer share same context_name, the `to_proto_impl` of them
+        will be invoked only once.
+        """
+        return self.name
+
+    def use_context_name(self):
+        return False
+
+    def calculate_size(self):
+        """
+        lazy calculate size of the layer, should be called when to_proto_impl of
+        this layer is called.
+        :return:
+        """
+        return self.__contex__[self.context_name()].size
+
+
+def __convert_to_v2__(method_name, parent_names, is_default_name=True):
+    if is_default_name:
+        wrapper = wrap_name_default(name_prefix=method_name)
+    else:
+        wrapper = None
+
+    class V2LayerImpl(Layer):
+        METHOD_NAME = method_name
+
+        def __init__(self, **kwargs):
+            parent_layers = dict()
+            other_kwargs = dict()
+            for pname in parent_names:
+                if kwargs.has_key(pname):
+                    parent_layers[pname] = kwargs[pname]
+
+            for key in kwargs.keys():
+                if key not in parent_names:
+                    other_kwargs[key] = kwargs[key]
+
+            name = kwargs.get('name', None)
+            super(V2LayerImpl, self).__init__(name, parent_layers)
+            self.__other_kwargs__ = other_kwargs
+
+        if wrapper is not None:
+            __init__ = wrapper(__init__)
+
+        def to_proto_impl(self, **kwargs):
+            args = dict()
+            for each in kwargs:
+                args[each] = kwargs[each]
+            for each in self.__other_kwargs__:
+                args[each] = self.__other_kwargs__[each]
+            return getattr(conf_helps, method_name)(**args)
+
+    return V2LayerImpl
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba77fecf21eecf9115cc1b20720383b790294eb0
--- /dev/null
+++ b/python/paddle/v2/data_feeder.py
@@ -0,0 +1,109 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from py_paddle import DataProviderConverter
+
+import paddle.trainer.PyDataProvider2 as pydp2
+
+__all__ = ['DataFeeder']
+
+
+def default_feeding_map(data_types):
+    reader_dict = dict()
+    for i, tp in enumerate(data_types):
+        reader_dict[tp[0]] = i
+    return reader_dict
+
+
+class DataFeeder(DataProviderConverter):
+    """
+    DataFeeder converts the data returned by paddle.reader into a data structure
+    of Arguments which is defined in the API. The paddle.reader usually returns
+    a list of mini-batch data entries. Each data entry in the list is one sample.
+    Each sample is a list or a tuple with one feature or multiple features.
+    DataFeeder converts these mini-batch data entries into Arguments in order
+    to feed them to the C++ interface.
+
+    The example usage:
+
+
+    ..  code-block:: python
+
+        data_types = [('image', paddle.data_type.dense_vector(784)),
+                      ('label', paddle.data_type.integer_value(10))]
+        feeding = {'image':0, 'label':1}
+        feeder = DataFeeder(data_types=data_types, feeding=feeding)
+        minibatch_data = [
+                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] ),  # first sample
+                           ( [1.0,2.0,3.0,4.0], 5, [6,7,8] )   # second sample
+                         ]
+        # or minibatch_data = [
+        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ],  # first sample
+        #                       [ [1.0,2.0,3.0,4.0], 5, [6,7,8] ]   # second sample
+        #                     ]
+        arg = feeder(minibatch_data)
+
+    ..  note::
+
+        This module is for internal use only. Users should use the `reader`
+        interface.
+
+
+    :param data_types: A list to specify data name and type. Each item is
+                       a tuple of (data_name, data_type).
+
+    :type data_types: list
+    :param feeding: A dictionary mapping each data name to the position of
+                    that data in one input sample. If None, each name maps
+                    to its index in data_types.
+    :type feeding: dict
+    """
+
+    def __init__(self, data_types, feeding=None):
+        self.input_names = []
+        input_types = []
+        if feeding is None:
+            feeding = default_feeding_map(data_types)
+
+        self.feeding = feeding
+        for each in data_types:
+            self.input_names.append(each[0])
+            if not isinstance(each[1], pydp2.InputType):
+                raise TypeError("second item in each data_type should be an "
+                                "InputType")
+            input_types.append(each[1])
+        DataProviderConverter.__init__(self, input_types)
+
+    def convert(self, dat, argument=None):
+        """
+        :param dat: A list of mini-batch data. Each sample is a list or tuple
+                    of one feature or multiple features.
+
+        :type dat: list
+        :param argument: An Arguments object that contains this mini-batch data
+                         with one or multiple features. The Arguments definition
+                         is in the API.
+        :type argument: py_paddle.swig_paddle.Arguments
+        """
+
+        def reorder_data(data):
+            retv = []
+            for each in data:
+                reorder = []
+                for name in self.input_names:
+                    reorder.append(each[self.feeding[name]])
+                retv.append(reorder)
+            return retv
+
+        return DataProviderConverter.convert(self, reorder_data(dat), argument)
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..d582f76ddf01ed3430a1d075624bbb8e0bf3f2a9
--- /dev/null
+++ b/python/paddle/v2/data_type.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.PyDataProvider2 as pydp2
+
+import_list = [
+    nm for nm in dir(pydp2)
+    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm)
+]
+import_list.extend(['InputType'])
+
+for nm in import_list:
+    globals()[nm] = getattr(pydp2, nm)
+
+__all__ = import_list
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ff6295c34e853d8f69b9e78719af23a56d1fbb
--- /dev/null
+++ b/python/paddle/v2/dataset/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dataset package.
+"""
+
+import mnist
+import imikolov
+import imdb
+import cifar
+import movielens
+import conll05
+import uci_housing
+import sentiment
+import wmt14
+
+__all__ = [
+    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment',
+    'uci_housing', 'wmt14'
+]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9f7a830ee60a331b55a1e218923e690103e1c5b
--- /dev/null
+++ b/python/paddle/v2/dataset/cifar.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
+
+TODO(yuyang18): Complete the comments.
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    yield item
+
+    return reader
+
+
+def train100():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
+
+
+def test100():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
+
+
+def train10():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
+
+
+def test10():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..3021b68ddb02ecaa874e21681796c0912ad4cc06
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import shutil
+import sys
+
+__all__ = ['DATA_HOME', 'download', 'md5file']
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+if not os.path.exists(DATA_HOME):
+    os.makedirs(DATA_HOME)
+
+
+def md5file(fname):
+    """Return the hex MD5 digest of file `fname`, read in 4096-byte chunks."""
+    hash_md5 = hashlib.md5()
+    with open(fname, "rb") as f:  # closed even if a read raises
+        for chunk in iter(lambda: f.read(4096), b""):
+            hash_md5.update(chunk)
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum):
+    """Return the local path of `url` under DATA_HOME/<module_name>/,
+    downloading it first unless a file whose MD5 is `md5sum` is already there."""
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+    filename = os.path.join(dirname, url.split('/')[-1])
+    if not (os.path.exists(filename) and md5file(filename) == md5sum):
+        print "Cache file %s not found, downloading %s" % (filename, url)
+        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+        # 'wb': the payload is raw bytes; text mode would corrupt archives on
+        # platforms that translate line endings.
+        with open(filename, 'wb') as f:
+            if total_length is None:
+                shutil.copyfileobj(r.raw, f)
+            else:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()
+    return filename
+
+
+def dict_add(a_dict, ele):
+    if ele in a_dict:
+        a_dict[ele] += 1
+    else:
+        a_dict[ele] = 1
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eab49ee39325c1c60fc511e0bd834e83aa987f0
--- /dev/null
+++ b/python/paddle/v2/dataset/conll05.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Conll 2005 dataset.  Paddle semantic role labeling Book and demo use this
+dataset as an example. Because Conll 2005 is not freely available, the default
+download URL is the test set of Conll 2005 (which is public). Users can change
+URL and MD5 to their Conll dataset.
+
+TODO(yuyang18): Complete comments.
+"""
+import tarfile
+import gzip
+import itertools
+from common import download
+
+__all__ = ['test', 'get_dict', 'get_embedding']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+def load_dict(filename):
+    d = dict()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            d[line.strip()] = i
+    return d
+
+
+def corpus_reader(data_path, words_name, props_name):
+    """
+    Read one corpus. It returns an iterator. Each element of
+    this iterator is a tuple including sentence and labels. The sentence is
+    consist of a list of word IDs. The labels include a list of label IDs.
+    :return: a iterator of data.
+    :rtype: iterator
+    """
+
+    def reader():
+        tf = tarfile.open(data_path)
+        wf = tf.extractfile(words_name)
+        pf = tf.extractfile(props_name)
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in itertools.izip(words_file, props_file):
+                word = word.strip()
+                label = label.strip().split()
+
+                if len(label) == 0:  # end of sentence
+                    for i in xrange(len(one_seg[0])):
+                        a_kind_lable = [x[i] for x in one_seg]
+                        labels.append(a_kind_lable)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and is_in_bracket == False:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket == True:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            yield sentences, verb_list[i], lbl_seq
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    return reader
+
+
+def reader_creator(corpus_reader,
+                   word_dict=None,
+                   predicate_dict=None,
+                   label_dict=None):
+    def reader():
+        for sentence, predicate, labels in corpus_reader():
+
+            sen_len = len(sentence)
+
+            verb_index = labels.index('B-V')
+            mark = [0] * len(labels)
+            if verb_index > 0:
+                mark[verb_index - 1] = 1
+                ctx_n1 = sentence[verb_index - 1]
+            else:
+                ctx_n1 = 'bos'
+
+            if verb_index > 1:
+                mark[verb_index - 2] = 1
+                ctx_n2 = sentence[verb_index - 2]
+            else:
+                ctx_n2 = 'bos'
+
+            mark[verb_index] = 1
+            ctx_0 = sentence[verb_index]
+
+            if verb_index < len(labels) - 1:
+                mark[verb_index + 1] = 1
+                ctx_p1 = sentence[verb_index + 1]
+            else:
+                ctx_p1 = 'eos'
+
+            if verb_index < len(labels) - 2:
+                mark[verb_index + 2] = 1
+                ctx_p2 = sentence[verb_index + 2]
+            else:
+                ctx_p2 = 'eos'
+
+            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+
+            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            pred_idx = [predicate_dict.get(predicate)] * sen_len
+            label_idx = [label_dict.get(w) for w in labels]
+
+            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
+              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
+
+    return reader
+
+
+def get_dict():
+    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
+    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
+    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
+    return word_dict, verb_dict, label_dict
+
+
+def get_embedding():
+    return download(EMB_URL, 'conll05st', EMB_MD5)
+
+
+def test():
+    word_dict, verb_dict, label_dict = get_dict()
+    reader = corpus_reader(
+        download(DATA_URL, 'conll05st', DATA_MD5),
+        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+    return reader_creator(reader, word_dict, verb_dict, label_dict)
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..76019d9f54020ff6f02c17eb6047cbd014a8ccf2
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+
+TODO(yuyang18): Complete comments.
+"""
+
+import paddle.v2.dataset.common
+import tarfile
+import Queue
+import re
+import string
+import threading
+
+__all__ = ['build_dict', 'train', 'test']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+# Read files that match pattern.  Tokenize and yield each file.
+def tokenize(pattern):
+    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+                                                        MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf != None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    word_freq = {}
+    for doc in tokenize(pattern):
+        for word in doc:
+            paddle.v2.dataset.common.dict_add(word_freq, word)
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+    UNK = word_idx['<unk>']
+
+    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
+
+    def load(pattern, queue):
+        for doc in tokenize(pattern):
+            queue.put(doc)
+        queue.put(None)
+
+    def reader():
+        # Creates two threads that loads positive and negative samples
+        # into qs.
+        t0 = threading.Thread(
+            target=load, args=(
+                pos_pattern,
+                qs[0], ))
+        t0.daemon = True
+        t0.start()
+
+        t1 = threading.Thread(
+            target=load, args=(
+                neg_pattern,
+                qs[1], ))
+        t1.daemon = True
+        t1.start()
+
+        # Read alternatively from qs[0] and qs[1].
+        i = 0
+        doc = qs[i].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            i += 1
+            doc = qs[i % 2].get()
+
+        # If any queue is empty, reads from the other queue.
+        i += 1
+        doc = qs[i % 2].get()
+        while doc != None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            doc = qs[i % 2].get()
+
+    return reader()
+
+
+def train(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
+
+
+def test(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
+
+
+def word_dict():
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..97c160f111d09d61eb860c7f02552e635f2400a7
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+
+Complete comments.
+"""
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+def word_count(f, word_freq=None):
+    """Add the word counts of the lines in `f` to `word_freq` (a new dict when
+    None) and return it; '<s>' and '<e>' are each counted once per line."""
+    add = paddle.v2.dataset.common.dict_add
+    if word_freq is None:
+        word_freq = {}
+    for l in f:
+        for w in l.strip().split():
+            add(word_freq, w)
+        add(word_freq, '<s>')
+        add(word_freq, '<e>')
+    return word_freq
+
+
+def build_dict():
+    """
+    Build a word -> index dict from the PTB train and valid files. Words counted
+    TYPO_FREQ times or fewer are dropped; '<unk>' is given the last index.
+    """
+    train_filename = './simple-examples/data/ptb.train.txt'
+    test_filename = './simple-examples/data/ptb.valid.txt'
+    with tarfile.open(
+            paddle.v2.dataset.common.download(URL, 'imikolov', MD5)) as tf:
+        trainf = tf.extractfile(train_filename)
+        testf = tf.extractfile(test_filename)
+        word_freq = word_count(testf, word_count(trainf))
+        if '<unk>' in word_freq:
+            # remove <unk> for now, since we will set it as last index
+            del word_freq['<unk>']
+
+        TYPO_FREQ = 50
+        word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
+        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*word_freq_sorted))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(filename, word_idx, n):
+    """Return a reader yielding every n-gram (a tuple of n word ids) of each
+    line of `filename` in the archive; lines shorter than n are skipped."""
+    def reader():
+        with tarfile.open(
+                paddle.v2.dataset.common.download(URL, 'imikolov', MD5)) as tf:
+            f = tf.extractfile(filename)
+
+            UNK = word_idx['<unk>']
+            for l in f:
+                l = ['<s>'] + l.strip().split() + ['<e>']
+                if len(l) >= n:
+                    l = [word_idx.get(w, UNK) for w in l]
+                    for i in range(n, len(l) + 1):
+                        yield tuple(l[i - n:i])
+
+    return reader
+
+
+def train(word_idx, n):
+    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)
+
+
+def test(word_idx, n):
+    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f2fcb99de4cb1971a7375a97b5daa209ee95ef
--- /dev/null
+++ b/python/paddle/v2/dataset/mnist.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MNIST dataset.
+
+This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
+parse train set and test set into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import subprocess
+import numpy
+import platform
+__all__ = ['train', 'test']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    # Returns a reader yielding (784 float32 pixels in [-1, 1], int label).
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError()
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        try:  # reader could be break.
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+                # Rescale raw pixel values from [0, 255] to [-1, 1].
+                images = images / 255.0 * 2.0 - 1.0
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            # terminate() alone leaves zombies; wait() reaps both children.
+            for proc in (m, l):
+                proc.terminate()
+                proc.wait()
+
+    return reader
+
+
+def train():
+    """
+    MNIST train set creator.
+
+    It returns a reader; each sample is a pair of image pixels scaled to
+    [-1, 1] and a label in [0, 9].
+
+    :return: Train reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                          TRAIN_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                          TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    """
+    MNIST test set creator.
+
+    It returns a reader; each sample is a pair of image pixels scaled to
+    [-1, 1] and a label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
+                                          TEST_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
+                                          TEST_LABEL_MD5), 100)
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc65e8f8b6f04b078a3449c622478095086cecbe
--- /dev/null
+++ b/python/paddle/v2/dataset/movielens.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Movielens 1-M dataset.
+
+TODO(yuyang18): Complete comments.
+"""
+
+import zipfile
+from common import download
+import re
+import random
+import functools
+
+__all__ = ['train_creator', 'test_creator']
+
+
+class MovieInfo(object):
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self):
+        return [
+            self.index, [CATEGORIES_DICT[c] for c in self.categories],
+            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
+        ]
+
+
+class UserInfo(object):
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = [1, 18, 25, 35, 45, 50, 56].index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
+
+
+MOVIE_INFO = None
+MOVIE_TITLE_DICT = None
+CATEGORIES_DICT = None
+USER_INFO = None
+
+
+def __initialize_meta_info__():
+    # Fetch the MovieLens-1M zip and fill the module-level lookup tables.
+    fn = download('http://files.grouplens.org/datasets/movielens/ml-1m.zip',
+                  'movielens', 'c4d9eecfca2ab87c1945afe126590906')
+    global MOVIE_INFO
+    if MOVIE_INFO is None:
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        with zipfile.ZipFile(file=fn) as package:
+            # Members are opened by name, so each file is parsed exactly
+            # once; there is no need to iterate package.infolist().
+            MOVIE_INFO = dict()
+            title_word_set = set()
+            categories_set = set()
+            with package.open('ml-1m/movies.dat') as movie_file:
+                for line in movie_file:
+                    movie_id, title, categories = line.strip().split('::')
+                    categories = categories.split('|')
+                    for c in categories:
+                        categories_set.add(c)
+                    title = pattern.match(title).group(1)
+                    MOVIE_INFO[int(movie_id)] = MovieInfo(
+                        index=movie_id, categories=categories, title=title)
+                    for w in title.split():
+                        title_word_set.add(w.lower())
+
+            global MOVIE_TITLE_DICT
+            MOVIE_TITLE_DICT = dict()
+            for i, w in enumerate(title_word_set):
+                MOVIE_TITLE_DICT[w] = i
+
+            global CATEGORIES_DICT
+            CATEGORIES_DICT = dict()
+            for i, c in enumerate(categories_set):
+                CATEGORIES_DICT[c] = i
+
+            global USER_INFO
+            USER_INFO = dict()
+            with package.open('ml-1m/users.dat') as user_file:
+                for line in user_file:
+                    uid, gender, age, job, _ = line.strip().split("::")
+                    USER_INFO[int(uid)] = UserInfo(
+                        index=uid, gender=gender, age=age, job_id=job)
+    return fn
+
+
+def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
+    # Yields user features + movie features + [[rating * 2 - 5]] per sample.
+    archive_path = __initialize_meta_info__()
+    rng = random.Random(x=rand_seed)
+    with zipfile.ZipFile(file=archive_path) as package:
+        with package.open('ml-1m/ratings.dat') as ratings_file:
+            for record in ratings_file:
+                if (rng.random() < test_ratio) != is_test:
+                    continue  # this line belongs to the other split
+                uid, mov_id, score, _ = record.strip().split("::")
+                uid, mov_id = int(uid), int(mov_id)
+                score = float(score) * 2 - 5.0
+                mov = MOVIE_INFO[mov_id]
+                usr = USER_INFO[uid]
+                yield usr.value() + mov.value() + [[score]]
+
+
+def __reader_creator__(**kwargs):
+    return lambda: __reader__(**kwargs)
+
+
+train_creator = functools.partial(__reader_creator__, is_test=False)
+test_creator = functools.partial(__reader_creator__, is_test=True)
+
+
+def unittest():
+    for train_count, _ in enumerate(train_creator()()):
+        pass
+    for test_count, _ in enumerate(test_creator()()):
+        pass
+
+    print train_count, test_count
+
+
+if __name__ == '__main__':
+    unittest()
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..71689fd61b6b14a7b5072caff4e2fd48a7f74072
--- /dev/null
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The script fetch and preprocess movie_reviews data set that provided by NLTK
+
+TODO(yuyang18): Complete dataset.
+"""
+
+import collections
+from itertools import chain
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import common
+
+__all__ = ['train', 'test', 'get_word_dict']
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+def download_data_if_not_yet():
+    """
+    Ensure the movie_reviews corpus is loadable; fetch it into DATA_HOME if not.
+    """
+    try:
+        # make sure that nltk can find the data
+        if common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(common.DATA_HOME)
+        movie_reviews.categories()  # LookupError here means "not downloaded"
+    except LookupError:
+        print "Downloading movie_reviews data set, please wait....."
+        nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+        print "Download data set success....."
+        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+
+
+def get_word_dict():
+    """
+    Rank every word of the movie_reviews corpus by how often it occurs.
+    :return:
+        list of (word, rank) tuples; rank 0 is the most frequent word
+    """
+    download_data_if_not_yet()
+    counts = collections.defaultdict(int)
+
+    for category in movie_reviews.categories():
+        for file_id in movie_reviews.fileids(category):
+            for token in movie_reviews.words(file_id):
+                counts[token] += 1
+
+    # sorted() is stable: words with equal counts stay in dict iteration
+    # order, so only the count decides the ranking.
+    ranked = sorted(counts.items(), key=lambda pair: -pair[1])
+    word_ranks = [(pair[0], rank) for rank, pair in enumerate(ranked)]
+    return word_ranks
+
+
+def sort_files():
+    """
+    Sorted the sample for cross reading the sample
+    :return:
+        files_list
+    """
+    files_list = list()
+    neg_file_list = movie_reviews.fileids('neg')
+    pos_file_list = movie_reviews.fileids('pos')
+    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    return files_list
+
+
+def load_sentiment_data():
+    """
+    Load the data set
+    :return:
+        data_set
+    """
+    data_set = list()
+    download_data_if_not_yet()
+    words_ids = dict(get_word_dict())
+    for sample_file in sort_files():
+        words_list = list()
+        category = 0 if 'neg' in sample_file else 1
+        for word in movie_reviews.words(sample_file):
+            words_list.append(words_ids[word.lower()])
+        data_set.append((words_list, category))
+    return data_set
+
+
+def reader_creator(data):
+    """
+    Reader creator, generate an iterator for data set
+    :param data:
+        train data set or test data set
+    """
+    for each in data:
+        yield each[0], each[1]
+
+
+def train():
+    """
+    Default train set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
+
+
+def test():
+    """
+    Default test set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/cifar_test.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.cifar
+import unittest
+
+
+class TestCIFAR(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3072)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_test10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test10())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_train10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train10())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test100())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 99)
+
+    def test_train100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train100())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5babcef0eb4345d243904877d323c37d4889a643
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        with tempfile.NamedTemporaryFile() as f:  # closed and removed on exit
+            f.write("Hello\n")
+            f.flush()  # md5file only gets the path, so push the bytes out
+            self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                             paddle.v2.dataset.common.md5file(f.name))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.imdb
+import unittest
+import re
+
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
+
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
+
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None  # built on first use, then shared by all test methods
+
+    def test_build_dict(self):
+        if TestIMDB.word_idx is None:
+            TestIMDB.word_idx = paddle.v2.dataset.imdb.build_dict(
+                TRAIN_PATTERN, 150)
+
+        self.assertEqual(len(self.word_idx), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        if TestIMDB.word_idx is None:
+            TestIMDB.word_idx = paddle.v2.dataset.imdb.build_dict(
+                TRAIN_PATTERN, 150)
+
+        sum = 0
+        for l in dataset(self.word_idx):
+            self.assertEqual(l[1], sum % 2)  # labels must alternate 0, 1, ...
+            sum += 1
+        self.assertEqual(sum, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..009e55243a594e5e235c36fb0223ec70754d17f3
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@@ -0,0 +1,26 @@
+import paddle.v2.dataset.imikolov
+import unittest
+
+WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+
+    def test_total(self):
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mnist_test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        self.assertEqual(word_dict, test_word_list)
+        # sentiment registers common.DATA_HOME with nltk; don't hardcode /root.
+        self.assertIn(st.common.DATA_HOME, nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..27f454b137e3a40febd19cf085e2f4034cc16b24
--- /dev/null
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UCI Housing dataset.
+
+TODO(yuyang18): Complete comments.
+"""
+
+import numpy as np
+import os
+from common import download
+
+__all__ = ['train', 'test']
+
+URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT'
+]
+
+UCI_TRAIN_DATA = None
+UCI_TEST_DATA = None
+
+
+def feature_range(maximums, minimums):
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    feature_num = len(maximums)
+    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.set_title('feature scale')
+    plt.xticks(range(feature_num), feature_names)
+    plt.xlim([-1, feature_num])
+    fig.set_figheight(6)
+    fig.set_figwidth(10)
+    if not os.path.exists('./image'):
+        os.makedirs('./image')
+    fig.savefig('image/ranges.png', dpi=48)
+    plt.close(fig)
+
+
+def load_data(filename, feature_num=14, ratio=0.8):
+    global UCI_TRAIN_DATA, UCI_TEST_DATA
+    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
+        return
+
+    data = np.fromfile(filename, sep=' ')
+    data = data.reshape(data.shape[0] / feature_num, feature_num)
+    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+        axis=0) / data.shape[0]
+    feature_range(maximums[:-1], minimums[:-1])
+    for i in xrange(feature_num - 1):
+        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
+    offset = int(data.shape[0] * ratio)
+    UCI_TRAIN_DATA = data[:offset]
+    UCI_TEST_DATA = data[offset:]
+
+
+def train():
+    global UCI_TRAIN_DATA
+    load_data(download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TRAIN_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
+
+
+def test():
+    global UCI_TEST_DATA
+    load_data(download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TEST_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5a16d51477f9cfbf0cd32af54098406fbbd2b41
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+wmt14 dataset
+"""
+import tarfile
+
+import paddle.v2.dataset.common
+
+__all__ = ['train', 'test']  # only names this module actually defines
+
+URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# This is a small subset of the data for testing. The original data is too large and will be added later.
+URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+def __read_to_dict__(tar_file, dict_size):
+    def __to_dict__(fd, size):
+        out_dict = dict()
+        for line_count, line in enumerate(fd):
+            if line_count < size:
+                out_dict[line.strip()] = line_count
+            else:
+                break
+        return out_dict
+
+    with tarfile.open(tar_file, mode='r') as f:
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("src.dict")
+        ]
+        assert len(names) == 1
+        src_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("trg.dict")
+        ]
+        assert len(names) == 1
+        trg_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        return src_dict, trg_dict
+
+
+def reader_creator(tar_file, file_name, dict_size):
+    def reader():  # yields (src_ids, trg_ids, trg_ids_next) lists of word ids
+        src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+        with tarfile.open(tar_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue  # not exactly one tab: no src/trg pair
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # drop the pair when either side exceeds 80 ids (any file)
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [trg_dict[END]]  # target then END
+                    trg_ids = [trg_dict[START]] + trg_ids  # START then target
+
+                    yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
+
+
+def test(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad52b8baa411269d29732685871a875df5185cc
--- /dev/null
+++ b/python/paddle/v2/event.py
@@ -0,0 +1,84 @@
+"""
+All training events.
+
+There are:
+
+* BeginIteration
+* EndIteration
+* BeginPass
+* EndPass
+
+TODO(yuyang18): Complete it!
+"""
+import py_paddle.swig_paddle as api
+
+__all__ = [
+    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
+]
+
+
+class WithMetric(object):  # base for events that expose evaluator metrics
+    def __init__(self, evaluator):
+        if not isinstance(evaluator, api.Evaluator):
+            raise TypeError("Evaluator should be api.Evaluator type")
+        self.__evaluator__ = evaluator
+
+    @property
+    def metrics(self):  # dict of metric name -> value, rebuilt on each access
+        names = self.__evaluator__.getNames()
+        retv = dict()
+        for each_name in names:
+            val = self.__evaluator__.getValue(each_name)
+            retv[each_name] = val
+        return retv
+
+
+class TestResult(WithMetric):
+    """
+    Result returned by trainer.test: evaluator metrics plus the cost.
+    """
+
+    def __init__(self, evaluator, cost):
+        super(TestResult, self).__init__(evaluator)
+        self.cost = cost
+
+
+class BeginPass(object):
+    """
+    Event for the start of one training pass; carries only the pass id.
+    """
+
+    def __init__(self, pass_id):
+        self.pass_id = pass_id
+
+
+class EndPass(WithMetric):
+    """
+    Event for the completion of one training pass; exposes evaluator metrics.
+    """
+
+    def __init__(self, pass_id, evaluator):
+        self.pass_id = pass_id
+        WithMetric.__init__(self, evaluator)  # type-checks and stores evaluator
+
+
+class BeginIteration(object):
+    """
+    Event for the start of one training batch; carries pass id and batch id.
+    """
+
+    def __init__(self, pass_id, batch_id):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+
+
+class EndIteration(WithMetric):
+    """
+    Event for the completion of one training batch; carries cost and metrics.
+    """
+
+    def __init__(self, pass_id, batch_id, cost, evaluator):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+        self.cost = cost
+        WithMetric.__init__(self, evaluator)  # type-checks and stores evaluator
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d889bce7fe5ded22755a527575595f375691df4
--- /dev/null
+++ b/python/paddle/v2/inference.py
@@ -0,0 +1,51 @@
+import py_paddle.swig_paddle as api
+
+import topology
+from data_feeder import DataFeeder
+import itertools
+import numpy
+
+__all__ = ['infer']
+
+
+class Inference(object):  # testing-mode gradient machine loaded from parameters
+    def __init__(self, output, parameters):
+        topo = topology.Topology(output)
+        gm = api.GradientMachine.createFromConfigProto(
+            topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
+        for param in gm.getParameters():  # copy each value out of `parameters`
+            val = param.getBuf(api.PARAMETER_VALUE)
+            name = param.getName()
+            assert isinstance(val, api.Vector)
+            val.copyFromNumpyArray(parameters.get(name).flatten())
+        self.__gradient_machine__ = gm
+        self.__data_types__ = topo.data_type()
+
+    def iter_infer(self, reader, feeding=None):  # yields forwardTest per batch
+        feeder = DataFeeder(self.__data_types__, feeding)
+        self.__gradient_machine__.start()
+        for data_batch in reader():
+            yield self.__gradient_machine__.forwardTest(feeder(data_batch))
+        self.__gradient_machine__.finish()
+
+    def iter_infer_field(self, field, **kwargs):  # per batch: `field` of each output
+        for result in self.iter_infer(**kwargs):
+            yield [each_result[field] for each_result in result]
+
+    def infer(self, field='value', **kwargs):
+        """Concatenate `field` over all batches; one array per output layer."""
+        retv = None
+        for result in self.iter_infer_field(field=field, **kwargs):
+            if retv is None:
+                # One independent list per output: `[[]] * n` would repeat a
+                # single list object and mix the batches of all outputs.
+                retv = [[] for _ in result]
+            for i, item in enumerate(result):
+                retv[i].append(item)
+        retv = [numpy.concatenate(out) for out in retv]
+        return retv[0] if len(retv) == 1 else retv  # unwrap a single output
+
+
+def infer(output, parameters, reader, feeding=None, field='value'):  # one-shot
+    inferer = Inference(output=output, parameters=parameters)
+    return inferer.infer(field=field, reader=reader, feeding=feeding)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4efedde363f20fde168941adcb6e8a594b533a
--- /dev/null
+++ b/python/paddle/v2/layer.py
@@ -0,0 +1,498 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+`paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
+we want to make Paddle a plain Python package. The model config package defines
+how to configure a neural network topology in Paddle Python code.
+
+The primary usage is shown below.
+
+..  code-block:: python
+
+    import paddle.v2 as paddle
+
+    img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
+    hidden = paddle.layer.fc(input=img, size=200)
+    prediction = paddle.layer.fc(input=hidden, size=10,
+                                 act=paddle.activation.Softmax())
+
+    # use prediction instance where needed.
+    parameters = paddle.parameters.create(prediction)
+"""
+
+import collections
+import inspect
+from config_base import Layer, __convert_to_v2__
+import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as __parse__
+from paddle.trainer_config_helpers.default_decorators import wrap_act_default
+from paddle.trainer_config_helpers.default_decorators import \
+    wrap_bias_attr_default
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import layer_support
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
+
+import activation
+import re
+import data_type
+
+__all__ = ['parse_network', 'data']
+
+
+def parse_network(*outputs):
+    """
+    Parse all output layers and then generate a ModelConfig object.
+
+    ..  note::
+
+        This function is used internally in paddle.v2 module. User should never
+        invoke this method.
+
+    :param outputs: Output layers.
+    :type outputs: Layer
+    :return: A ModelConfig object instance.
+    :rtype: ModelConfig
+    """
+
+    def __real_func__():
+        """
+        __real_func__ is the function that config_parser.parse invoked. It is
+        the plain old paddle configuration function.
+        """
+        context = dict()  # one context shared by the conversion of all outputs
+        real_output = [each.to_proto(context=context) for each in outputs]
+        conf_helps.outputs(real_output)
+
+    return __parse__(__real_func__)
+
+
+"""
+Some layers need special configuration and cannot be converted by __convert_to_v2__,
+so they are implemented by hand as dedicated V2 layer classes.
+"""
+
+
+class DataLayerV2(Layer):  # hand-written v2 wrapper for the v1 data_layer
+    METHOD_NAME = 'data_layer'
+
+    def __init__(self, name, type, **kwargs):
+        assert isinstance(type, data_type.InputType)
+
+        self.type = type
+        self.__method_name__ = 'data_layer'
+        self.__kwargs__ = kwargs
+
+        super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
+
+    def to_proto_impl(self, **kwargs):
+        args = dict()
+        args['size'] = self.type.dim  # the v1 size argument comes from type.dim
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__kwargs__:  # constructor kwargs override call kwargs
+            args[each] = self.__kwargs__[each]
+        return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
+
+    def __map_docstr__(doc):  # takes only the doc text (no self); returns new text
+        doc = re.sub(r'(data = [^\)]+)\).*',
+                     "data = paddle.layer.data(name=\"input\", "
+                     "type=paddle.data_type.dense_vector(1000))", doc)
+
+        doc = re.sub(r':param size:.*',
+                     ':param type: Data type of this data layer', doc)
+        doc = re.sub(r':type size:.*',  # the documented v2 argument is `type`
+                     ":type type: paddle.v2.data_type.InputType", doc)
+        return doc
+
+
+class WithExtraParent(Layer):  # Layer with extra parents converted before itself
+    def extra_parent(self):
+        return self.__extra_parent__
+
+    def __init__(self, name=None, parent_layers=None):
+        self.__extra_parent__ = []
+        super(WithExtraParent, self).__init__(
+            name=name, parent_layers=parent_layers)
+
+    def append_extra_parent(self, parent):
+        self.__extra_parent__.append(parent)
+
+    def to_proto(self, context):
+        """
+        Convert extra parents, then parent layers, then this layer (via context).
+        """
+        kwargs = dict()
+        for p in self.__extra_parent__:  # converted first; return value unused
+            p.to_proto(context=context)
+
+        for layer_name in self.__parent_layers__:
+            if not isinstance(self.__parent_layers__[layer_name],
+                              collections.Sequence):
+                v1_layer = self.__parent_layers__[layer_name].to_proto(
+                    context=context)
+            else:  # a sequence of parents is converted element by element
+                v1_layer = map(lambda x: x.to_proto(context=context),
+                               self.__parent_layers__[layer_name])
+            kwargs[layer_name] = v1_layer
+
+        if self.context_name() is None:  # no context key: never cached
+            return self.to_proto_impl(context=context, **kwargs)
+        elif self.context_name() not in context:  # built once per context
+            context[self.context_name()] = self.to_proto_impl(
+                context=context, **kwargs)
+
+        if self.use_context_name():
+            return context[self.context_name()]
+        else:
+            return context[self.name]
+
+
+class MemoryV2(WithExtraParent):  # v2 wrapper around conf_helps.memory
+    def __init__(self, name, **kwargs):
+        self.name = name
+        super(MemoryV2, self).__init__(name=name, parent_layers=dict())
+        self.__kwargs__ = kwargs
+        self.__boot_layer_name__ = None
+        if 'boot_layer' in kwargs:
+            begin_of_current_rnn = []
+            # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
+            # function inside step.
+            st = inspect.stack()  # captured once; no re-capture per frame
+            for frame_record in st:  # innermost frame first
+                locs = frame_record[0].f_locals
+                keys = locs.keys()
+                for key in keys:
+                    val = locs[key]
+                    if isinstance(val, RecurrentLayerInput):
+                        begin_of_current_rnn.append(val)
+                    elif isinstance(val, collections.Sequence):
+                        for v in val:
+                            if isinstance(v, RecurrentLayerInput):
+                                begin_of_current_rnn.append(v)
+
+                if begin_of_current_rnn:  # stop at the nearest frame that has any
+                    break
+            assert begin_of_current_rnn is not None  # NOTE(review): always true
+            for extra in begin_of_current_rnn:
+                self.append_extra_parent(extra)
+                assert isinstance(extra, WithExtraParent)
+                extra.append_extra_parent(kwargs['boot_layer'])
+                self.__boot_layer_name__ = kwargs['boot_layer'].name
+
+    def to_proto_impl(self, context, **kwargs):
+        args = dict()
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__kwargs__:  # constructor kwargs override call kwargs
+            args[each] = self.__kwargs__[each]
+
+        if self.__boot_layer_name__ is not None:  # use the converted boot layer
+            args['boot_layer'] = context[self.__boot_layer_name__]
+
+        size = args.get('size', None)
+        if size is not None:
+            if callable(size):  # size may be a callable resolved at this point
+                real_size = size()
+            else:
+                real_size = size
+            args['size'] = real_size
+        return conf_helps.memory(name=self.name, **args)
+
+    def context_name(self):
+        return self.name + "#memory"
+
+    def use_context_name(self):
+        """
+        memory layer will have the same name with some layer
+        :return:
+        """
+        return True
+
+
+class LayerOutputV2(Layer):
+    """
+    LayerOutputV2 is used to store the result of LayerOutput in v1 api.
+    It will not store its parents because layer_output has been parsed already.
+    """
+
+    def __init__(self, layer_output):
+        assert isinstance(layer_output, conf_helps.LayerOutput)
+        self.layer_output = layer_output
+        super(LayerOutputV2, self).__init__(
+            name=layer_output.name, parent_layers=dict())
+
+    def to_proto_impl(self):  # nothing to build: return the stored v1 output
+        return self.layer_output
+
+
+class StaticInputV2(object):  # tags a layer as a static input of recurrent_group
+    def __init__(self, input, is_seq=False, size=None):
+        assert isinstance(input, LayerV2)
+        self.name = input.name  # mirrors the wrapped layer's name
+        self.input = input
+        self.is_seq = is_seq
+        self.size = size
+        # TODO(add size check)
+        # assert input.size is not None or size is not None
+
+
+class MixedLayerV2(Layer):
+    """
+    This class is used to support `with` grammar. If not, the following code
+    could convert mixed_layer simply.
+
+        mixed = __convert_to_v2__(
+            'mixed_layer', name_prefix='mixed', parent_names=['input'])
+    """
+
+    class AddToSealedMixedLayerExceptionV2(Exception):
+        pass
+
+    def __init__(self,
+                 size=0,
+                 input=None,
+                 name=None,
+                 act=None,
+                 bias_attr=None,
+                 layer_attr=None):
+        self.__method_name__ = 'mixed_layer'
+        self.finalized = False
+        self.__inputs__ = []
+        if input is not None:
+            self.__inputs__ = input  # stored as given, not copied
+
+        other_kwargs = dict()
+        other_kwargs['name'] = name
+        other_kwargs['size'] = size
+        other_kwargs['act'] = act
+        other_kwargs['bias_attr'] = bias_attr
+        other_kwargs['layer_attr'] = layer_attr
+        parent_layers = {"input": self.__inputs__}  # same object `+=` appends to
+        super(MixedLayerV2, self).__init__(name, parent_layers)
+        self.__other_kwargs__ = other_kwargs
+
+    def __iadd__(self, other):  # `mix += projection` while not yet finalized
+        if not self.finalized:
+            self.__inputs__.append(other)
+            return self
+        else:
+            raise MixedLayerV2.AddToSealedMixedLayerExceptionV2()
+
+    def __enter__(self):
+        assert len(self.__inputs__) == 0  # `with` requires starting with no inputs
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.finalized = True  # leaving the `with` block seals the layer
+
+    def to_proto_impl(self, **kwargs):
+        args = dict()
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__other_kwargs__:  # constructor values override kwargs
+            args[each] = self.__other_kwargs__[each]
+        size = args.get('size', None)
+        if size is not None:
+            if callable(size):  # size may be a callable resolved at this point
+                real_size = size()
+            else:
+                real_size = size
+            args['size'] = real_size
+        return getattr(conf_helps, self.__method_name__)(**args)
+
+
+@wrap_name_default("mixed")
+@wrap_act_default(act=activation.Linear())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support(conf_helps.layers.ERROR_CLIPPING, conf_helps.layers.DROPOUT)
+def mixed(size=0,
+          name=None,
+          input=None,
+          act=None,
+          bias_attr=False,
+          layer_attr=None):  # factory; note MixedLayerV2 takes (size, input, name)
+    return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
+
+
+class RecurrentLayerInput(WithExtraParent):  # opens the v1 recurrent layer group
+    def __init__(self, recurrent_name, index, parent_layers):
+        assert len(parent_layers) == 1
+        self.__parents__ = parent_layers.values()[0]  # the single parent entry
+        super(RecurrentLayerInput, self).__init__(
+            name=self.__parents__[index].name, parent_layers=parent_layers)
+        self.__recurrent_name__ = recurrent_name
+
+    def context_name(self):  # same key for every input of one recurrent group
+        return self.__recurrent_name__ + ".begin"
+
+    def to_proto_impl(self, context, **kwargs):
+        model_type('recurrent_nn')
+        RecurrentLayerGroupWithoutOutLinksBegin(
+            name=self.__recurrent_name__,
+            in_links=map(lambda x: x.name, self.__parents__))
+        return self
+
+
+class RecurrentLayerOutput(Layer):  # closes the v1 recurrent layer group
+    def __init__(self, recurrent_name, index, parent_layers):
+        assert len(parent_layers) == 1
+        self.__parents__ = parent_layers.values()[0]  # the single parent entry
+        super(RecurrentLayerOutput, self).__init__(
+            name=self.__parents__[index].name, parent_layers=parent_layers)
+        self.__recurrent_name__ = recurrent_name
+
+    def context_name(self):  # same key for every output of one recurrent group
+        return self.__recurrent_name__ + ".end"
+
+    def to_proto_impl(self, **kwargs):  # returns None; only emits v1 config calls
+        for l in self.__parents__:
+            RecurrentLayerGroupSetOutLink(l.name)
+        RecurrentLayerGroupEnd(name=self.__recurrent_name__)
+
+
+LayerV2 = Layer  # alias; StaticInputV2 checks isinstance(input, LayerV2)
+data = DataLayerV2
+data.__name__ = 'data'  # the class reports the public name `data`
+AggregateLevel = conf_helps.layers.AggregateLevel
+ExpandLevel = conf_helps.layers.ExpandLevel
+memory = MemoryV2
+
+
+def __layer_name_mapping__(inname):  # v1 helper name -> v2 name, or None
+    if inname in ('data_layer', 'memory', 'mixed_layer', 'recurrent_group'):
+        # Do Not handle these layers
+        return None
+    if inname == 'maxid_layer':
+        return 'max_id'
+    if inname.endswith(('memory', '_seq', '_sim')) or inname == 'hsigmoid':
+        return inname  # these names are kept verbatim
+    if inname in ('cross_entropy', 'multi_binary_label_cross_entropy',
+                  'cross_entropy_with_selfnorm'):
+        return inname + "_cost"
+    if inname.endswith('_cost'):
+        return inname
+    if inname.endswith("_layer"):
+        return inname[:-len("_layer")]
+    # Anything else is not recognised as a layer helper; the caller treats
+    # None as "do not generate a v2 wrapper".
+    return None
+
+
+def __layer_name_mapping_parent_names__(inname):  # which args are parent layers
+    all_args = getattr(conf_helps, inname).argspec.args
+    return filter(  # keep, in signature order, the args named like layer inputs
+        lambda x: x in ['input1', 'input2', 'label', 'input', 'a', 'b',
+                        'expand_as',
+                        'weights', 'vectors', 'weight', 'score', 'left',
+                        'right', 'output_mem'],
+        all_args)
+
+
+def __convert_layer__(_new_name_, _old_name_, _parent_names_):  # export a v2 layer
+    global __all__  # the generated layer is published under its v2 name
+    __all__.append(_new_name_)
+    globals()[_new_name_] = __convert_to_v2__(_old_name_, _parent_names_)
+    globals()[_new_name_].__name__ = _new_name_  # own argument, not a module global
+
+
+for each_layer_name in dir(conf_helps):  # generate v2 wrappers for v1 helpers
+    new_name = __layer_name_mapping__(each_layer_name)
+    if new_name is not None:  # None: name is not mapped to a v2 layer
+        parent_names = __layer_name_mapping_parent_names__(each_layer_name)
+        assert len(parent_names) != 0, each_layer_name
+        __convert_layer__(new_name, each_layer_name, parent_names)
+
+del parent_names  # remove the loop temporaries from the module namespace
+del new_name
+del each_layer_name
+
+
+@wrap_name_default()
+def recurrent_group(step, input, name=None):  # v2 counterpart of recurrent_group
+    if not isinstance(input, collections.Sequence):
+        input = [input]
+
+    non_static_inputs = filter(lambda x: not isinstance(x, StaticInputV2),
+                               input)
+    actual_input = [  # one RecurrentLayerInput per non-static input
+        RecurrentLayerInput(
+            recurrent_name=name,
+            index=i,
+            parent_layers={'recurrent_inputs': non_static_inputs})
+        for i in xrange(len(non_static_inputs))
+    ]
+
+    def __real_step__(*args):  # re-inserts static inputs, then calls user step
+        rnn_input = list(args)
+        static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
+        for static_input in static_inputs:
+            mem_name = "__%s_memory__" % static_input.input.name
+            mem = memory(  # memory booted from the static input's layer
+                name=mem_name,
+                is_seq=static_input.is_seq,
+                size=static_input.input.calculate_size,
+                boot_layer=static_input.input)
+            with mixed(  # same name as the memory; forwards it unchanged
+                    name=mem_name,
+                    size=static_input.input.calculate_size,
+                    act=activation.Identity()) as mix:
+                mix += identity_projection(input=mem)
+            rnn_input.insert(input.index(static_input), mix)  # original position
+        return step(*rnn_input)
+
+    actual_output = __real_step__(*actual_input)
+
+    if not isinstance(actual_output, collections.Sequence):
+        actual_output = [actual_output]
+
+    retv = [  # one RecurrentLayerOutput per step output
+        RecurrentLayerOutput(
+            recurrent_name=name,
+            index=i,
+            parent_layers={'recurrent_outputs': actual_output})
+        for i in xrange(len(actual_output))
+    ]
+    if len(retv) == 1:  # a single output is returned bare, not in a list
+        return retv[0]
+    else:
+        return retv
+
+
+__projection_names__ = filter(lambda x: x.endswith('_projection'),
+                              dir(conf_helps))
+
+__all__ += __projection_names__  # every *_projection helper is exported
+
+__operator_names__ = filter(lambda x: x.endswith('_operator'), dir(conf_helps))
+__all__ += __operator_names__  # NOTE(review): only operator_list gets converted
+
+# convert projection
+for prj in __projection_names__:
+    globals()[prj] = __convert_to_v2__(
+        prj, parent_names=['input'], is_default_name=False)
+    globals()[prj].__name__ = prj
+
+# convert operator
+operator_list = [
+    # [V1_method_name, parent_names],
+    ['dotmul_operator', ['a', 'b']],
+    ['conv_operator', ['img', 'filter']]
+]
+for op in operator_list:  # op[0] is the v1 name, op[1] its parent argument names
+    globals()[op[0]] = __convert_to_v2__(
+        op[0], parent_names=op[1], is_default_name=False)
+    globals()[op[0]].__name__ = op[0]
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb
--- /dev/null
+++ b/python/paddle/v2/minibatch.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size):
+    """
+    Wrap a reader so that it yields lists of instances (mini-batches).
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: number of instances in each mini-batch
+    :type batch_size: int
+    :return: a reader yielding lists; the last list may be shorter.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        pending = []
+        for sample in reader():
+            pending.append(sample)
+            if len(pending) != batch_size:
+                continue
+            yield pending
+            pending = []
+        if pending:  # flush the trailing partial batch
+            yield pending
+
+    return batch_reader
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e6644196c8242cc3fed7a4fb1503697e5b59ffb
--- /dev/null
+++ b/python/paddle/v2/networks.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.networks as conf_nw
+import inspect
+from config_base import __convert_to_v2__
+
+__all__ = []
+
+
+def __initialize__():  # export a v2 wrapper for every v1 sub-network helper
+    for each_subnetwork in conf_nw.__all__:
+        if each_subnetwork in ['inputs', 'outputs']:  # not wrapped
+            continue
+        func = getattr(conf_nw, each_subnetwork)
+        if hasattr(func, 'argspec'):  # prefer an argspec attached to the function
+            argspec = func.argspec
+        else:
+            argspec = inspect.getargspec(func)
+        if each_subnetwork == 'simple_attention':  # parents are not input* args
+            parents = ['encoded_sequence', 'encoded_proj', 'decoder_state']
+        else:
+            parents = filter(lambda x: x.startswith('input'), argspec.args)
+        assert len(parents) != 0, each_subnetwork
+        v2_subnet = __convert_to_v2__(
+            each_subnetwork,
+            parent_names=parents,
+            is_default_name='name' in argspec.args)
+        globals()[each_subnetwork] = v2_subnet
+        globals()[each_subnetwork].__name__ = each_subnetwork
+        global __all__
+        __all__.append(each_subnetwork)
+
+
+__initialize__()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a01d95c205c0626374e1814a170ce2d58f23a60
--- /dev/null
+++ b/python/paddle/v2/optimizer.py
@@ -0,0 +1,112 @@
+import py_paddle.swig_paddle as swig_api
+
+import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
+import paddle.trainer_config_helpers.optimizers as v1_optimizers
+"""
+Optimizers (update equations) for the SGD method.
+
+TODO(yuyang18): Complete comments.
+"""
+
+__all__ = [
+    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
+    'RMSProp', 'ModelAverage', 'L2Regularization'
+]
+
+
+class Optimizer(object):  # base: builds an OptimizationConfig from v1 settings
+    def __init__(self, **kwargs):
+        if 'batch_size' in kwargs:
+            del kwargs['batch_size']  # not important for python library.
+
+        def __impl__():  # v1-style config function; batch_size is pinned to 1
+            v1_optimizers.settings(batch_size=1, **kwargs)
+
+        self.__opt_conf_proto__ = config_parser_utils.parse_optimizer_config(
+            __impl__)
+        self.__opt_conf__ = swig_api.OptimizationConfig.createFromProto(
+            self.__opt_conf_proto__)
+
+    def enable_types(self):
+        """
+        get enable_types for each optimizer.
+        enable_types = [value, gradient, momentum, etc]
+        For each optimizer(SGD, Adam), GradientMachine should enable different
+        buffers.
+        """
+        tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
+        assert isinstance(tmp, swig_api.ParameterOptimizer)
+        return tmp.getParameterTypes()
+
+    def create_local_updater(self):  # updater built from this optimizer's config
+        return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
+
+    def create_remote_updater(self, pass_num):  # same config, plus pass_num
+        return swig_api.ParameterUpdater.createRemoteUpdater(self.__opt_conf__,
+                                                             pass_num)
+
+
+class Momentum(Optimizer):  # wraps v1 MomentumOptimizer as learning_method
+    def __init__(self, momentum=None, sparse=False, **kwargs):
+        learning_method = v1_optimizers.MomentumOptimizer(
+            momentum=momentum, sparse=sparse)
+        super(Momentum, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class Adam(Optimizer):  # wraps v1 AdamOptimizer as learning_method
+    def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
+        learning_method = v1_optimizers.AdamOptimizer(
+            beta1=beta1, beta2=beta2, epsilon=epsilon)
+        super(Adam, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class Adamax(Optimizer):  # wraps v1 AdamaxOptimizer as learning_method
+    def __init__(self, beta1=0.9, beta2=0.999, **kwargs):
+        learning_method = v1_optimizers.AdamaxOptimizer(
+            beta1=beta1, beta2=beta2)
+        super(Adamax, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class AdaGrad(Optimizer):  # wraps v1 AdaGradOptimizer; it takes no arguments
+    def __init__(self, **kwargs):
+        learning_method = v1_optimizers.AdaGradOptimizer()
+        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class DecayedAdaGrad(Optimizer):  # wraps v1 DecayedAdaGradOptimizer
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(DecayedAdaGrad, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class AdaDelta(Optimizer):  # wraps v1 AdaDeltaOptimizer as learning_method
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.AdaDeltaOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(AdaDelta, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class RMSProp(Optimizer):  # wraps v1 RMSPropOptimizer as learning_method
+    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
+        learning_method = v1_optimizers.RMSPropOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(RMSProp, self).__init__(learning_method=learning_method, **kwargs)
+
+
+ModelAverage = v1_optimizers.ModelAverage
+L2Regularization = v1_optimizers.L2Regularization
+
+if __name__ == '__main__':  # smoke run: build each optimizer, print its types
+    swig_api.initPaddle('--use_gpu=false')
+    for opt in [
+            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
+            AdaDelta(), RMSProp(), Adam(
+                model_average=ModelAverage(average_window=0.5),
+                regularization=L2Regularization(rate=0.5),
+                gradient_clipping_threshold=25)
+    ]:
+        print opt, opt.enable_types()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..05dc5c68dd97b00fb15b74564a32313430c45345
--- /dev/null
+++ b/python/paddle/v2/parameters.py
@@ -0,0 +1,336 @@
+import numpy as np
+import py_paddle.swig_paddle as api
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import struct
+import tarfile
+import cStringIO
+from topology import Topology
+
+__all__ = ['Parameters', 'create']
+
+
+def create(layers):
+    """
+    Build a parameter pool holding the config of every topology parameter.
+
+    :param layers: layer(s) describing the network; passed on to Topology.
+    :return: a Parameters object with one registered config per parameter.
+    """
+    network_proto = Topology(layers).proto()
+    pool = Parameters()
+    for param_conf in network_proto.parameters:
+        pool.__append_config__(param_conf)
+    return pool
+
+
+class Parameters(object):
+    """
+    Parameters is a dictionary-like container of Paddle's parameters. The key
+    of Parameters is the name of a parameter. The value of Parameters is a
+    plain :code:`numpy.ndarray` .
+
+    Basic usage is
+
+    ..  code-block:: python
+
+        data = paddle.layers.data(...)
+        ...
+        out = paddle.layers.fc(...)
+
+        parameters = paddle.parameters.create(out)
+
+        parameter_names = parameters.names()
+        fc_mat = parameters.get('fc')
+        print fc_mat
+    """
+
+    def __init__(self):
+        self.__param_conf__ = dict()  # parameter name -> ParameterConfig
+        self.__gradient_machines__ = []  # attached GradientMachines
+        self.__tmp_params__ = []  # (name, ndarray) set before any machine
+
+    def __append_config__(self, param_conf):
+        """
+        Append a parameter configuration. It is used to initialize Parameters
+        and should be invoked only in paddle.parameters.create
+
+        :param param_conf: The parameter configuration in protobuf
+        :type param_conf: ParameterConfig
+        :raises ValueError: on a wrong type or an already registered name
+        """
+
+        if not isinstance(param_conf, ParameterConfig):
+            raise ValueError("param_conf must be paddle.proto.ParameterConfig")
+
+        if param_conf.name in self.__param_conf__:
+            raise ValueError("duplicated parameter %s" % param_conf.name)
+
+        self.__param_conf__[param_conf.name] = param_conf
+
+    def keys(self):
+        """
+        keys are the names of each parameter.
+
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.__param_conf__.keys()
+
+    def names(self):
+        """
+        names of each parameter.
+
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.keys()
+
+    def has_key(self, key):
+        """
+        Tell whether a parameter named key has been registered.
+
+        :param key: Parameter name
+        :type key: basestring
+        :return: True if a parameter configuration exists for key
+        """
+        return key in self.__param_conf__
+
+    def __iter__(self):
+        """
+        Return an iterator of parameter name. It is used by `for loop`
+        or `in` operator.
+
+        ..  code-block:: python
+
+            parameters = paddle.parameters.create(...)
+            if "fc_param" in parameters:
+                print 'OK'
+        :return: an iterator of parameter name
+        :rtype: iterator
+        """
+        return iter(self.__param_conf__)
+
+    def __getitem__(self, key):
+        """
+        Get parameter by parameter name. It uses Python dict syntax.
+
+        :note: With a gradient machine attached, it copies from C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        shape = self.get_shape(key)
+
+        if len(self.__gradient_machines__) == 0:
+            # No gradient machine yet: serve values staged by __setitem__.
+            staged = [mat for name, mat in self.__tmp_params__ if name == key]
+            if staged:
+                # A key may be staged several times; the latest value wins,
+                # matching the order append_gradient_machine copies them in.
+                return staged[-1]
+            # Nothing staged: a new, uninitialized array of the right shape.
+            return np.ndarray(shape=shape, dtype=np.float32)
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                param = __get_parameter_in_gradient_machine__(
+                    each_gradient_machine, key)
+                # for simplify implementation now, we always copy from C++
+                assert isinstance(param, api.Parameter)
+                val = param.getBuf(api.PARAMETER_VALUE)
+                assert isinstance(val, api.Vector)
+                val = val.copyToNumpyArray()
+                return val
+                # Only the first gradient machine is ever consulted.
+
+            raise RuntimeError("Unexpected branch")
+
+    def get_shape(self, key):
+        """
+        get shape of the parameter.
+
+        :param key: parameter name
+        :type key: basestring
+        :return: parameter's shape
+        :rtype: tuple
+        """
+        if not isinstance(key, basestring):
+            raise ValueError("parameter name should be string")
+        if not self.has_key(key):
+            raise ValueError("No such parameter %s" % key)
+        conf = self.__param_conf__[key]
+        return tuple(map(int, conf.dims))
+
+    def __setitem__(self, key, value):
+        """
+        Set parameter by parameter name & value. It uses Python dict syntax.
+
+        :note: With gradient machines attached, it copies to the C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :param value: Parameter matrix.
+        :type value: np.ndarray
+        :return: Nothing
+        """
+
+        if not isinstance(value, np.ndarray):
+            raise ValueError("value must be a numpy.ndarray")
+        value = value.astype(dtype=np.float32)
+        shape = self.get_shape(key)
+        if value.shape != shape:
+            raise ValueError("Value shape mismatch, expect %s, got %s" %
+                             (shape, value.shape))
+
+        if len(self.__gradient_machines__) == 0:
+            self.__tmp_params__.append((key, value))
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                __copy_parameter_to_gradient_machine__(each_gradient_machine,
+                                                       key, value)
+
+    def get(self, parameter_name):
+        """
+        Get parameter by parameter name.
+
+        :note: It will always copy the parameter from C++ side.
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :return: The parameter matrix.
+        :rtype: np.ndarray
+        """
+        return self.__getitem__(key=parameter_name)
+
+    def set(self, parameter_name, value):
+        """
+        Set parameter by parameter name & matrix.
+
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :param value: parameter matrix
+        :type value: np.ndarray
+        :return: Nothing.
+        """
+        self.__setitem__(key=parameter_name, value=value)
+
+    def append_gradient_machine(self, gradient_machine):
+        """
+        append gradient machine to parameters. This method is used internally in
+        Trainer.train.
+
+        :param gradient_machine: Paddle C++ GradientMachine object.
+        :type gradient_machine: api.GradientMachine
+        :return:
+        """
+
+        if not isinstance(gradient_machine, api.GradientMachine):
+            raise ValueError("gradient_machine should be api.GradientMachine")
+
+        if len(self.__tmp_params__) != 0:
+            for name, val in self.__tmp_params__:
+                try:
+                    __copy_parameter_to_gradient_machine__(gradient_machine,
+                                                           name, val)
+                except ValueError:
+                    # If no such parameter in gradient machine, then don't copy
+                    pass
+
+        self.__gradient_machines__.append(gradient_machine)
+
+    def serialize(self, name, f):
+        """
+        Write one parameter to f: a 16-byte "IIQ" header, then float32 data.
+
+        :param name: name of the parameter to dump.
+        :param f: writable binary stream.
+        :type f: file
+        """
+        param = self.get(name).astype(np.float32)
+        # header: 0, 4 (presumably version / bytes per value), element count
+        f.write(struct.pack("IIQ", 0, 4, param.size))
+        f.write(param.tobytes())
+
+    def deserialize(self, name, f):
+        """
+        Load one parameter from f, the inverse of serialize.
+
+        :param name: name of the parameter to fill; it must be registered.
+        :param f: readable binary stream positioned at the header.
+        :type f: file
+        """
+        f.read(struct.calcsize("IIQ"))  # skip the header; it is not checked
+        flat = np.frombuffer(f.read(), dtype=np.float32)
+        self.set(name, flat.reshape(self.get_shape(name)))
+
+    def to_tar(self, f):
+        """Write every parameter and its protobuf config into a tar on f."""
+        tar = tarfile.TarFile(fileobj=f, mode='w')
+        for nm in self.names():
+            buf = cStringIO.StringIO()
+            self.serialize(nm, buf)
+            tarinfo = tarfile.TarInfo(name=nm)
+            buf.seek(0)
+            tarinfo.size = len(buf.getvalue())
+            tar.addfile(tarinfo, buf)
+
+            confStr = self.__param_conf__[nm].SerializeToString()
+            tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
+            tarinfo.size = len(confStr)
+            tar.addfile(tarinfo, fileobj=cStringIO.StringIO(confStr))
+        # close() writes the end-of-archive blocks; it does not close f.
+        tar.close()
+
+    @staticmethod
+    def from_tar(f):
+        """Rebuild a Parameters pool from a tar stream written by to_tar."""
+        params = Parameters()
+        tar = tarfile.TarFile(fileobj=f, mode='r')
+        for member in tar:
+            assert isinstance(member, tarfile.TarInfo)
+            if not member.name.endswith('.protobuf'):
+                continue
+            conf = ParameterConfig()
+            conf.ParseFromString(tar.extractfile(member).read())
+            params.__append_config__(conf)
+        # Second pass: load the raw values of each registered parameter.
+        for param_name in params.names():
+            params.deserialize(param_name, tar.extractfile(param_name))
+        return params
+
+
+def __get_parameter_in_gradient_machine__(gradient_machine, name):
+    """
+    Look up the single parameter called name inside a gradient machine.
+
+    :param gradient_machine: machine whose getParameters() is searched.
+    :type gradient_machine: api.GradientMachine
+    :param name: parameter name compared against each getName().
+    :return: the only matching parameter; ValueError if none or several.
+    :rtype: api.Parameter
+    """
+    matches = [
+        p for p in gradient_machine.getParameters() if p.getName() == name
+    ]
+    if len(matches) > 1:
+        raise ValueError("Unexpected branch")
+    if not matches:
+        raise ValueError("No such parameter")
+    return matches[0]
+
+
+def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
+    """
+    Overwrite the value buffer of one parameter with a numpy array.
+
+    :param gradient_machine: machine that owns the parameter.
+    :type gradient_machine: api.GradientMachine
+    :param name: name of the parameter to overwrite.
+    :param arr: new value; it is flattened before being copied.
+    :type arr: np.ndarray
+    :return: Nothing; ValueError propagates if the lookup by name fails.
+    """
+    target = __get_parameter_in_gradient_machine__(gradient_machine, name)
+    value_buf = target.getBuf(api.PARAMETER_VALUE)
+    assert isinstance(value_buf, api.Vector)
+    flat = arr.flatten()
+    value_buf.copyFromNumpyArray(flat)
diff --git a/python/paddle/v2/pooling.py b/python/paddle/v2/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..4881c27d1d6d3d926f12aab096f377164debf1ef
--- /dev/null
+++ b/python/paddle/v2/pooling.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.poolings
+import copy
+
+__all__ = []
+suffix = 'Pooling'
+
+for name in paddle.trainer_config_helpers.poolings.__all__:
+    new_name = name[:-len(suffix)]
+    globals()[new_name] = copy.copy(
+        getattr(paddle.trainer_config_helpers.poolings, name))
+    globals()[new_name].__name__ = new_name
+    __all__.append(new_name)
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6
--- /dev/null
+++ b/python/paddle/v2/reader/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+At training and testing time, PaddlePaddle programs need to read data. To ease
+the users' work to write data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number
+  generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and
+  returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network,
+  random number generator, etc) and yields a batch of data items.
+
+#####################
+Data Reader Interface
+#####################
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data
+items. It can be any function with no parameters that creates an iterable
+(anything can be used in :code:`for x in iterable`)\:
+
+..  code-block:: python
+
+    iterable = data_reader()
+
+Element produced from the iterable should be a **single** entry of data,
+**not** a mini batch. That entry of data could be a single item, or a tuple of
+items.
+Item should be of `supported type <http://www.paddlepaddle.org/doc/ui/data_provider
+/pydataprovider2.html?highlight=dense_vector#input-types>`_ (e.g., numpy 1d
+array of float32, int, list of int)
+
+An example implementation for single item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image(width, height):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height)
+        return reader
+
+An example implementation for multiple item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image_and_label(width, height, label):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height), label
+        return reader
+
+
+TODO(yuyang18): Should we add whole design doc here?
+"""
+
+import decorator
+from decorator import *
+
+import creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..07142056f872db5113acdd296b17c52b343c1be6
--- /dev/null
+++ b/python/paddle/v2/reader/creator.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Creator package contains some simple reader creator, which could be used in user
+program.
+"""
+
+__all__ = ['np_array', 'text_file']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields elements of x, if it is a
+    numpy vector. Or rows of x, if it is a numpy matrix.
+    Or any sub-hyperplane indexed by the highest dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            yield x
+            return  # a 0-d array cannot be iterated any further
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from given text file.
+    Trailing new line ('\\\\n') of each line will be removed.
+
+    :param path: path of the text file.
+    :returns: data reader of text file
+    """
+
+    def reader():
+        # "with" closes the file even when the consumer stops reading early.
+        with open(path, "r") as f:
+            for l in f:
+                yield l.rstrip('\n')
+
+    return reader
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..104ce9a0411413bb8fc65eedf5821f98d6acdba3
--- /dev/null
+++ b/python/paddle/v2/reader/decorator.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn'
+]
+
+import itertools
+import random
+from Queue import Queue
+from threading import Thread
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that yields func applied to the entries produced
+    in lock-step by the given readers.
+
+    :param func: called with one entry from each reader per output sample.
+    :type func: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        # Instantiate every reader first, then let imap pull one entry from
+        # each of them for every call of func.
+        iterables = [make_iter() for make_iter in readers]
+        for mapped in itertools.imap(func, *iterables):
+            yield mapped
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is shuffled.
+
+    Entries of the original reader are collected into a buffer of buf_size
+    entries; each full buffer (and the final rest) is shuffled and emitted.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        pending = []
+        for entry in reader():
+            pending.append(entry)
+            if len(pending) < buf_size:
+                continue
+            # The buffer is full: emit it shuffled and start a fresh one.
+            random.shuffle(pending)
+            for shuffled in pending:
+                yield shuffled
+            pending = []
+        # Flush whatever is left in the last, partially filled buffer.
+        if pending:
+            random.shuffle(pending)
+            for shuffled in pending:
+                yield shuffled
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of input data
+    readers chained together.
+
+    If input readers output following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    The chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        # All input readers are instantiated before the first entry is
+        # pulled; they are then drained one after another.
+        iterables = [make_iter() for make_iter in readers]
+
+        for entry in itertools.chain(*iterables):
+            yield entry
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    """Raised by compose when the input readers yield unequal entry counts."""
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of input readers.
+
+    If input readers output following data entries:
+    (1, 2)    3    (4, 5)
+    The composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = [r() for r in readers]
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            # Pad exhausted readers with a private sentinel rather than None,
+            # so that a reader which legitimately yields None is not
+            # mistaken for a misaligned one.
+            missing = object()
+            for outputs in itertools.izip_longest(*rs, fillvalue=missing):
+                if any(o is missing for o in outputs):
+                    raise ComposeNotAligned(
+                        "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader will read and save data entries into a
+    buffer. Reading from the buffered data reader will proceed as long
+    as the buffer is not empty.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(target=read_worker, args=(r, q))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        # Compare by identity: "!=" would call the __ne__ of the data entry,
+        # which is wrong or even raises for entries such as numpy arrays
+        # that overload comparison.
+        while e is not end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Limit the max number of samples that the reader can return.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples to return.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
+    """
+
+    # TODO(yuyang18): Check if just drop the reader, could clean the opened
+    # resource or not?
+
+    def firstn_reader():
+        numbered = enumerate(reader())
+        # Stop at index n; like before, that entry is read but not emitted.
+        for _, item in itertools.takewhile(lambda p: p[0] != n, numbered):
+            yield item
+
+    return firstn_reader
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a51f700406b48f8186e45f1ced94765e343a8b5e
--- /dev/null
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_test(NAME reader_tests
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/reader/tests/run_tests.sh
+  ${PYTHON_EXECUTABLE})
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f8d7133b8694aae5541eff9576eaba8a31e77dc
--- /dev/null
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -0,0 +1,40 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+
+import numpy as np
+
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        # Compare everything at once so order and row count are checked too.
+        self.assertEqual([e.tolist() for e in reader()], l)
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        # Check the whole file so an empty or truncated read fails the test.
+        self.assertEqual(list(reader()), ["0 1", "2 3", "4 5"])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..734154b9790a4dc118d11992343648364c907305
--- /dev/null
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -0,0 +1,125 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this invocation helps testing paddle.reader.buffer
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.v2.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # read have 30ms delay.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # read time should be short, meaning already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compse(self):
+        reader = paddle.v2.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.v2.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/run_tests.sh b/python/paddle/v2/reader/tests/run_tests.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a544a5636021bcf8bd9a35966c91ae343c149d14
--- /dev/null
+++ b/python/paddle/v2/reader/tests/run_tests.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd "$(dirname "$0")" > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd "$SCRIPTPATH"
+"$1" -m pip install ../../../../../paddle/dist/*.whl
+
+test_list="creator_test.py decorator_test.py"
+
+export PYTHONPATH="$PWD/../../../../../python/"
+
+for fn in $test_list
+do
+  echo "test $fn"
+  # $1 is the Python interpreter; quoted so a path with spaces still works.
+  if ! "$1" "$fn"; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31
--- /dev/null
+++ b/python/paddle/v2/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..572deaff356712cac23cd7911cdf289db100564c
--- /dev/null
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_test(NAME test_v2_api  # run_tests.sh pip-installs the wheel, then runs its test_list
+        COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE})
+
+add_test(NAME test_v2_layer  # NOTE(review): `-d <dir>` presumably adds <dir> to PYTHONPATH - confirm in .set_python_path.sh
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_layer.py
+        WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+
+add_test(NAME test_v2_rnn_layer  # NOTE(review): no WORKING_DIRECTORY, unlike the sibling tests - confirm intentional
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_rnn_layer.py)
+
+add_test(NAME test_topology  # same launcher pattern as test_v2_layer
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_topology.py
+        WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dda1b1bd222a9f226db1a4bd730e9637ab882196
--- /dev/null
+++ b/python/paddle/v2/tests/run_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Resolve this script's own directory so the relative paths below hold from any cwd.
+SCRIPTPATH="$(cd "$(dirname "$0")" && pwd)" || exit 1
+cd "$SCRIPTPATH" || exit 1
+
+# $1 is the Python interpreter to test with; install the built wheel into it.
+"$1" -m pip install ../../../../paddle/dist/*.whl
+
+test_list="test_data_feeder.py test_parameters.py"
+
+# Point PYTHONPATH at the repository's python/ directory for the test runs.
+export PYTHONPATH="$PWD/../../../../python/"
+
+# Run each test file; stop at the first failure so the caller sees exit status 1.
+for fn in $test_list
+do
+  echo "test $fn"
+  if ! "$1" "$fn"; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..71eb3bf31425c22b47accc11c9550042e077ef12
--- /dev/null
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+
+from paddle.v2 import data_type
+from paddle.v2.data_feeder import DataFeeder
+
+
+class DataFeederTest(unittest.TestCase):  # DataFeeder round-trip checks
+    def dense_reader(self, size):  # `size` uniform random floats in [0, 1)
+        data = np.random.random(size)
+        return data
+
+    def sparse_binary_reader(self, high, size_limit, non_empty=False):
+        num = np.random.randint(size_limit)  # num could be 0
+        while non_empty and num == 0:  # redraw until at least one index
+            num = np.random.randint(size_limit)
+        return np.random.randint(high, size=num).tolist()
+
+    def test_dense(self):
+        def compare(input):
+            feeder = DataFeeder([('image', data_type.dense_vector(784))],
+                                {'image': 0})
+            arg = feeder(input)
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            input = np.array(input, dtype='float32')  # (batch, 1, dim)
+            self.assertTrue(np.allclose(input.reshape(output.shape), output))
+
+        # test numpy array
+        batch_size = 32
+        dim = 784
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim))
+            data.append(each_sample)
+        compare(data)
+
+        # each feature is a list
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim).tolist())
+            data.append(each_sample)
+        compare(data)
+
+        # test tuple
+        data = []
+        for i in xrange(batch_size):
+            each_sample = (self.dense_reader(dim).tolist(), )
+            data.append(each_sample)
+        compare(data)
+
+    def test_sparse_binary(self):
+        dim = 10000
+        batch_size = 32
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.sparse_binary_reader(dim, 50))
+            data.append(each_sample)
+        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        self.assertIsInstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), data[i][0])
+
+    def test_sparse(self):
+        dim = 10000
+        batch_size = 32
+        v = []  # expected column indices per row
+        w = []  # expected float32 values per row
+        data = []
+        for dat in xrange(batch_size):
+            each_sample = []
+            a = self.sparse_binary_reader(dim, 40, non_empty=True)
+            b = self.dense_reader(len(a)).tolist()
+            v.append(a)
+            w.append(np.array(b, dtype="float32"))
+            each_sample.append(zip(a, b))
+            data.append(each_sample)
+
+        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        self.assertIsInstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), v[i])
+            cols_value = output.getSparseRowColsVal(i)
+            value = [val[1] for val in cols_value]
+            value = np.array(value, dtype="float32")
+            self.assertTrue(np.allclose(value, w[i]))  # element-wise check
+
+    def test_integer(self):
+        value_range = 100
+        batch_size = 32
+        index = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(value_range))
+            index.append(each_sample)
+        feeder = DataFeeder([('input', data_type.integer_value(value_range))],
+                            {'input': 0})
+        arg = feeder(index)
+        output = arg.getSlotIds(0).copyToNumpyArray()
+        index = np.array(index, dtype='int')
+        self.assertTrue(np.array_equal(output, index.flatten()))
+
+    def test_integer_sequence(self):
+        value_range = 10000
+        batch_size = 32
+        start = [0]  # running sequence offsets; ends with the total length
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(
+                self.sparse_binary_reader(
+                    value_range, 30, non_empty=True))
+            data.append(each_sample)
+            start.append(len(each_sample[0]) + start[-1])
+        feeder = DataFeeder(
+            [('input', data_type.integer_value_sequence(value_range))],
+            {'input': 0})
+        arg = feeder(data)
+        output_data = arg.getSlotIds(0).copyToNumpyArray()
+        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
+
+        index = []
+        for dat in data:
+            index.extend(x for x in dat[0])  # only one feature, so dat[0]
+        index = np.array(index, dtype='int')
+        start = np.array(start, dtype='int')
+        self.assertTrue(np.array_equal(output_data, index))
+        self.assertTrue(np.array_equal(output_start, start))
+
+    def test_multiple_features(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(10))
+            each_sample.append(
+                self.sparse_binary_reader(
+                    20000, 40, non_empty=True))
+            each_sample.append(self.dense_reader(100))
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_sparse = arg.getSlotValue(1)
+        output_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(output_dense[i], data[i][2]))
+            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, but only use 2 features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_index = arg.getSlotIds(1).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(output_dense[i], data[i][2]))
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, one is duplicate data
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10)),
+                      ('fea3', data_type.dense_vector(100))]
+        feeder = DataFeeder(data_types,
+                            {'fea0': 2,
+                             'fea1': 1,
+                             'fea2': 0,
+                             'fea3': 2})
+        arg = feeder(data)
+        fea0 = arg.getSlotValue(0).copyToNumpyMat()
+        fea1 = arg.getSlotValue(1)
+        fea2 = arg.getSlotIds(2).copyToNumpyArray()
+        fea3 = arg.getSlotValue(3).copyToNumpyMat()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(fea0[i], data[i][2]))
+            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
+            self.assertEqual(fea2[i], data[i][0])
+            self.assertTrue(np.allclose(fea3[i], data[i][2]))
+
+    def test_multiple_features_tuple(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            a = np.random.randint(10)
+            b = self.sparse_binary_reader(20000, 40, non_empty=True)
+            c = self.dense_reader(100)
+            each_sample = (a, b, c)
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        out_dense = arg.getSlotValue(0).copyToNumpyMat()
+        out_sparse = arg.getSlotValue(1)
+        out_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertTrue(np.allclose(out_dense[i], data[i][2]))
+            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(out_index[i], data[i][0])
+
+
+if __name__ == '__main__':
+    api.initPaddle("--use_gpu=0")
+    all_ok = unittest.main(exit=False).result.wasSuccessful()  # CPU pass
+    if api.isGpuVersion():
+        api.setUseGpu(True)  # second pass over the same tests on GPU
+        all_ok = unittest.main(exit=False).result.wasSuccessful() and all_ok
+    raise SystemExit(0 if all_ok else 1)  # run_tests.sh checks this status
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..0055679a91801a2f9b6432797665ec17caf3beb1
--- /dev/null
+++ b/python/paddle/v2/tests/test_layer.py
@@ -0,0 +1,264 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import paddle.v2.activation as activation
+import paddle.v2.attr as attr
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+import paddle.v2.pooling as pooling
+import paddle.v2.networks as networks
+
+pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
+label = layer.data(name='label', type=data_type.integer_value(10))
+weight = layer.data(name='weight', type=data_type.dense_vector(10))
+score = layer.data(name='score', type=data_type.dense_vector(1))
+
+hidden = layer.fc(input=pixel,  # shared by `inference` and MathLayerTest
+                  size=100,
+                  act=activation.Sigmoid(),
+                  param_attr=attr.Param(name='hidden'))
+inference = layer.fc(input=hidden, size=10, act=activation.Softmax())
+conv = layer.img_conv(  # module-level conv reused by several test classes
+    input=pixel,
+    filter_size=1,
+    filter_size_y=1,
+    num_channels=8,
+    num_filters=16,
+    act=activation.Linear())
+
+
+class ImageLayerTest(unittest.TestCase):  # smoke tests: nothing is asserted
+    def test_conv_layer(self):
+        conv_shift = layer.conv_shift(a=pixel, b=score)
+        print layer.parse_network(conv, conv_shift)  # `conv` is module-level
+
+    def test_pooling_layer(self):
+        maxpool = layer.img_pool(
+            input=conv,
+            pool_size=2,
+            num_channels=16,
+            padding=1,
+            pool_type=pooling.Max())
+        spp = layer.spp(input=conv,
+                        pyramid_height=2,
+                        num_channels=16,
+                        pool_type=pooling.Max())
+        maxout = layer.maxout(input=conv, num_channels=16, groups=4)
+        print layer.parse_network(maxpool, spp, maxout)
+
+    def test_norm_layer(self):
+        norm1 = layer.img_cmrnorm(input=conv, size=5)
+        norm2 = layer.batch_norm(input=conv)
+        norm3 = layer.sum_to_one_norm(input=conv)
+        print layer.parse_network(norm1, norm2, norm3)
+
+
+class AggregateLayerTest(unittest.TestCase):  # smoke test: no assertions
+    def test_aggregate_layer(self):
+        pool = layer.pooling(
+            input=pixel,
+            pooling_type=pooling.Avg(),
+            agg_level=layer.AggregateLevel.EACH_SEQUENCE)
+        last_seq = layer.last_seq(input=pixel)
+        first_seq = layer.first_seq(input=pixel)
+        concat = layer.concat(input=[last_seq, first_seq])
+        seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
+        print layer.parse_network(pool, last_seq, first_seq, concat, seq_concat)
+
+
+class MathLayerTest(unittest.TestCase):  # smoke test: no assertions
+    def test_math_layer(self):  # NOTE(review): `bilinear` is never parsed
+        addto = layer.addto(input=[pixel, pixel])
+        linear_comb = layer.linear_comb(weights=weight, vectors=hidden, size=10)
+        interpolation = layer.interpolation(
+            input=[hidden, hidden], weight=score)
+        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)
+        power = layer.power(input=pixel, weight=score)
+        scaling = layer.scaling(input=pixel, weight=score)
+        slope = layer.slope_intercept(input=pixel)
+        tensor = layer.tensor(a=pixel, b=pixel, size=1000)
+        cos_sim = layer.cos_sim(a=pixel, b=pixel)
+        trans = layer.trans(input=tensor)
+        print layer.parse_network(addto, linear_comb, interpolation, power,
+                                  scaling, slope, tensor, cos_sim, trans)
+
+
+class ReshapeLayerTest(unittest.TestCase):  # smoke test: no assertions
+    def test_reshape_layer(self):
+        block_expand = layer.block_expand(
+            input=conv, num_channels=4, stride_x=1, block_x=1)
+        expand = layer.expand(
+            input=weight,
+            expand_as=pixel,
+            expand_level=layer.ExpandLevel.FROM_TIMESTEP)
+        repeat = layer.repeat(input=pixel, num_repeats=4)
+        reshape = layer.seq_reshape(input=pixel, reshape_size=4)
+        rotate = layer.rotate(input=pixel, height=16, width=49)
+        print layer.parse_network(block_expand, expand, repeat, reshape, rotate)
+
+
+class RecurrentLayerTest(unittest.TestCase):  # smoke test: no assertions
+    def test_recurrent_layer(self):
+        word = layer.data(name='word', type=data_type.integer_value(12))
+        recurrent = layer.recurrent(input=word)
+        lstm = layer.lstmemory(input=word)
+        gru = layer.grumemory(input=word)
+        print layer.parse_network(recurrent, lstm, gru)
+
+
+class CostLayerTest(unittest.TestCase):  # smoke test: no assertions
+    def test_cost_layer(self):
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.classification_cost(
+            input=inference, label=label, weight=weight)
+        cost3 = layer.cross_entropy_cost(input=inference, label=label)
+        cost4 = layer.cross_entropy_with_selfnorm_cost(
+            input=inference, label=label)
+        cost5 = layer.regression_cost(input=inference, label=label)
+        cost6 = layer.regression_cost(
+            input=inference, label=label, weight=weight)
+        cost7 = layer.multi_binary_label_cross_entropy_cost(
+            input=inference, label=label)
+        cost8 = layer.rank_cost(left=score, right=score, label=score)
+        cost9 = layer.lambda_cost(input=inference, score=score)
+        cost10 = layer.sum_cost(input=inference)
+        cost11 = layer.huber_cost(input=score, label=label)
+
+        print layer.parse_network(cost1, cost2)  # costs are parsed in groups
+        print layer.parse_network(cost3, cost4)
+        print layer.parse_network(cost5, cost6)
+        print layer.parse_network(cost7, cost8, cost9, cost10, cost11)
+
+        crf = layer.crf(input=inference, label=label)
+        crf_decoding = layer.crf_decoding(input=inference, size=3)
+        ctc = layer.ctc(input=inference, label=label)
+        warp_ctc = layer.warp_ctc(input=pixel, label=label)
+        nce = layer.nce(input=inference, label=label, num_classes=3)
+        hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
+
+        print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce,
+                                  hsigmoid)
+
+
+class OtherLayerTest(unittest.TestCase):  # smoke tests: no assertions
+    def test_sampling_layer(self):
+        maxid = layer.max_id(input=inference)
+        sampling_id = layer.sampling_id(input=inference)
+        eos = layer.eos(input=maxid, eos_id=5)
+        print layer.parse_network(maxid, sampling_id, eos)
+
+    def test_slicing_joining_layer(self):
+        pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+        print layer.parse_network(pad)
+
+
+class ProjOpTest(unittest.TestCase):  # projection/operator smoke tests
+    def test_projection(self):
+        input = layer.data(name='data', type=data_type.dense_vector(784))
+        word = layer.data(
+            name='word', type=data_type.integer_value_sequence(10000))
+        fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
+        mixed0 = layer.mixed(  # projections passed up front via input=
+            size=256,
+            input=[
+                layer.full_matrix_projection(input=fc0),
+                layer.full_matrix_projection(input=fc1)
+            ])
+        with layer.mixed(size=200) as mixed1:  # projections added with +=
+            mixed1 += layer.full_matrix_projection(input=fc0)
+            mixed1 += layer.identity_projection(input=fc1)
+
+        table = layer.table_projection(input=word)
+        emb0 = layer.mixed(size=512, input=table)
+        with layer.mixed(size=512) as emb1:
+            emb1 += table
+
+        scale = layer.scaling_projection(input=fc0)
+        scale0 = layer.mixed(size=100, input=scale)
+        with layer.mixed(size=100) as scale1:
+            scale1 += scale
+
+        dotmul = layer.dotmul_projection(input=fc0)
+        dotmul0 = layer.mixed(size=100, input=dotmul)
+        with layer.mixed(size=100) as dotmul1:
+            dotmul1 += dotmul
+
+        context = layer.context_projection(input=fc0, context_len=5)
+        context0 = layer.mixed(size=100, input=context)
+        with layer.mixed(size=100) as context1:
+            context1 += context  # NOTE(review): context0/1 never parsed
+
+        conv = layer.conv_projection(  # local; shadows module-level conv
+            input=input,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv, bias_attr=True)
+        with layer.mixed(bias_attr=True) as conv1:
+            conv1 += conv
+
+        print layer.parse_network(mixed0)
+        print layer.parse_network(mixed1)
+        print layer.parse_network(emb0)
+        print layer.parse_network(emb1)
+        print layer.parse_network(scale0)
+        print layer.parse_network(scale1)
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+    def test_operator(self):
+        ipt0 = layer.data(name='data', type=data_type.dense_vector(784))
+        ipt1 = layer.data(name='word', type=data_type.dense_vector(128))
+        fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+
+        dotmul_op = layer.dotmul_operator(a=fc0, b=fc1)
+        dotmul0 = layer.mixed(input=dotmul_op)
+        with layer.mixed() as dotmul1:
+            dotmul1 += dotmul_op
+
+        conv = layer.conv_operator(  # local; shadows module-level conv
+            img=ipt0,
+            filter=ipt1,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv)
+        with layer.mixed() as conv1:
+            conv1 += conv
+
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+
+class NetworkTests(unittest.TestCase):  # smoke test: no assertions
+    def test_vgg(self):
+        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        vgg_out = networks.small_vgg(
+            input_image=img, num_channels=1, num_classes=2)
+        print layer.parse_network(vgg_out)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb182caab6430862a8e4da2ae4ea6b1e72f726c
--- /dev/null
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -0,0 +1,60 @@
+import unittest
+import sys
+
+try:
+    import py_paddle
+
+    del py_paddle
+except ImportError:
+    print >> sys.stderr, "It seems swig of Paddle is not installed, this " \
+                         "unittest will not be run."
+    sys.exit(0)
+
+import paddle.v2.parameters as parameters
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import random
+import cStringIO
+import numpy
+
+
+def __rand_param_config__(name):
+    # Build a ParameterConfig called `name` with two dims drawn from [1, 1000].
+    param_conf = ParameterConfig()
+    param_conf.name = name
+    rows = random.randint(1, 1000)
+    cols = random.randint(1, 1000)
+    param_conf.dims.extend([rows, cols])
+    param_conf.size = rows * cols  # size is the product of the two dims
+    # Sanity-check the message before handing it back to the caller.
+    assert param_conf.IsInitialized()
+    return param_conf
+
+
+class TestParameters(unittest.TestCase):
+    def test_serialization(self):
+        # Fill two randomly shaped parameters, then round-trip them via tar.
+        original = parameters.Parameters()
+        for key in ("param_0", "param_1"):
+            original.__append_config__(__rand_param_config__(key))
+        for key in original.names():
+            values = original.get(key)
+            shape = original.get_shape(key)
+            values[:] = numpy.random.uniform(-1.0, 1.0, size=shape)
+            original.set(key, values)
+
+        archive = cStringIO.StringIO()
+        original.to_tar(archive)
+        archive.seek(0)  # rewind so from_tar reads from the beginning
+        restored = parameters.Parameters.from_tar(archive)
+
+        self.assertEqual(restored.names(), original.names())
+
+        for key in original.names():
+            self.assertEqual(original.get_shape(key), restored.get_shape(key))
+            expected = original.get(key)
+            actual = restored.get(key)
+            self.assertTrue(numpy.isclose(expected, actual).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbbd20eb76bb9daab2bcf98c4adad989106a377
--- /dev/null
+++ b/python/paddle/v2/tests/test_rnn_layer.py
@@ -0,0 +1,155 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import difflib
+import unittest
+
+import paddle.trainer_config_helpers as conf_helps
+import paddle.v2.activation as activation
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as parse_network
+
+
+class RNNTest(unittest.TestCase):  # old conf_helps vs new v2.layer configs
+    def test_simple_rnn(self):
+        dict_dim = 10
+        word_dim = 8
+        hidden_dim = 8
+
+        def parse_old_rnn():  # config built through trainer_config_helpers
+            def step(y):
+                mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
+                out = conf_helps.fc_layer(
+                    input=[y, mem],
+                    size=hidden_dim,
+                    act=activation.Tanh(),
+                    bias_attr=True,
+                    name="rnn_state")
+                return out
+
+            def test():
+                data = conf_helps.data_layer(name="word", size=dict_dim)
+                embd = conf_helps.embedding_layer(input=data, size=word_dim)
+                conf_helps.recurrent_group(name="rnn", step=step, input=embd)
+
+            return str(parse_network(test))
+
+        def parse_new_rnn():  # same network built through paddle.v2.layer
+            def new_step(y):
+                mem = layer.memory(name="rnn_state", size=hidden_dim)
+                out = layer.fc(input=[y, mem],
+                               size=hidden_dim,
+                               act=activation.Tanh(),
+                               bias_attr=True,
+                               name="rnn_state")
+                return out
+
+            data = layer.data(
+                name="word", type=data_type.integer_value(dict_dim))
+            embd = layer.embedding(input=data, size=word_dim)
+            rnn_layer = layer.recurrent_group(
+                name="rnn", step=new_step, input=embd)
+            return str(layer.parse_network(rnn_layer))
+
+        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
+                                    parse_new_rnn().splitlines(1))
+        print ''.join(diff)  # NOTE(review): diff is printed, never asserted
+
+    def test_sequence_rnn_multi_input(self):
+        dict_dim = 10
+        word_dim = 8
+        hidden_dim = 8
+        label_dim = 3
+
+        def parse_old_rnn():  # config built through trainer_config_helpers
+            def test():
+                data = conf_helps.data_layer(name="word", size=dict_dim)
+                label = conf_helps.data_layer(name="label", size=label_dim)
+                emb = conf_helps.embedding_layer(input=data, size=word_dim)
+                boot_layer = conf_helps.data_layer(name="boot", size=10)
+                boot_layer = conf_helps.fc_layer(
+                    name='boot_fc', input=boot_layer, size=10)
+
+                def step(y, wid):
+                    z = conf_helps.embedding_layer(input=wid, size=word_dim)
+                    mem = conf_helps.memory(
+                        name="rnn_state",
+                        size=hidden_dim,
+                        boot_layer=boot_layer)
+                    out = conf_helps.fc_layer(
+                        input=[y, z, mem],
+                        size=hidden_dim,
+                        act=conf_helps.TanhActivation(),
+                        bias_attr=True,
+                        name="rnn_state")
+                    return out
+
+                out = conf_helps.recurrent_group(
+                    name="rnn", step=step, input=[emb, data])
+
+                rep = conf_helps.last_seq(input=out)
+                prob = conf_helps.fc_layer(
+                    size=label_dim,
+                    input=rep,
+                    act=conf_helps.SoftmaxActivation(),
+                    bias_attr=True)
+
+                conf_helps.outputs(
+                    conf_helps.classification_cost(
+                        input=prob, label=label))
+
+            return str(parse_network(test))
+
+        def parse_new_rnn():  # same network built through paddle.v2.layer
+            data = layer.data(
+                name="word", type=data_type.dense_vector(dict_dim))
+            label = layer.data(
+                name="label", type=data_type.dense_vector(label_dim))
+            emb = layer.embedding(input=data, size=word_dim)
+            boot_layer = layer.data(
+                name="boot", type=data_type.dense_vector(10))
+            boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10)
+
+            def step(y, wid):
+                z = layer.embedding(input=wid, size=word_dim)
+                mem = layer.memory(
+                    name="rnn_state", size=hidden_dim, boot_layer=boot_layer)
+                out = layer.fc(input=[y, z, mem],
+                               size=hidden_dim,
+                               act=activation.Tanh(),
+                               bias_attr=True,
+                               name="rnn_state")
+                return out
+
+            out = layer.recurrent_group(
+                name="rnn", step=step, input=[emb, data])
+
+            rep = layer.last_seq(input=out)
+            prob = layer.fc(size=label_dim,
+                            input=rep,
+                            act=activation.Softmax(),
+                            bias_attr=True)
+
+            cost = layer.classification_cost(input=prob, label=label)
+
+            return str(layer.parse_network(cost))
+
+        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
+                                    parse_new_rnn().splitlines(1))
+        print ''.join(diff)  # NOTE(review): diff is printed, never asserted
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c6dbcdb4f49b960fb8b71aecbad4f013d2cd283
--- /dev/null
+++ b/python/paddle/v2/tests/test_topology.py
@@ -0,0 +1,84 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle.v2.layer as layer
+import paddle.v2.topology as topology
+import paddle.v2.data_type as data_type
+import paddle.trainer_config_helpers as conf_helps
+import paddle.trainer.PyDataProvider2 as pydp2
+
+
+class TestTopology(unittest.TestCase):
+    def test_data_type(self):
+        """data_type() reports the dense pixel input and the index label."""
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        sigmoid = conf_helps.SigmoidActivation()
+        hidden = layer.fc(input=pixel, size=100, act=sigmoid)
+        softmax = conf_helps.SoftmaxActivation()
+        inference = layer.fc(input=hidden, size=10, act=softmax)
+        cost = layer.classification_cost(input=inference, label=label)
+        data_types = topology.Topology(cost).data_type()
+        self.assertEqual(len(data_types), 2)
+
+        # data_types holds (name, type) pairs; select each input by its name.
+        pixel_entries = [pair for pair in data_types if pair[0] == "pixel"]
+        self.assertEqual(len(pixel_entries), 1)
+        pixel_type = pixel_entries[0][1]
+        self.assertEqual(pixel_type.type, pydp2.DataType.Dense)
+        self.assertEqual(pixel_type.dim, 784)
+
+        label_entries = [pair for pair in data_types if pair[0] == "label"]
+        self.assertEqual(len(label_entries), 1)
+        label_type = label_entries[0][1]
+        self.assertEqual(label_type.type, pydp2.DataType.Index)
+        self.assertEqual(label_type.dim, 10)
+
+    def test_get_layer(self):
+        """get_layer() finds the data layers by the names they were given."""
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        sigmoid = conf_helps.SigmoidActivation()
+        hidden = layer.fc(input=pixel, size=100, act=sigmoid)
+        softmax = conf_helps.SoftmaxActivation()
+        inference = layer.fc(input=hidden, size=10, act=softmax)
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        # The lookups must compare equal to the objects the net was built from.
+        found_pixel = topo.get_layer("pixel")
+        found_label = topo.get_layer("label")
+        self.assertEqual(found_pixel, pixel)
+        self.assertEqual(found_label, label)
+
+    def test_parse(self):
+        """proto() must build from a single layer as well as from layer lists."""
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        sigmoid = conf_helps.SigmoidActivation()
+        hidden = layer.fc(input=pixel, size=100, act=sigmoid)
+        softmax = conf_helps.SoftmaxActivation()
+        inference = layer.fc(input=hidden, size=10, act=softmax)
+        maxid = layer.max_id(input=inference)
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.cross_entropy_cost(input=inference, label=label)
+
+        # Smoke test: there is nothing to assert, parsing just must not raise
+        # for any of these output combinations.
+        outputs_to_parse = (cost2, [cost1], [cost1, cost2], [inference, maxid])
+        for outputs in outputs_to_parse:
+            topology.Topology(outputs).proto()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0679c5675b0c0f24f28f3df22efd4eb51ccbb3a
--- /dev/null
+++ b/python/paddle/v2/topology.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+from paddle.proto.ModelConfig_pb2 import ModelConfig
+
+import layer as v2_layer
+from layer import WithExtraParent
+
+__all__ = ['Topology']
+
+
+def __flatten__(lis):
+    """Flatten an arbitrarily nested sequence into a new flat list.
+    Strings are kept as leaves: recursing into them would never terminate."""
+    new_lis = []
+    for item in lis:
+        nested = isinstance(item, collections.Sequence)
+        if nested and not isinstance(item, basestring):
+            new_lis.extend(__flatten__(item))
+        else:
+            new_lis.append(item)
+    return new_lis
+
+
+def __bfs_travel__(callback, *layers):
+    """Visit each layer, then its parents (depth-first); True if callback stopped it."""
+    for each_layer in __flatten__(layers):
+        if callback(each_layer):
+            return True  # unwind every recursion level, not just this one
+        __layers__ = each_layer.__parent_layers__.values()
+        if isinstance(each_layer, WithExtraParent):
+            __layers__ = __layers__ + each_layer.extra_parent()
+        if __bfs_travel__(callback, *__layers__):
+            return True
+
+
+class Topology(object):
+    """
+    Wraps one or more output layers together with the ModelConfig proto
+    that v2_layer.parse_network produces for them.
+    """
+
+    def __init__(self, layers):
+        if not isinstance(layers, collections.Sequence):
+            layers = [layers]  # a lone layer is treated as a 1-element list
+        for candidate in layers:
+            __check_layer_type__(candidate)
+        self.layers = layers
+        model_config = v2_layer.parse_network(*layers)
+        assert isinstance(model_config, ModelConfig)
+        self.__model_config__ = model_config
+
+    def proto(self):
+        return self.__model_config__
+
+    def get_layer(self, name):
+        """
+        Look up a v2 layer object reachable from self.layers by its name.
+        :param name: value compared against each visited layer's `name`.
+        :return: the matching layer; ValueError is raised if none matches.
+        """
+        hits = []
+
+        def __impl__(candidate):
+            matched = candidate.name == name
+            if matched:
+                hits[:] = [candidate]
+            return matched  # True asks the traversal to stop
+
+        __bfs_travel__(__impl__, *self.layers)
+        if not hits:
+            raise ValueError("No such layer %s" % name)
+        return hits[0]
+
+    def data_layers(self):
+        """
+        Collect every DataLayerV2 reachable from self.layers.
+        :return: dict mapping layer name to the data layer object.
+        """
+        found = {}
+
+        def __impl__(candidate):
+            if isinstance(candidate, v2_layer.DataLayerV2):
+                found[candidate.name] = candidate
+
+        __bfs_travel__(__impl__, *self.layers)
+        return found
+
+    def data_type(self):
+        """
+        Pair each input layer name listed in the proto with its data type, e.g.
+        [('image', dense_vector(768)), ('label', integer_value(10))]
+        """
+        by_name = self.data_layers()
+        input_names = self.proto().input_layer_names
+        return [(each, by_name[each].type) for each in input_names]
+
+
+def __check_layer_type__(layer):  # raises ValueError unless layer is a v2_layer.LayerV2
+    if not isinstance(layer, v2_layer.LayerV2):
+        raise ValueError('layer should have type paddle.layer.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd3e2c565ee00c91402e7dea36c7393fb1a9bdf
--- /dev/null
+++ b/python/paddle/v2/trainer.py
@@ -0,0 +1,152 @@
+import collections
+
+import py_paddle.swig_paddle as api
+
+from data_feeder import DataFeeder
+from topology import Topology
+from . import event as v2_event
+from . import optimizer as v2_optimizer
+from . import parameters as v2_parameters
+
+__all__ = ['SGD']
+"""
+Trainer package
+TODO(yuyang18): Complete comments.
+"""
+
+
+def default_event_handler(event):
+    """
+    No-op event handler, used by SGD.train when the caller passes none.
+
+    TODO(yuyang18): Complete it (print some log and save the model).
+    :param event: the event being reported; currently ignored.
+    :return: None
+    """
+    return None
+
+
+class SGD(object):
+    """
+    Simple SGD trainer backed by a GradientMachine built from `cost`.
+    TODO(yuyang18): Complete comments
+
+    :param update_equation: The optimizer object.
+    :type update_equation: paddle.v2.optimizer.Optimizer
+    :param cost: Target cost that neural network should be optimized.
+    :type cost: paddle.v2.config_base.Layer
+    :param parameters: The parameters dictionary.
+    :type parameters: paddle.v2.parameters.Parameters
+    """
+
+    def __init__(self, cost, parameters, update_equation):
+        if not isinstance(parameters, v2_parameters.Parameters):
+            raise TypeError('parameters should be parameters')
+
+        if not isinstance(update_equation, v2_optimizer.Optimizer):
+            raise TypeError("update equation parameter must be "
+                            "paddle.v2.optimizer.Optimizer")
+        topology = Topology(cost)
+        self.__optimizer__ = update_equation
+        self.__topology__ = topology
+        self.__parameters__ = parameters
+        self.__topology_in_proto__ = topology.proto()
+        self.__data_types__ = topology.data_type()
+        gm = api.GradientMachine.createFromConfigProto(
+            self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
+            self.__optimizer__.enable_types())
+        assert isinstance(gm, api.GradientMachine)
+        self.__gradient_machine__ = gm
+        self.__gradient_machine__.randParameters()
+        parameters.append_gradient_machine(gm)
+
+    def train(self, reader, num_passes=1, event_handler=None, feeding=None):
+        """
+        Train the network for num_passes passes over the batches from reader.
+
+        :param reader: Zero-argument callable returning an iterator of batches.
+        :param num_passes: The total train passes.
+        :param event_handler: Callable invoked with each event as it occurs;
+                              default_event_handler is used when None.
+        :type event_handler: (BaseEvent) => None
+        :param feeding: Feeding is a map of neural network input name and array
+                        index that reader returns.
+        :type feeding: dict
+        :return: None
+        """
+        if event_handler is None:
+            event_handler = default_event_handler
+        __check_train_args__(**locals())  # NOTE: this check calls reader() once
+
+        updater = self.__optimizer__.create_local_updater()
+        updater.init(self.__gradient_machine__)
+
+        self.__gradient_machine__.start()
+        batch_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(batch_evaluator, api.Evaluator)
+        pass_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(pass_evaluator, api.Evaluator)
+        out_args = api.Arguments.createArguments(0)
+        feeder = DataFeeder(self.__data_types__, feeding)
+        for pass_id in xrange(num_passes):
+            event_handler(v2_event.BeginPass(pass_id))
+            pass_evaluator.start()
+            updater.startPass()
+            for batch_id, data_batch in enumerate(reader()):
+                batch_evaluator.start()
+                event_handler(
+                    v2_event.BeginIteration(
+                        pass_id=pass_id, batch_id=batch_id))
+                pass_type = updater.startBatch(len(data_batch))
+                self.__gradient_machine__.forwardBackward(
+                    feeder(data_batch), out_args, pass_type)
+                self.__gradient_machine__.eval(pass_evaluator)
+                self.__gradient_machine__.eval(batch_evaluator)
+                for each_param in self.__gradient_machine__.getNonStaticParameters(
+                ):
+                    updater.update(each_param)
+                cost_sum = out_args.sum()
+                cost = cost_sum / len(data_batch)  # sum divided by batch size
+                updater.finishBatch(cost)
+                batch_evaluator.finish()
+                event_handler(
+                    v2_event.EndIteration(
+                        pass_id=pass_id,
+                        batch_id=batch_id,
+                        cost=cost,
+                        evaluator=batch_evaluator))
+
+            updater.finishPass()
+            pass_evaluator.finish()
+            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+        self.__gradient_machine__.finish()
+
+    def test(self, reader, feeding=None):
+        """Forward each batch from reader in PASS_TEST mode; return a TestResult."""
+        feeder = DataFeeder(self.__data_types__, feeding)
+        evaluator = self.__gradient_machine__.makeEvaluator()
+        out_args = api.Arguments.createArguments(0)
+        evaluator.start()
+        total_cost = 0
+        num_samples = 0.0  # stays 0.0 if reader is empty: the division below raises
+        for data_batch in reader():
+            num_samples += len(data_batch)
+            self.__gradient_machine__.forward(
+                feeder(data_batch), out_args, api.PASS_TEST)
+            total_cost += out_args.sum()
+            self.__gradient_machine__.eval(evaluator)
+
+        evaluator.finish()
+        return v2_event.TestResult(
+            evaluator=evaluator, cost=total_cost / num_samples)
+
+
+def __check_train_args__(reader, event_handler, **kwargs):
+    """Validate train() arguments: `reader` must be a callable producing an
+    iterator and `event_handler` must be callable; TypeError otherwise."""
+    reader_ok = callable(reader) and isinstance(reader(), collections.Iterator)
+    if not reader_ok:
+        raise TypeError('train_data_reader should be a function, '
+                        'which can return a iterator')
+    if not callable(event_handler):
+        raise TypeError('event handler should be a function')
diff --git a/python/setup.py.in b/python/setup.py.in
index d2fb95f27ff2f0673050e699316dde504dbf28f6..68ca35265cf13265ad0b171b0f70e20b83006ff9 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,15 +1,13 @@
 from setuptools import setup
 
-INTERNAL_PACKAGE='${PADDLE_INTERNAL_PACKAGE}'
-
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
-          'paddle.utils']
-
-if len(INTERNAL_PACKAGE) != 0:
-    packages.append(INTERNAL_PACKAGE)
+          'paddle.utils',
+          'paddle.v2',
+          'paddle.v2.dataset',
+          'paddle.v2.reader']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',
diff --git a/warp-ctc b/warp-ctc
deleted file mode 160000
index bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2..0000000000000000000000000000000000000000
--- a/warp-ctc
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2